diff --git a/.travis.yml b/.travis.yml index 817d1ace7f87..d6b38f4ac8de 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,12 @@ sudo: true language: cpp -cache: ccache +cache: + directories: + - $HOME/.ccache + - $HOME/.cache/pip + - $HOME/.mxnet + - $HOME/Library/Caches/Homebrew os: - osx @@ -17,7 +22,7 @@ before_install: - export PYTHONPATH=${PYTHONPATH}:${PWD}/python install: - - brew install ccache + - HOMEBREW_NO_AUTO_UPDATE=1 brew install ccache - export PATH="/usr/local/opt/ccache/libexec:$PATH" - source ci/travis/install.sh @@ -29,4 +34,7 @@ script: - export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - mv make/osx.mk config.mk - make -j 2 - - python -m nose --verbose tests/python/unittest/ + # We ignore several tests to avoid possible timeouts on large PRs. + # This lowers our test coverage, but is required for consistent Travis runs. + # These tests will be tested in a variety of environments in Jenkins based tests. + - python -m nose --with-timer --exclude-test=test_sparse_operator.test_elemwise_binary_ops --exclude-test=test_gluon_model_zoo.test_models --exclude-test=test_random.test_shuffle --exclude-test=test_operator.test_broadcast_binary_op --exclude-test=test_operator.test_pick --exclude-test=test_profiler.test_continuous_profile_and_instant_marker --exclude-test=test_metric_perf.test_metric_performance --exclude-test=test_operator.test_order --verbose tests/python/unittest/ diff --git a/src/operator/contrib/ctc_include/LICENSE b/3rdparty/ctc_include/LICENSE similarity index 100% rename from src/operator/contrib/ctc_include/LICENSE rename to 3rdparty/ctc_include/LICENSE diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE b/3rdparty/ctc_include/contrib/moderngpu/LICENSE similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE rename to 3rdparty/ctc_include/contrib/moderngpu/LICENSE diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctaloadbalance.cuh 
b/3rdparty/ctc_include/contrib/moderngpu/include/device/ctaloadbalance.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctaloadbalance.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/ctaloadbalance.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctamerge.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/ctamerge.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctamerge.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/ctamerge.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctascan.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/ctascan.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctascan.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/ctascan.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasearch.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/ctasearch.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasearch.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasearch.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasegreduce.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegreduce.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasegreduce.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegreduce.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasegscan.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegscan.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasegscan.cuh rename to 
3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegscan.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasegsort.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegsort.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasegsort.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegsort.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasortedsearch.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/ctasortedsearch.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/ctasortedsearch.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasortedsearch.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/devicetypes.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/devicetypes.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/devicetypes.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/devicetypes.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/intrinsics.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/intrinsics.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/intrinsics.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/intrinsics.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/loadstore.cuh 
b/3rdparty/ctc_include/contrib/moderngpu/include/device/loadstore.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/loadstore.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/loadstore.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/serialsets.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/serialsets.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/serialsets.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/serialsets.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/device/sortnetwork.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/device/sortnetwork.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/device/sortnetwork.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/device/sortnetwork.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/mgpudevice.cuh b/3rdparty/ctc_include/contrib/moderngpu/include/mgpudevice.cuh similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/mgpudevice.cuh rename to 3rdparty/ctc_include/contrib/moderngpu/include/mgpudevice.cuh diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/mgpuenums.h b/3rdparty/ctc_include/contrib/moderngpu/include/mgpuenums.h similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/mgpuenums.h rename to 3rdparty/ctc_include/contrib/moderngpu/include/mgpuenums.h diff --git a/src/operator/contrib/ctc_include/contrib/moderngpu/include/util/static.h b/3rdparty/ctc_include/contrib/moderngpu/include/util/static.h similarity index 100% rename from src/operator/contrib/ctc_include/contrib/moderngpu/include/util/static.h rename to 3rdparty/ctc_include/contrib/moderngpu/include/util/static.h diff --git 
a/src/operator/contrib/ctc_include/detail/cpu_ctc.h b/3rdparty/ctc_include/detail/cpu_ctc.h similarity index 100% rename from src/operator/contrib/ctc_include/detail/cpu_ctc.h rename to 3rdparty/ctc_include/detail/cpu_ctc.h diff --git a/src/operator/contrib/ctc_include/detail/ctc_helper.h b/3rdparty/ctc_include/detail/ctc_helper.h similarity index 100% rename from src/operator/contrib/ctc_include/detail/ctc_helper.h rename to 3rdparty/ctc_include/detail/ctc_helper.h diff --git a/src/operator/contrib/ctc_include/detail/gpu_ctc.h b/3rdparty/ctc_include/detail/gpu_ctc.h similarity index 100% rename from src/operator/contrib/ctc_include/detail/gpu_ctc.h rename to 3rdparty/ctc_include/detail/gpu_ctc.h diff --git a/src/operator/contrib/ctc_include/detail/gpu_ctc_kernels.h b/3rdparty/ctc_include/detail/gpu_ctc_kernels.h similarity index 100% rename from src/operator/contrib/ctc_include/detail/gpu_ctc_kernels.h rename to 3rdparty/ctc_include/detail/gpu_ctc_kernels.h diff --git a/src/operator/contrib/ctc_include/detail/hostdevice.h b/3rdparty/ctc_include/detail/hostdevice.h similarity index 100% rename from src/operator/contrib/ctc_include/detail/hostdevice.h rename to 3rdparty/ctc_include/detail/hostdevice.h diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 8a9e337f3c47..696803bd7723 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 8a9e337f3c4794876bd04d5351d967333bcabee3 +Subproject commit 696803bd7723ade8230af878460d96c68a550fbc diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 55416355d8aa..90e548f42e3b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -180,4 +180,6 @@ List of Contributors * [Per Goncalves da Silva](https://github.com/perdasilva) * [Zhijingcheng Yu](https://github.com/jasonyu1996) * [Cheng-Che Lee](https://github.com/stu1130) -* [Chaitanya Bapat](https://github.com/ChaiBapchya) \ No newline at end of file +* [Chaitanya Bapat](https://github.com/ChaiBapchya) +* [LuckyPigeon](https://github.com/LuckyPigeon) + 
diff --git a/Jenkinsfile b/Jenkinsfile index 81a25deca27b..af059c58e830 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -89,6 +89,30 @@ def python3_gpu_ut_nocudnn(docker_container_name) { } } +def deploy_docs() { + parallel 'Docs': { + node(NODE_LINUX_CPU) { + ws('workspace/docs') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('ubuntu_cpu', 'deploy_docs', false) + sh "ci/other/ci_deploy_doc.sh ${env.BRANCH_NAME} ${env.BUILD_NUMBER}" + } + } + } + }, + 'Julia docs': { + node(NODE_LINUX_CPU) { + ws('workspace/julia-docs') { + timeout(time: max_time, unit: 'MINUTES') { + utils.unpack_and_init('cpu', mx_lib) + utils.docker_run('ubuntu_cpu', 'deploy_jl_docs', false) + } + } + } + } +} + node('mxnetlinux-cpu') { // Loading the utilities requires a node context unfortunately checkout scm @@ -746,6 +770,16 @@ core_logic: { } } }, + 'Julia 0.6: CPU': { + node(NODE_LINUX_CPU) { + ws('workspace/ut-julia06-cpu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.unpack_and_init('cpu', mx_lib) + utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_julia06', false) + } + } + } + }, 'Python 2: CPU Win':{ node(NODE_WINDOWS_CPU) { @@ -911,15 +945,7 @@ core_logic: { } stage('Deploy') { - node(NODE_LINUX_CPU) { - ws('workspace/docs') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'deploy_docs', false) - sh "ci/other/ci_deploy_doc.sh ${env.BRANCH_NAME} ${env.BUILD_NUMBER}" - } - } - } + deploy_docs() } } , diff --git a/Makefile b/Makefile index 1c8d70ecc695..a4b41b8d8371 100644 --- a/Makefile +++ b/Makefile @@ -66,8 +66,8 @@ $(warning "USE_MKL2017 is deprecated. 
We will switch to USE_MKLDNN.") endif ifeq ($(USE_MKLDNN), 1) - MKLDNNROOT = $(ROOTDIR)/3rdparty/mkldnn/install - MKLROOT = $(ROOTDIR)/3rdparty/mkldnn/install + MKLDNNROOT = $(ROOTDIR)/3rdparty/mkldnn/build/install + MKLROOT = $(ROOTDIR)/3rdparty/mkldnn/build/install export USE_MKLML = 1 endif diff --git a/amalgamation/python/mxnet_predict.py b/amalgamation/python/mxnet_predict.py index 091bfbb1cf14..a91d3849b0d2 100644 --- a/amalgamation/python/mxnet_predict.py +++ b/amalgamation/python/mxnet_predict.py @@ -163,7 +163,7 @@ def forward(self, **kwargs): for k, v in kwargs.items(): if not isinstance(v, np.ndarray): raise ValueError("Expect numpy ndarray as input") - v = np.ascontiguousarray(v, dtype=np.float32) + v = np.asarray(v, dtype=np.float32, order='C') _check_call(_LIB.MXPredSetInput( self.handle, c_str(k), v.ctypes.data_as(mx_float_p), diff --git a/benchmark/python/control_flow/rnn.py b/benchmark/python/control_flow/rnn.py index 8a44a9cab174..08498724b1b4 100644 --- a/benchmark/python/control_flow/rnn.py +++ b/benchmark/python/control_flow/rnn.py @@ -32,6 +32,7 @@ _parser.add_argument('--benchmark', choices=["foreach", "while_loop"], required=True) _parser.add_argument('--warmup_rounds', type=int, default=20) _parser.add_argument('--test_rounds', type=int, default=100) +_parser.add_argument('--gpu', action='store_true') args = _parser.parse_args() @@ -66,8 +67,7 @@ def _func(*states): loop_vars=states, max_iterations=self.length, ) - assert len(out) == 1 - return out[0] + return out def _zeros(shape, ctx): @@ -124,7 +124,9 @@ def main(): cell_types = [gluon.rnn.RNNCell, gluon.rnn.GRUCell, gluon.rnn.LSTMCell] - ctxs = [mx.cpu(0)] + [mx.gpu(i) for i in _get_gpus()] + ctxs = [mx.cpu(0)] + if args.gpu: + ctxs = ctxs + [mx.gpu(i) for i in _get_gpus()] seq_lens = [100] batch_sizes = [1, 32] hidden_dims = [512] diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu b/ci/docker/Dockerfile.build.ubuntu_cpu index f45c8da4af87..7c7e2240ee61 100755 ---
a/ci/docker/Dockerfile.build.ubuntu_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_cpu @@ -45,6 +45,9 @@ RUN /work/ubuntu_r.sh COPY install/ubuntu_perl.sh /work/ RUN /work/ubuntu_perl.sh +COPY install/ubuntu_julia.sh /work/ +RUN /work/ubuntu_julia.sh + COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh diff --git a/ci/docker/install/docs_requirements b/ci/docker/install/docs_requirements index 7407223b3eed..4e3ce3e55e0b 100644 --- a/ci/docker/install/docs_requirements +++ b/ci/docker/install/docs_requirements @@ -6,7 +6,7 @@ h5py==2.8.0rc1 mock==2.0.0 nose==1.3.7 nose-timer==0.7.3 -numpy<1.15.0,>=1.8.2 +numpy<=1.15.2,>=1.8.2 pylint==1.8.3 pypandoc==1.4 recommonmark==0.4.0 diff --git a/ci/docker/install/ubuntu_julia.sh b/ci/docker/install/ubuntu_julia.sh new file mode 100755 index 000000000000..62013e36d8fd --- /dev/null +++ b/ci/docker/install/ubuntu_julia.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# build and install are separated so changes to build don't invalidate +# the whole docker cache for the image + +set -ex + +export JLBINARY='julia.tar.gz' +export JULIADIR='/work/julia' +export JULIA="${JULIADIR}/bin/julia" + +mkdir -p $JULIADIR +# The julia version in Ubuntu repo is too old +# We download the tarball from the official link: +# https://julialang.org/downloads/ +wget -O $JLBINARY https://julialang-s3.julialang.org/bin/linux/x64/0.6/julia-0.6.2-linux-x86_64.tar.gz +tar xzvf $JLBINARY -C $JULIADIR --strip 1 +rm $JLBINARY + +$JULIA -e 'versioninfo()' diff --git a/ci/docker/install/ubuntu_nightly_tests.sh b/ci/docker/install/ubuntu_nightly_tests.sh index 68358908bdc9..406985ea3a4a 100755 --- a/ci/docker/install/ubuntu_nightly_tests.sh +++ b/ci/docker/install/ubuntu_nightly_tests.sh @@ -32,5 +32,5 @@ apt-get -y install time apt-get install -y subversion maven -y #>/dev/null # Packages needed for the Straight Dope Nightly tests. -pip2 install pandas scikit-image -pip3 install pandas scikit-image +pip2 install pandas scikit-image prompt_toolkit +pip3 install pandas scikit-image prompt_toolkit diff --git a/ci/docker/install/ubuntu_python.sh b/ci/docker/install/ubuntu_python.sh index 0fd91cbf706c..a60516386652 100755 --- a/ci/docker/install/ubuntu_python.sh +++ b/ci/docker/install/ubuntu_python.sh @@ -29,5 +29,5 @@ wget -nv https://bootstrap.pypa.io/get-pip.py python3 get-pip.py python2 get-pip.py -pip2 install nose cpplint==1.3.0 pylint==1.9.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 -pip3 install nose cpplint==1.3.0 pylint==2.1.1 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip2 install nose cpplint==1.3.0 pylint==1.9.3 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip3 install nose cpplint==1.3.0 pylint==2.1.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 
boto3 diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index d1fc2239a442..96b1646eff97 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -816,6 +816,35 @@ unittest_ubuntu_gpu_R() { make rpkgtest R_LIBS=/tmp/r-site-library R_GPU_ENABLE=1 } +unittest_ubuntu_cpu_julia06() { + set -ex + export PATH="/work/julia/bin:$PATH" + export MXNET_HOME='/work/mxnet' + export JULIA_PKGDIR='/work/julia-pkg' + export DEPDIR=`julia -e 'print(Pkg.dir())'` + + julia -e 'versioninfo()' + julia -e 'Pkg.init()' + + # install package + ln -sf ${MXNET_HOME}/julia ${DEPDIR}/MXNet + + # install dependencies + julia -e 'Pkg.resolve()' + + # FIXME + export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so' + + # use the prebuilt binary from $MXNET_HOME/lib + julia -e 'Pkg.build("MXNet")' + + # run the script `julia/test/runtests.jl` + julia -e 'Pkg.test("MXNet")' + + # See https://github.com/dmlc/MXNet.jl/pull/303#issuecomment-341171774 + julia -e 'using MXNet; mx._sig_checker()' +} + unittest_centos7_cpu() { set -ex cd /work/mxnet @@ -1090,6 +1119,31 @@ deploy_docs() { popd } +deploy_jl_docs() { + set -ex + export PATH="/work/julia/bin:$PATH" + export MXNET_HOME='/work/mxnet' + export JULIA_PKGDIR='/work/julia-pkg' + export DEPDIR=`julia -e 'print(Pkg.dir())'` + + julia -e 'versioninfo()' + julia -e 'Pkg.init()' + ln -sf ${MXNET_HOME}/julia ${DEPDIR}/MXNet + julia -e 'Pkg.resolve()' + + # FIXME + export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so' + + # use the prebuilt binary from $MXNET_HOME/lib + julia -e 'Pkg.build("MXNet")' + # build docs + julia -e 'Pkg.add("Documenter")' + julia -e 'cd(Pkg.dir("MXNet")); include(joinpath("docs", "make.jl"))' + + # TODO: make Jenkins worker push to MXNet.jl ph-pages branch if master build + # ... 
+} + # broken_link_checker broken_link_checker() { diff --git a/ci/travis/install.sh b/ci/travis/install.sh index 16db601211a5..ae959767133a 100644 --- a/ci/travis/install.sh +++ b/ci/travis/install.sh @@ -17,14 +17,10 @@ # specific language governing permissions and limitations # under the License. +# Disable brew auto-update to avoid long running updates while running tests in CI. +export HOMEBREW_NO_AUTO_UPDATE=1 + if [ ${TRAVIS_OS_NAME} == "osx" ]; then - brew update brew install opencv - brew install python3 - brew install fftw - brew install libpng - brew install ImageMagick - brew install swig - python -m pip install --user nose numpy cython scipy requests mock - python3 -m pip install --user nose numpy cython scipy requests mock + python -m pip install --user nose numpy cython scipy requests mock nose-timer nose-exclude fi diff --git a/contrib/clojure-package/README.md b/contrib/clojure-package/README.md index 66ba77b76135..8c71224a3a55 100644 --- a/contrib/clojure-package/README.md +++ b/contrib/clojure-package/README.md @@ -10,6 +10,8 @@ The motivation for creating a Clojure package is to be able to open the deep lea For high leverage, the Clojure package has been built on the existing Scala package using interop. This has allowed rapid development and close parity with the Scala functionality. This also leaves the door open to directly developing code against the jni-bindings with Clojure in the future in an incremental fashion, using the test suites as a refactoring guide. +For a **video introduction**, see [Clojure MXNet with Carin Meier - Clojure Virtual Meetup](https://www.crowdcast.io/e/clojure-mxnet-with-carin) (setup instructions from 20:49) + ## Current State and Plans The Clojure package is nearing the end of its first development milestone which is to achieve a close parity with the Scala package. 
diff --git a/cpp-package/README.md b/cpp-package/README.md index 2b6e0e39f0fd..c4fe63c9ec58 100644 --- a/cpp-package/README.md +++ b/cpp-package/README.md @@ -1,21 +1,46 @@ # MXNet C++ Package -To build the C++ package, please refer to [this guide](). +The MXNet C++ Package provides C++ API bindings to the users of MXNet. Currently, these bindings are not available as a standalone package. +The users of these bindings are required to build this package as mentioned below. -A basic tutorial can be found at . +## Building C++ Package -The example directory contains examples for you to get started. +The cpp-package directory contains the implementation of C++ API. As mentioned above, users are required to build this directory or package before using it. +**The cpp-package is built while building the MXNet shared library, *libmxnet.so*.** + +### Steps to build the C++ package: +1. Building the MXNet C++ package requires building MXNet from source. +2. Clone the MXNet GitHub repository **recursively** to ensure the code in submodules is available for building MXNet. + ``` + git clone --recursive https://github.com/apache/incubator-mxnet mxnet + ``` + +3. Install the [prerequisites](), desired [BLAS libraries]() and optional [OpenCV, CUDA, and cuDNN]() for building MXNet from source. +4. There is a configuration file for make, [make/config.mk]() that contains all the compilation options. You can edit this file and set the appropriate options prior to running the **make** command. +5. Please refer to [platform specific build instructions]() and available [build configurations](https://mxnet.incubator.apache.org/install/build_from_source#build-configurations) for more details. +6. For enabling the build of C++ Package, set the **USE\_CPP\_PACKAGE = 1** in [make/config.mk](). Optionally, the compilation flag can also be specified on **make** command line as follows.
+ ``` + make -j USE_CPP_PACKAGE=1 + ``` + +## Usage -## Building C++ examples in examples folder +In order to consume the C++ API please follow the steps below. -From cpp-package/examples directory -- Build all examples in release mode: **make all** -- Build all examples in debug mode : **make debug** +1. Ensure that the MXNet shared library is built from source with the **USE\_CPP\_PACKAGE = 1**. +2. Include the [MxNetCpp.h]() in the program that is going to consume MXNet C++ API. + ``` + #include "mxnet-cpp/MxNetCpp.h" + ``` +3. While building the program, ensure that the correct paths to the directories containing header files and the MXNet shared library are specified to the compiler and the linker. +4. The program links the MXNet shared library dynamically. Hence the library needs to be accessible to the program during runtime. This can be achieved by including the path to the shared library in the environment variable **LD\_LIBRARY\_PATH** for Linux, Mac, and Ubuntu OS and **PATH** for Windows OS. -By default, the examples are build to be run on GPU. -To build examples to run on CPU: -- Release: **make all MXNET_USE_CPU=1** -- Debug: **make debug MXNET_USE_CPU=1** +## Tutorial + +A basic tutorial can be found at . + +## Examples + +The example directory contains examples for you to get started. -The makefile will also download the necessary data files and store in data folder. (The download will take couple of minutes, but will be done only once on a fresh installation.) diff --git a/cpp-package/example/README.md b/cpp-package/example/README.md new file mode 100644 index 000000000000..76f6a0127c0b --- /dev/null +++ b/cpp-package/example/README.md @@ -0,0 +1,170 @@ +# MXNet C++ Package Examples + +## Building C++ examples + +The examples are built while building the MXNet library and cpp-package from source.
However, they can be built manually as follows + +From cpp-package/examples directory + +- Build all examples in release mode: **make all** +- Build all examples in debug mode: **make debug** + +<<<<<<< HEAD +By default, the examples are built to be run on GPU. To build examples to run on CPU: +======= +By default, the examples are build to be run on GPU. To build examples to run on CPU: +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + +- Release: **make all MXNET\_USE\_CPU=1** +- Debug: **make debug MXNET\_USE\_CPU=1** + +<<<<<<< HEAD +The examples that are built to be run on GPU may not work on the non-GPU machines. +The makefile will also download the necessary data files and store in a data folder. (The download will take couple of minutes, but will be done only once on a fresh installation.) +======= +The examples that are build to be run on GPU may not work on the non-GPU machines. +The makefile will also download the necessary data files and store in data folder. (The download will take couple of minutes, but will be done only once on a fresh installation.) +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + + +## Examples + +<<<<<<< HEAD +This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS. + +### [alexnet.cpp]() + +The example implements the C++ version of AlexNet. The networks trains on MNIST data. The number of epochs can be specified as a command line argument. For example to train with 10 epochs use the following: + +======= +This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable such as _LD\_LIBRARY\_PATH_ . + +### [alexnet.cpp]() + +The example implements C++ version of AlexNet. The networks trains the MNIST data. 
The number of epochs can be specified as command line arguement. For example: +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + ``` + ./alexnet 10 + ``` + +<<<<<<< HEAD +### [googlenet.cpp]() + +The code implements a GoogLeNet/Inception network using the C++ API. The example uses MNIST data to train the network. By default, the example trains the model for 100 epochs. The number of epochs can also be specified in the command line. For example, to train the model for 10 epochs use the following: +======= +### [charRNN.cpp]() + +The code implements C++ version charRNN for mxnet\example\rnn\char-rnn.ipynb with MXNet.cpp API. The generated params file is compatiable with python version. The train() and predict() has been verified with original data samples. + +The example expects arguments as follows: + +``` + ./charRNN train [BuildIn\ [TImeMajor] {corpus file} { batch size} { max epoch} [{starting epoch}] + ./charRNN predict [BuildIn\ [TImeMajor] {param file} { batch size} { max epoch} [{starting epoch}] +``` + +### [googlenet.cpp]() + +The code implements GoogLeNet/Inception network using C++ API. The example uses MNIST data to train the network. The number of epochs can be specified in the command line as follows. If not specified, the model trains for 100 epochs. +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + +``` +./googlenet 10 +``` + +### [mlp.cpp]() + +<<<<<<< HEAD +The code implements a multilayer perceptron from scratch. The example creates its own dummy data to train the model. The example does not require command line parameters. It trains the model for 20,000 epochs. +To run the example use the following command: +======= +The code implements multilayer perceptron from scratch. The example creates its own dummy data to train the model. The example does not require command line parameters. It trains the model for 20000 iterations. 
+>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + +``` +./mlp +``` + +### [mlp_cpu.cpp]() + +<<<<<<< HEAD +The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of "SimpleBind" C++ API and MNISTIter. The example is designed to work on CPU. The example does not require command line parameters. +To run the example use the following command: +======= +The code implements multilayer perceptron to train the MNIST data. The code demonstrates the use of "SimpleBind" C++ API and MNISTIter. The example is designed to work on CPU. The example does not require command line parameters. + +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 +``` +./mlp_cpu +``` + +### [mlp_gpu.cpp]() +<<<<<<< HEAD +The code implements multilayer perceptron to train the MNIST data. The code demonstrates the use of the "SimpleBind" C++ API and MNISTIter. The example is designed to work on GPU. The example does not require command line arguments. To run the example execute following command: +======= +The code implements multilayer perceptron to train the MNIST data. The code demonstrates the use of "SimpleBind" C++ API and MNISTIter. The example is designed to work on GPU. The example does not require command line paratmeters. +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + +``` +./mlp_gpu +``` + +### [mlp_csv.cpp]() +<<<<<<< HEAD +The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of the "SimpleBind" C++ API and CSVIter. The CSVIter can iterate data that is in CSV format. The example can be run on CPU or GPU. The example usage is as follows: +======= +The code implements multilayer perceptron to train the MNIST data. The code demonstrates the use of "SimpleBind" C++ API and CSVIter. The CSVIter can iterate data that is in CSV format. The example can be run on CPU or GPU. 
The example usage is as follows: +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + +``` +mlp_csv --train mnist_training_set.csv --test mnist_test_set.csv --epochs 10 --batch_size 100 --hidden_units "128,64,64 [--gpu]" +``` + +### [resnet.cpp]() + +<<<<<<< HEAD +The code implements a resnet model using the C++ API. The model is used to train MNIST data. The number of epochs for training the model can be specified on the command line. By default, model is trained for 100 epochs. For example, to train with 10 epochs use the following command: +======= +The code implements resnet model using C++ API. The model is used to train MNIST data. The number of epochs for training the model can be specified on the command line. By default, model is trained for 100 epochs. +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + +``` +./resnet 10 +``` + +### [lenet.cpp]() + +<<<<<<< HEAD +The code implements a lenet model using the C++ API. It uses MNIST training data in CSV format to train the network. The example does not use built-in CSVIter to read the data from CSV file. The number of epochs can be specified on the command line. By default, the mode is trained for 100,000 epochs. For example, to train with 10 epochs use the following command: +======= +The code implements lenet model using C++ API. It uses MNIST training data in CSV format to train the network. The example does not use built-in CSVIter to read the data from CSV file. The number of epochs can be specified on the command line. By default, the mode is trained for 100000 epochs. +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + +``` +./lenet 10 +``` +### [lenet\_with\_mxdataiter.cpp]() + +<<<<<<< HEAD +The code implements a lenet model using the C++ API. It uses MNIST training data to train the network. The example uses built-in MNISTIter to read the data. The number of epochs can be specified on the command line. By default, the mode is trained for 100 epochs. 
For example, to train with 10 epochs use the following command: +======= +The code implements lenet model using C++ API. It uses MNIST training data to train the network. The example uses built-in MNISTIter to read the data. The number of epochs can be specified on the command line. By default, the model is trained for 100 epochs. +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + +``` +./lenet\_with\_mxdataiter 10 +``` + +In addition, there is `run_lenet_with_mxdataiter.sh` that downloads the mnist data and runs the `lenet_with_mxdataiter` example. + +### [inception_bn.cpp]() + +<<<<<<< HEAD +The code implements an Inception network using the C++ API with batch normalization. The example uses MNIST data to train the network. The model trains for 100 epochs. The example can be run by executing the following command: +======= +The code implements Inception network using C++ API with batch normalization. The example uses MNIST data to train the network. The model trains for 100 epochs. +>>>>>>> 39054b349e83ead13127cd1bd6b90e3141bc0451 + +``` +./inception_bn +``` diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp new file mode 100644 index 000000000000..8aec4b76d917 --- /dev/null +++ b/cpp-package/example/mlp_csv.cpp @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Example: mlp_csv + * Description: + * The following example demonstrates how to use CSVIter. This example creates + * mlp (multi-layer perceptron) model and trains the MNIST data which is in + * CSV format. + */ +#include +#include "utils.h" +#include "mxnet-cpp/MxNetCpp.h" + +using namespace mxnet::cpp; + +/* + * Implementing the mlp symbol with given hidden units configuration. + */ +Symbol mlp(const std::vector &hidden_units) { + auto data = Symbol::Variable("data"); + auto label = Symbol::Variable("label"); + + std::vector weights(hidden_units.size()); + std::vector biases(hidden_units.size()); + std::vector outputs(hidden_units.size()); + + for (size_t i = 0; i < hidden_units.size(); ++i) { + weights[i] = Symbol::Variable("w" + std::to_string(i)); + biases[i] = Symbol::Variable("b" + std::to_string(i)); + Symbol fc = FullyConnected( + i == 0? data : outputs[i-1], // data + weights[i], + biases[i], + hidden_units[i]); + outputs[i] = i == hidden_units.size()-1 ? fc : Activation(fc, ActivationActType::kRelu); + } + return SoftmaxOutput(outputs.back(), label); +} + +/* + * Convert the input string of number of hidden units into the vector of integers. + */ +std::vector getLayers(const std::string& hidden_units_string) { + std::vector hidden_units; + char *pNext; + int num_unit = strtol(hidden_units_string.c_str(), &pNext, 10); + hidden_units.push_back(num_unit); + while (*pNext) { + num_unit = strtol(pNext, &pNext, 10); + hidden_units.push_back(num_unit); + } + return hidden_units; +} + +void printUsage() { + std::cout << "Usage:" << std::endl; + std::cout << "mlp_csv --train mnist_training_set.csv --test mnist_test_set.csv --epochs 10 " + << "--batch_size 100 --hidden_units \"128 64 64\" [--gpu]" << std::endl; + std::cout << "The example uses mnist data in CSV format. 
The MNIST data in CSV format assumes " + << "the column 0 to be label and the rest 784 column to be data." << std::endl; + std::cout << "By default, the example uses 'cpu' context. If '--gpu' is specified, " + << "program uses 'gpu' context." <