diff --git a/.gitignore b/.gitignore
index c7530ab69c6a..7eb8e7d6e777 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,10 +167,6 @@ python/.eggs
tests/Makefile
tests/mxnet_unit_tests
-# generated wrappers for ccache
-cc
-cxx
-
# Code coverage related
.coverage
*.gcov
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b8bbd2e0272..f09a9c29e9d8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,7 +20,7 @@ mxnet_option(USE_F16C "Build with x86 F16C instruction support" ON)
mxnet_option(USE_LAPACK "Build with lapack support" ON)
mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
mxnet_option(USE_MKLML_MKL "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE))
-mxnet_option(USE_MKLDNN "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE))
+mxnet_option(USE_MKLDNN "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE)
mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON IF NOT MSVC)
mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON)
mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON)
diff --git a/CODEOWNERS b/CODEOWNERS
index 5a88e89dfb02..ce648ef2e087 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -13,14 +13,14 @@
# Language bindings
/R-package/ @thirdwing
-/scala-package/ @yzhliu @nswamy
+/scala-package/ @yzhliu @nswamy @pllarroy
/perl-package/ @sergeykolychev
-/python/ @szha
+/python/ @szha @pllarroy
/contrib/clojure-package/ @gigasquid
# C++ base
/src/kvstore/ @rahul003 @anirudh2290
-/include/ @anirudh2290
+/include/ @anirudh2290 @pllarroy
/src/c_api/ @anirudh2290
/src/common/ @anirudh2290
/src/engine/ @anirudh2290
@@ -33,13 +33,17 @@
/src/profiler/ @anirudh2290
/src/storage/ @anirudh2290
/tests/cpp/ @anirudh2290
-/cpp-package/ @nswamy
+/cpp-package/ @nswamy @pllarroy
+/src/ @pllarroy
+/plugin/ @pllarroy
# CMake
-CMakeLists.txt @szha @rahul003
-/cmake/ @szha @rahul003
+CMakeLists.txt @szha @rahul003 @pllarroy
+/cmake/ @szha @rahul003 @pllarroy
# MXNet CI
+dev_menu.py @pllarroy
+/ci/ @pllarroy
/tests/ci_build/ @marcoabreu
Jenkinsfile @marcoabreu
.travis.yml @marcoabreu
@@ -50,16 +54,16 @@ Makefile @szha
prepare_mkl.sh @szha
# Docs
-/docs/ @szha
+/docs/ @szha @pllarroy
# Submodules
.gitmodules @szha
# Examples
-/example/ @szha
+/example/ @szha @pllarroy
# Tools
-/tools/ @szha
+/tools/ @szha @pllarroy
# Github templates
/.github/ @szha
diff --git a/MKLDNN_README.md b/MKLDNN_README.md
index 2618d23388e7..6b25fee85195 100644
--- a/MKLDNN_README.md
+++ b/MKLDNN_README.md
@@ -1,9 +1,9 @@
# Build/Install MXNet with MKL-DNN
-A better training and inference perforamce are expected to achieved on Intel-Architecture CPUs with MXNET built with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) on multiple operating system, including Linux, Windows and MacOS.
-In the following sections, you will find building instructions for MXNET with Intel MKL-DNN on Linux, MacOS and Windows.
+A better training and inference performance is expected to be achieved on Intel-Architecture CPUs with MXNet built with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) on multiple operating systems, including Linux, Windows and MacOS.
+In the following sections, you will find build instructions for MXNet with Intel MKL-DNN on Linux, MacOS and Windows.
-The detailed performance data collected on Intel Xeon CPU with MXNET built with Intel MKL-DNN can be found at [here](https://mxnet.incubator.apache.org/faq/perf.html#intel-cpu).
+The detailed performance data collected on Intel Xeon CPU with MXNet built with Intel MKL-DNN can be found [here](https://mxnet.incubator.apache.org/faq/perf.html#intel-cpu).
Contents
@@ -83,7 +83,7 @@ LIBRARY_PATH=$(brew --prefix llvm)/lib/ make -j $(sysctl -n hw.ncpu) CC=$(brew -
Windows
-On Windows, you can use [Micrsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) and [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) to compile MXNET with Intel MKL-DNN.
+On Windows, you can use [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) and [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) to compile MXNet with Intel MKL-DNN.
[Micrsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is recommended.
**Visual Studio 2015**
@@ -123,7 +123,7 @@ cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -D
These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
Also ```libmkldnn.dll``` with be in the ```./build/3rdparty/mkldnn/src/Release/```
-6. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading mxnet.
+6. Make sure that all the dll files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convenience, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading MXNet.
**Visual Studio 2017**
@@ -177,7 +177,7 @@ cmake -G "Visual Studio 15 2017 Win64" .. -T host=x64 -DUSE_CUDA=0 -DUSE_CUDNN=0
msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount
```
-9. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading mxnet.
+9. Make sure that all the dll files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convenience, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading MXNet.
Verify MXNet with python
diff --git a/Makefile b/Makefile
index f15968bfe526..08539df7ea12 100644
--- a/Makefile
+++ b/Makefile
@@ -60,6 +60,10 @@ endif
# use customized config file
include $(config)
+ifndef USE_MKLDNN
+ USE_MKLDNN = 1
+endif
+
ifeq ($(USE_MKL2017), 1)
$(warning "USE_MKL2017 is deprecated. We will switch to USE_MKLDNN.")
USE_MKLDNN=1
@@ -463,7 +467,7 @@ build/src/%.o: src/%.cc | mkldnn
build/src/%_gpu.o: src/%.cu | mkldnn
@mkdir -p $(@D)
- $(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" -M -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d
+ $(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" --generate-dependencies -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d
$(NVCC) -c -o $@ $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" $<
# A nvcc bug cause it to generate "generic/xxx.h" dependencies from torch headers.
@@ -479,7 +483,7 @@ build/plugin/%.o: plugin/%.cc
%_gpu.o: %.cu
@mkdir -p $(@D)
- $(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS) -Isrc/operator" -M -MT $*_gpu.o $< >$*_gpu.d
+ $(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS) -Isrc/operator" --generate-dependencies -MT $*_gpu.o $< >$*_gpu.d
$(NVCC) -c -o $@ $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS) -Isrc/operator" $<
%.o: %.cc $(CORE_INC)
@@ -686,7 +690,7 @@ rclean:
ifneq ($(EXTRA_OPERATORS),)
clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN)
- $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~
+ $(RM) -r build lib bin deps *~ */*~ */*/*~ */*/*/*~
cd $(DMLC_CORE); $(MAKE) clean; cd -
cd $(PS_PATH); $(MAKE) clean; cd -
cd $(NNVM_PATH); $(MAKE) clean; cd -
diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7
index c601fc5e5ff7..a2e98cd2efe1 100644
--- a/ci/docker/Dockerfile.build.android_armv7
+++ b/ci/docker/Dockerfile.build.android_armv7
@@ -75,6 +75,11 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
WORKDIR /work
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
COPY runtime_functions.sh /work/
WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8
index 60376b8efda2..f7de86763457 100644
--- a/ci/docker/Dockerfile.build.android_armv8
+++ b/ci/docker/Dockerfile.build.android_armv8
@@ -74,6 +74,12 @@ ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++
COPY install/android_arm64_openblas.sh /work/
RUN /work/android_arm64_openblas.sh
ENV CPLUS_INCLUDE_PATH /work/deps/OpenBLAS
-WORKDIR /work/build
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
COPY runtime_functions.sh /work/
+
+WORKDIR /work/build
\ No newline at end of file
diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6
index 6f16d8c77a0a..60e223b7a60f 100644
--- a/ci/docker/Dockerfile.build.armv6
+++ b/ci/docker/Dockerfile.build.armv6
@@ -38,5 +38,10 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
COPY install/deb_ubuntu_ccache.sh /work/
RUN /work/deb_ubuntu_ccache.sh
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
COPY runtime_functions.sh /work/
WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7
index 5f0223448f12..0b557d5839e9 100644
--- a/ci/docker/Dockerfile.build.armv7
+++ b/ci/docker/Dockerfile.build.armv7
@@ -38,5 +38,10 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
COPY install/deb_ubuntu_ccache.sh /work/
RUN /work/deb_ubuntu_ccache.sh
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
COPY runtime_functions.sh /work/
WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8
index 27bd425ae9b7..ef9c95865590 100644
--- a/ci/docker/Dockerfile.build.armv8
+++ b/ci/docker/Dockerfile.build.armv8
@@ -42,5 +42,10 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
COPY install/deb_ubuntu_ccache.sh /work/
RUN /work/deb_ubuntu_ccache.sh
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
COPY runtime_functions.sh /work/
WORKDIR /work/build
diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index d128ebc7e2a7..07097887f87d 100644
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -77,10 +77,16 @@ RUN JETPACK_DOWNLOAD_PREFIX=https://developer.download.nvidia.com/devzone/devcen
dpkg -i --force-architecture $ARM_NVINFER_INSTALLER_PACKAGE && \
dpkg -i --force-architecture $ARM_NVINFER_DEV_INSTALLER_PACKAGE && \
apt update -y || true && apt install -y cuda-libraries-dev-9-0 libcudnn7-dev libnvinfer-dev
+RUN ln -s /usr/include/aarch64-linux-gnu/cudnn_v7.h /usr/include/aarch64-linux-gnu/cudnn.h
ENV PATH $PATH:/usr/local/cuda/bin
ENV NVCCFLAGS "-m64"
ENV CUDA_ARCH "-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62"
ENV NVCC /usr/local/cuda/bin/nvcc
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
COPY runtime_functions.sh /work/
WORKDIR /work/mxnet
diff --git a/ci/docker/install/centos7_adduser.sh b/ci/docker/install/centos7_adduser.sh
index ba72c9b92281..f9d2402c9554 100755
--- a/ci/docker/install/centos7_adduser.sh
+++ b/ci/docker/install/centos7_adduser.sh
@@ -34,4 +34,9 @@ then
mkdir /work/mxnet
mkdir /work/build
chown -R jenkins_slave /work/
+
+ # Later on, we have to override the links because underlying build systems ignore our compiler settings. Thus,
+ # we have to give the process the proper permission to these files. This is hacky, but unfortunately
+ # there's no better way to do this without patching all our submodules.
+ chown -R jenkins_slave /usr/local/bin
fi
diff --git a/ci/docker/install/ubuntu_adduser.sh b/ci/docker/install/ubuntu_adduser.sh
index 515a80f63b07..a7668bac2ab6 100755
--- a/ci/docker/install/ubuntu_adduser.sh
+++ b/ci/docker/install/ubuntu_adduser.sh
@@ -40,4 +40,9 @@ then
mkdir /work/mxnet
mkdir /work/build
chown -R jenkins_slave /work/
+
+ # Later on, we have to override the links because underlying build systems ignore our compiler settings. Thus,
+ # we have to give the process the proper permission to these files. This is hacky, but unfortunately
+ # there's no better way to do this without patching all our submodules.
+ chown -R jenkins_slave /usr/local/bin
fi
diff --git a/ci/docker/install/ubuntu_publish.sh b/ci/docker/install/ubuntu_publish.sh
new file mode 100644
index 000000000000..bc3513dd13e5
--- /dev/null
+++ b/ci/docker/install/ubuntu_publish.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Build on Ubuntu 14.04 LTS for LINUX CPU/GPU
+apt-get update
+apt-get install -y software-properties-common
+add-apt-repository ppa:ubuntu-toolchain-r/test -y
+add-apt-repository ppa:openjdk-r/ppa -y # Java lib
+apt-get update
+apt-get install -y git \
+ cmake3 \
+ libcurl4-openssl-dev \
+ unzip \
+ gcc-4.8 \
+ g++-4.8 \
+ gfortran \
+ gfortran-4.8 \
+ binutils \
+ nasm \
+ libtool \
+ curl \
+ pandoc \
+ python3-pip \
+ automake \
+ pkg-config \
+ openjdk-8-jdk
+curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
+tar xzf apache-maven-3.3.9-bin.tar.gz
+mkdir /usr/local/maven
+mv apache-maven-3.3.9/ /usr/local/maven/
+update-alternatives --install /usr/bin/mvn mvn /usr/local/maven/apache-maven-3.3.9/bin/mvn 1
+update-ca-certificates -f
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 6dd5bb6f239d..53cd9ba2d4c6 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -39,32 +39,59 @@ clean_repo() {
build_ccache_wrappers() {
set -ex
- rm -f cc
- rm -f cxx
-
- touch cc
- touch cxx
-
if [ -z ${CC+x} ]; then
echo "No \$CC set, defaulting to gcc";
export CC=gcc
fi
-
- if [ -z ${CXX+x} ]; then
+ if [ -z ${CXX+x} ]; then
echo "No \$CXX set, defaulting to g++";
export CXX=g++
fi
- # this function is nessesary for cuda enabled make based builds, since nvcc needs just an executable for -ccbin
-
- echo -e "#!/bin/sh\n/usr/local/bin/ccache ${CC} \"\$@\"\n" >> cc
- echo -e "#!/bin/sh\n/usr/local/bin/ccache ${CXX} \"\$@\"\n" >> cxx
-
- chmod +x cc
- chmod +x cxx
-
- export CC=`pwd`/cc
- export CXX=`pwd`/cxx
+ # Recommended by CCache: https://ccache.samba.org/manual.html#_run_modes
+ # Add to the beginning of path to ensure this redirection is picked up instead
+ # of the original ones. Especially CUDA/NVCC appends itself to the beginning of the
+ # path and thus this redirect is ignored. This change fixes this problem
+ # This hacky approach with symbolic links is required because underlying build
+ # systems of our submodules ignore our CMake settings. If they use Makefile,
+ # we can't influence them at all in general and NVCC also prefers to hardcode their
+ # compiler instead of respecting the settings. Thus, we take this brutal approach
+ # and just redirect everything once this installer has been called.
+ # In future, we could do these links during image build time of the container.
+ # But in the beginning, we'll make this opt-in. In future, loads of processes like
+ # the scala make step or numpy compilation and other pip package generations
+ # could be heavily sped up by using ccache as well.
+ mkdir -p /tmp/ccache-redirects
+ export PATH=/tmp/ccache-redirects:$PATH
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/gcc
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/gcc-8
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/g++
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/g++-8
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/nvcc
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/clang++-3.9
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/clang-3.9
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/clang++-5.0
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/clang-5.0
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/clang++-6.0
+ ln -sf /usr/local/bin/ccache /tmp/ccache-redirects/clang-6.0
+ ln -sf /usr/local/bin/ccache /usr/local/bin/gcc
+ ln -sf /usr/local/bin/ccache /usr/local/bin/gcc-8
+ ln -sf /usr/local/bin/ccache /usr/local/bin/g++
+ ln -sf /usr/local/bin/ccache /usr/local/bin/g++-8
+ ln -sf /usr/local/bin/ccache /usr/local/bin/nvcc
+ ln -sf /usr/local/bin/ccache /usr/local/bin/clang++-3.9
+ ln -sf /usr/local/bin/ccache /usr/local/bin/clang-3.9
+ ln -sf /usr/local/bin/ccache /usr/local/bin/clang++-5.0
+ ln -sf /usr/local/bin/ccache /usr/local/bin/clang-5.0
+ ln -sf /usr/local/bin/ccache /usr/local/bin/clang++-6.0
+ ln -sf /usr/local/bin/ccache /usr/local/bin/clang-6.0
+
+ export NVCC=ccache
+
+ # Uncomment if you would like to debug CCache hit rates.
+ # You can monitor using tail -f ccache-log
+ # export CCACHE_LOGFILE=/work/mxnet/ccache-log
+ # export CCACHE_DEBUG=1
}
build_wheel() {
@@ -106,6 +133,8 @@ build_jetson() {
set -ex
pushd .
+ #build_ccache_wrappers
+
cp make/crosscompile.jetson.mk ./config.mk
make -j$(nproc)
@@ -129,6 +158,7 @@ build_armv6() {
# We do not need OpenMP, since most armv6 systems have only 1 core
+ build_ccache_wrappers
cmake \
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -159,6 +189,7 @@ build_armv7() {
# file tries to add -llapack. Lapack functionality though, requires -lgfortran
# to be linked additionally.
+ build_ccache_wrappers
cmake \
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
-DCMAKE_CROSSCOMPILING=ON \
@@ -181,6 +212,7 @@ build_armv7() {
}
build_armv8() {
+ build_ccache_wrappers
cmake \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -205,6 +237,7 @@ build_armv8() {
build_android_armv7() {
set -ex
cd /work/build
+ build_ccache_wrappers
cmake \
-DANDROID=ON\
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -225,6 +258,7 @@ build_android_armv7() {
build_android_armv8() {
set -ex
cd /work/build
+ build_ccache_wrappers
cmake\
-DANDROID=ON \
-DUSE_CUDA=OFF\
@@ -244,7 +278,7 @@ build_centos7_cpu() {
cd /work/mxnet
export CC="ccache gcc"
export CXX="ccache g++"
-
+ build_ccache_wrappers
make \
DEV=1 \
USE_LAPACK=1 \
@@ -253,10 +287,17 @@ build_centos7_cpu() {
USE_BLAS=openblas \
USE_DIST_KVSTORE=1 \
-j$(nproc)
+ cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
+ mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
+ cp lib/libmklml_intel.so lib/libmklml_intel.so.tmp
+ mv lib/libmklml_intel.so.tmp lib/libmklml_intel.so
+ cp lib/libiomp5.so lib/libiomp5.so.tmp
+ mv lib/libiomp5.so.tmp lib/libiomp5.so
}
build_amzn_linux_cpu() {
cd /work/build
+ build_ccache_wrappers
cmake \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -279,13 +320,12 @@ build_centos7_mkldnn() {
cd /work/mxnet
export CC="ccache gcc"
export CXX="ccache g++"
-
+ build_ccache_wrappers
make \
DEV=1 \
ENABLE_TESTCOVERAGE=1 \
USE_LAPACK=1 \
USE_LAPACK_PATH=/usr/lib64/liblapack.so \
- USE_MKLDNN=1 \
USE_BLAS=openblas \
-j$(nproc)
}
@@ -294,7 +334,7 @@ build_centos7_gpu() {
set -ex
cd /work/mxnet
# unfortunately this build has problems in 3rdparty dependencies with ccache and make
- # build_ccache_wrappers
+ build_ccache_wrappers
make \
DEV=1 \
ENABLE_TESTCOVERAGE=1 \
@@ -307,6 +347,12 @@ build_centos7_gpu() {
USE_DIST_KVSTORE=1 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
-j$(nproc)
+ cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
+ mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
+ cp lib/libmklml_intel.so lib/libmklml_intel.so.tmp
+ mv lib/libmklml_intel.so.tmp lib/libmklml_intel.so
+ cp lib/libiomp5.so lib/libiomp5.so.tmp
+ mv lib/libiomp5.so.tmp lib/libiomp5.so
}
build_ubuntu_cpu() {
@@ -315,8 +361,9 @@ build_ubuntu_cpu() {
build_ubuntu_cpu_openblas() {
set -ex
- export CC="ccache gcc"
- export CXX="ccache g++"
+ export CC="gcc"
+ export CXX="g++"
+ build_ccache_wrappers
make \
DEV=1 \
ENABLE_TESTCOVERAGE=1 \
@@ -344,6 +391,7 @@ build_ubuntu_cpu_cmake_debug() {
set -ex
pushd .
cd /work/build
+ build_ccache_wrappers
cmake \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -365,11 +413,12 @@ build_ubuntu_cpu_cmake_asan() {
pushd .
cd /work/build
+ export CXX=g++-8
+ export CC=gcc-8
+ build_ccache_wrappers
cmake \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
- -DCMAKE_CXX_COMPILER=/usr/bin/g++-8 \
- -DCMAKE_C_COMPILER=/usr/bin/gcc-8 \
-DUSE_CUDA=OFF \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_OPENMP=OFF \
@@ -391,10 +440,10 @@ build_ubuntu_cpu_cmake_asan() {
build_ubuntu_cpu_clang39() {
set -ex
- export CXX=clang++-3.9
+ export CXX=clang++-3.9
export CC=clang-3.9
- build_ccache_wrappers
- make \
+ build_ccache_wrappers
+ make \
ENABLE_TESTCOVERAGE=1 \
USE_CPP_PACKAGE=1 \
USE_BLAS=openblas \
@@ -429,6 +478,7 @@ build_ubuntu_cpu_clang_tidy() {
pushd .
cd /work/build
+ build_ccache_wrappers
cmake \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -458,7 +508,6 @@ build_ubuntu_cpu_clang39_mkldnn() {
ENABLE_TESTCOVERAGE=1 \
USE_CPP_PACKAGE=1 \
USE_BLAS=openblas \
- USE_MKLDNN=1 \
USE_OPENMP=0 \
-j$(nproc)
}
@@ -475,7 +524,6 @@ build_ubuntu_cpu_clang60_mkldnn() {
ENABLE_TESTCOVERAGE=1 \
USE_CPP_PACKAGE=1 \
USE_BLAS=openblas \
- USE_MKLDNN=1 \
USE_OPENMP=1 \
-j$(nproc)
}
@@ -490,7 +538,6 @@ build_ubuntu_cpu_mkldnn() {
ENABLE_TESTCOVERAGE=1 \
USE_CPP_PACKAGE=1 \
USE_BLAS=openblas \
- USE_MKLDNN=1 \
-j$(nproc)
}
@@ -526,6 +573,8 @@ build_ubuntu_gpu_tensorrt() {
mkdir -p build
cd build
cmake \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_FLAGS=-I/usr/include/python${PYVER}\
-DBUILD_SHARED_LIBS=ON ..\
-G Ninja
@@ -540,7 +589,10 @@ build_ubuntu_gpu_tensorrt() {
cd 3rdparty/onnx-tensorrt/
mkdir -p build
cd build
- cmake ..
+ cmake \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ ..
make -j$(nproc)
export LIBRARY_PATH=`pwd`:$LIBRARY_PATH
popd
@@ -578,7 +630,6 @@ build_ubuntu_gpu_mkldnn() {
ENABLE_TESTCOVERAGE=1 \
USE_CPP_PACKAGE=1 \
USE_BLAS=openblas \
- USE_MKLDNN=1 \
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
@@ -595,7 +646,6 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
DEV=1 \
ENABLE_TESTCOVERAGE=1 \
USE_BLAS=openblas \
- USE_MKLDNN=1 \
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=0 \
@@ -618,11 +668,16 @@ build_ubuntu_gpu_cuda91_cudnn7() {
USE_DIST_KVSTORE=1 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
-j$(nproc)
+ cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
+ mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
+ cp lib/libiomp5.so lib/libiomp5.so.tmp
+ mv lib/libiomp5.so.tmp lib/libiomp5.so
}
build_ubuntu_amalgamation() {
set -ex
# Amalgamation can not be run with -j nproc
+ build_ccache_wrappers
make -C amalgamation/ clean
make -C amalgamation/ \
USE_BLAS=openblas \
@@ -632,6 +687,7 @@ build_ubuntu_amalgamation() {
build_ubuntu_amalgamation_min() {
set -ex
# Amalgamation can not be run with -j nproc
+ build_ccache_wrappers
make -C amalgamation/ clean
make -C amalgamation/ \
USE_BLAS=openblas \
@@ -642,14 +698,15 @@ build_ubuntu_amalgamation_min() {
build_ubuntu_gpu_cmake_mkldnn() {
set -ex
cd /work/build
+ build_ccache_wrappers
cmake \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DENABLE_TESTCOVERAGE=ON \
-DUSE_CUDA=1 \
-DUSE_CUDNN=1 \
-DUSE_MKLML_MKL=1 \
- -DUSE_MKLDNN=1 \
-DCMAKE_BUILD_TYPE=Release \
-DCUDA_ARCH_NAME=Manual \
-DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \
@@ -665,9 +722,11 @@ build_ubuntu_gpu_cmake_mkldnn() {
build_ubuntu_gpu_cmake() {
set -ex
cd /work/build
+ build_ccache_wrappers
cmake \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DENABLE_TESTCOVERAGE=ON \
-DUSE_CUDA=1 \
-DUSE_CUDNN=1 \
@@ -681,6 +740,9 @@ build_ubuntu_gpu_cmake() {
/work/mxnet
ninja -v
+ # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
+ cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+ mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
}
build_ubuntu_blc() {
@@ -792,6 +854,9 @@ unittest_ubuntu_cpu_scala() {
unittest_centos7_cpu_scala() {
set -ex
+ mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
+ cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
+ cp lib/libmklml_intel.so /work/mxnet/3rdparty/mkldnn/build/install/lib/libmklml_intel.so
cd /work/mxnet
make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
make scalaunittest USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
@@ -861,6 +926,7 @@ unittest_ubuntu_cpu_julia06() {
# FIXME
export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so'
+ # export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
# use the prebuilt binary from $MXNET_HOME/lib
julia -e 'Pkg.build("MXNet")'
@@ -944,6 +1010,9 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
integrationtest_ubuntu_gpu_scala() {
set -ex
+ mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
+ cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
+ cp lib/libmklml_intel.so /work/mxnet/3rdparty/mkldnn/build/install/lib/libmklml_intel.so
make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
}
@@ -1138,7 +1207,7 @@ nightly_straight_dope_python3_multi_gpu_tests() {
nightly_tutorial_test_ubuntu_python3_gpu() {
set -ex
cd /work/mxnet/docs
- export BUILD_VER=tutorial
+ export BUILD_VER=tutorial
export MXNET_DOCS_BUILD_MXNET=0
make html
export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
@@ -1168,7 +1237,7 @@ deploy_docs() {
set -ex
pushd .
- make docs
+ make docs SPHINXOPTS=-W
popd
}
@@ -1187,6 +1256,7 @@ deploy_jl_docs() {
# FIXME
export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so'
+ # export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
# use the prebuilt binary from $MXNET_HOME/lib
julia -e 'Pkg.build("MXNet")'
@@ -1224,5 +1294,3 @@ EOF
declare -F | cut -d' ' -f3
echo
fi
-
-
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 74bde1eee211..94cc81a444a6 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,22 +23,22 @@
utils = load('ci/Jenkinsfile_utils.groovy')
// mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
// Python wheels
mx_pip = 'build/*.whl'
// for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
// mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
-mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
+mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
// mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
-mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
-mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*'
-mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*'
+mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
+mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
+mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
// Python unittest for CPU
// Python 2
diff --git a/ci/jenkins/Jenkinsfile_edge b/ci/jenkins/Jenkinsfile_edge
index 275a0c96de94..c101ba102386 100644
--- a/ci/jenkins/Jenkinsfile_edge
+++ b/ci/jenkins/Jenkinsfile_edge
@@ -34,7 +34,7 @@ utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_
utils.main_wrapper(
core_logic: {
utils.parallel_stage('Build', [
-// custom_steps.compile_armv8_jetson_gpu(),
+ custom_steps.compile_armv8_jetson_gpu(),
custom_steps.compile_armv7_cpu(),
custom_steps.compile_armv6_cpu(),
custom_steps.compile_armv8_cpu(),
diff --git a/cpp-package/example/README.md b/cpp-package/example/README.md
index c7223e94c920..c2329330b6be 100644
--- a/cpp-package/example/README.md
+++ b/cpp-package/example/README.md
@@ -2,7 +2,8 @@
## Building C++ examples
-The examples are built while building the MXNet library and cpp-package from source . However, they can be built manually as follows
+The examples in this folder demonstrate the **training** workflow. The **inference workflow** related examples can be found in the [inference](inference) folder.
+The examples in this folder are built while building the MXNet library and cpp-package from source . However, they can be built manually as follows
From cpp-package/examples directory
@@ -18,7 +19,7 @@ The examples that are built to be run on GPU may not work on the non-GPU machine
The makefile will also download the necessary data files and store in a data folder. (The download will take couple of minutes, but will be done only once on a fresh installation.)
-## Examples
+## Examples demonstrating training workflow
This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS. For example `export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/home/ubuntu/incubator-mxnet/lib` on ubuntu using gpu.
diff --git a/cpp-package/example/inference/Makefile b/cpp-package/example/inference/Makefile
new file mode 100644
index 000000000000..5efe6cfb68e5
--- /dev/null
+++ b/cpp-package/example/inference/Makefile
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+CPPEX_SRC = $(wildcard *.cpp)
+CPPEX_EXE = $(patsubst %.cpp, %, $(CPPEX_SRC))
+OPENCV_CFLAGS=`pkg-config --cflags opencv`
+OPENCV_LDFLAGS=`pkg-config --libs opencv`
+
+CXX=g++
+
+
+CFLAGS=$(COMMFLAGS) -I../../../3rdparty/tvm/nnvm/include -I../../../3rdparty/dmlc-core/include -I ../../include -I ../../../include -Wall -O3 -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas
+CPPEX_EXTRA_LDFLAGS := -L../../../lib -lmxnet $(OPENCV_LDFLAGS)
+
+all: $(CPPEX_EXE)
+
+debug: CPPEX_CFLAGS += -DDEBUG -g
+debug: all
+
+
+$(CPPEX_EXE):% : %.cpp
+ $(CXX) -std=c++0x $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS)
+
+clean:
+ rm -f $(CPPEX_EXE)
diff --git a/cpp-package/example/inference/README.md b/cpp-package/example/inference/README.md
new file mode 100644
index 000000000000..79831b40b6bd
--- /dev/null
+++ b/cpp-package/example/inference/README.md
@@ -0,0 +1,41 @@
+# MXNet C++ Package Inference Workflow Examples
+
+## Building C++ Inference examples
+
+The examples in this folder demonstrate the **inference** workflow.
+To build examples use following commands:
+
+- Release: **make all**
+- Debug: **make debug all**
+
+
+## Examples demonstrating inference workflow
+
+This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS.
+
+### [inception_inference.cpp](inception_inference.cpp)
+
+This example demonstrates image classification workflow with pre-trained models using MXNet C++ API. The command line parameters the example can accept are as shown below:
+
+```
+./inception_inference --help
+Usage:
+inception_inference --symbol <model symbol file in json format>
+                    --params <model params file>
+                    --image <path to the image to be classified>
+```
+
+The script [unit_test_inception_inference.sh](unit_test_inception_inference.sh) downloads the pre-trained **Inception** model and a test image. The users can invoke this script as follows:
+
+```
+./unit_test_inception_inference.sh
+```
diff --git a/cpp-package/example/inference/inception_inference.cpp b/cpp-package/example/inference/inception_inference.cpp
new file mode 100644
index 000000000000..7005e745b2f4
--- /dev/null
+++ b/cpp-package/example/inference/inception_inference.cpp
@@ -0,0 +1,446 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * This example demonstrates image classification workflow with pre-trained models using MXNet C++ API.
+ * The example performs following tasks.
+ * 1. Load the pre-trained model.
+ * 2. Load the parameters of pre-trained model.
+ * 3. Load the image to be classified in to NDArray.
+ * 4. Normalize the image using the mean of images that were used for training.
+ * 5. Run the forward pass and predict the input image.
+ */
+
+#include
+#include
+#include
+#include