From 1e17a516c121b49b7dbaa458a83f050dc8984396 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Tue, 18 Sep 2018 11:53:27 -0700
Subject: [PATCH 01/93] mkldnn is default makefile and explicitly turn off for
 builds

---
 Makefile                       | 3 +++
 ci/docker/runtime_functions.sh | 7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/Makefile b/Makefile
index 620679a44f80..bfe3e97d6f13 100644
--- a/Makefile
+++ b/Makefile
@@ -60,6 +60,9 @@ endif
 # use customized config file
 include $(config)
 
+ifneq ($(USE_MKLDNN), 0)
+    USE_MKLDNN = 1
+
 ifeq ($(USE_MKL2017), 1)
 $(warning "USE_MKL2017 is deprecated. We will switch to USE_MKLDNN.")
 	USE_MKLDNN=1
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 43006f23974d..55b8bfd6ac14 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -251,6 +251,7 @@ build_centos7_cpu() {
         USE_LAPACK_PATH=/usr/lib64/liblapack.so \
         USE_BLAS=openblas \
         USE_DIST_KVSTORE=1 \
+        USE_MKLDNN=0 \
         -j$(nproc)
 }
 
@@ -303,6 +304,7 @@ build_centos7_gpu() {
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=1                               \
+        USE_MKLDNN=0                              \
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         -j$(nproc)
@@ -318,6 +320,7 @@ build_ubuntu_cpu_openblas() {
     export CXX="ccache g++"
     make \
         DEV=1                         \
+        USE_MKLDNN=0                  \
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
@@ -385,6 +388,7 @@ build_ubuntu_cpu_clang39() {
         USE_BLAS=openblas             \
         USE_OPENMP=0                  \
         USE_DIST_KVSTORE=1            \
+        USE_MKLDNN=0                  \
         -j$(nproc)
 }
 
@@ -402,6 +406,7 @@ build_ubuntu_cpu_clang60() {
         USE_BLAS=openblas             \
         USE_OPENMP=1                  \
         USE_DIST_KVSTORE=1            \
+        USE_MKLDNN=0                  \
         -j$(nproc)
 }
 
@@ -534,6 +539,7 @@ build_ubuntu_gpu_tensorrt() {
         USE_TENSORRT=1                                       \
         USE_JEMALLOC=0                                       \
         USE_GPERFTOOLS=0                                     \
+        USE_MKLDNN=0                                         \
         ONNX_NAMESPACE=onnx                                  \
         CUDA_ARCH="-gencode arch=compute_70,code=compute_70" \
         -j$(nproc)
@@ -588,6 +594,7 @@ build_ubuntu_gpu_cuda91_cudnn7() {
         USE_CPP_PACKAGE=1                         \
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
+        USE_MKLDNN=0                              \
         -j$(nproc)
 }
 

From abbc3ad526e5ae09334a29d72c87e597a1554993 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Tue, 18 Sep 2018 13:44:24 -0700
Subject: [PATCH 02/93] add endif

---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index bfe3e97d6f13..18543fdc917f 100644
--- a/Makefile
+++ b/Makefile
@@ -62,6 +62,7 @@ include $(config)
 
 ifneq ($(USE_MKLDNN), 0)
     USE_MKLDNN = 1
+endif
 
 ifeq ($(USE_MKL2017), 1)
 $(warning "USE_MKL2017 is deprecated. We will switch to USE_MKLDNN.")

From 92b91f8a19069255d26fa841e06a9ffb2739b949 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 29 Oct 2018 18:06:55 -0700
Subject: [PATCH 03/93] retrigger


From ce5336cb94a912fe6693c3a7c8603fbb406c5c46 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Wed, 31 Oct 2018 13:28:44 -0700
Subject: [PATCH 04/93] retrigger


From b8a0203a1f296c12cf9659c015f829d817f15ec2 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 8 Nov 2018 17:58:33 -0800
Subject: [PATCH 05/93] build mkldnn as static lib

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5cea683e8f56..abe52ca4d6a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -224,6 +224,7 @@ if(ENABLE_TESTCOVERAGE)
 endif()
 
 if(USE_MKLDNN)
+  SET(MKLDNN_LIBRARY_TYPE STATIC)
   include(cmake/MklDnn.cmake)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(NOT MSVC)

From bc6c482132856e0acd3060ed17fdad04d2f418c6 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 8 Nov 2018 18:19:33 -0800
Subject: [PATCH 06/93] update makefile to statically build mkldnn

---
 mkldnn.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkldnn.mk b/mkldnn.mk
index d79bbe7d2a0e..4364f14ba0bc 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -37,7 +37,7 @@ mkldnn_build: $(MKLDNN_LIBFILE)
 $(MKLDNN_LIBFILE):
 	mkdir -p $(MKLDNNROOT)
 	cd $(MKLDNN_SUBMODDIR) && rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. && cp -a external/*/* $(MKLDNNROOT)/.
-	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
+	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF -DMKLDNN_LIBRARY_TYPE=STATIC
 	$(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1
 	$(MAKE) -C $(MKLDNN_BUILDDIR) install
 	mkdir -p $(MXNET_LIBDIR)

From 15a41fc816fe68ecd8d6e07154f5775003458dde Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 8 Nov 2018 18:43:53 -0800
Subject: [PATCH 07/93] build static mkldnn

---
 Makefile  | 3 +--
 mkldnn.mk | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index b12311736647..f3d3360096f8 100644
--- a/Makefile
+++ b/Makefile
@@ -131,8 +131,7 @@ ifeq ($(USE_MKLDNN), 1)
 		CFLAGS += -I$(MKLROOT)/include
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	CFLAGS += -I$(MKLDNNROOT)/include
-	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a.0
 endif
 
 # setup opencv
diff --git a/mkldnn.mk b/mkldnn.mk
index 4364f14ba0bc..f84d4cd743e4 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -26,7 +26,7 @@ ifeq ($(UNAME_S), Darwin)
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so.0
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a.0
 endif
 endif
 

From 42b3353ea1a968df004482cb0b188839f575286d Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 8 Nov 2018 18:49:20 -0800
Subject: [PATCH 08/93] fix static name

---
 mkldnn.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkldnn.mk b/mkldnn.mk
index f84d4cd743e4..3cb6062cd221 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -26,7 +26,7 @@ ifeq ($(UNAME_S), Darwin)
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a.0
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
 endif
 endif
 

From 5af258ae73d24a8d1103f915ee7ab0db26ff5f2e Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 8 Nov 2018 18:52:30 -0800
Subject: [PATCH 09/93] fix static name

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index f3d3360096f8..a92b02e08394 100644
--- a/Makefile
+++ b/Makefile
@@ -131,7 +131,7 @@ ifeq ($(USE_MKLDNN), 1)
 		CFLAGS += -I$(MKLROOT)/include
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a.0
+	LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
 endif
 
 # setup opencv

From 32ab9ce9649044541460a30f2d172d6622f36e27 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 8 Nov 2018 18:57:42 -0800
Subject: [PATCH 10/93] update static for mac

---
 mkldnn.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkldnn.mk b/mkldnn.mk
index 3cb6062cd221..cd5bd8556dfe 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -22,7 +22,7 @@ ifeq ($(USE_MKLDNN), 1)
 ifeq ($(UNAME_S), Darwin)
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.dylib
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml.dylib
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.0.dylib
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so

From e2422d63cd450274ba3236bc6e5e5069ece82967 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 12 Nov 2018 14:03:33 -0800
Subject: [PATCH 11/93] rename mkldnn dep in ci

---
 Jenkinsfile                    | 4 ++--
 ci/docker/runtime_functions.sh | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 3f72843596e7..600e8448c13b 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -32,8 +32,8 @@ mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3r
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.a'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.a, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/lenet, build/cpp-package/example/alexnet, build/cpp-package/example/googlenet, build/cpp-package/example/lenet_with_mxdataiter, build/cpp-package/example/resnet, build/cpp-package/example/mlp, build/cpp-package/example/mlp_cpu, build/cpp-package/example/mlp_gpu, build/cpp-package/example/test_score, build/cpp-package/example/test_optimizer'
 mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/mlp_cpu'
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 27c93b5049ef..f571a54e4c29 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -629,8 +629,8 @@ build_ubuntu_gpu_cmake_mkldnn() {
 
     ninja -v
     # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
-    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
-    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
+#    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+#    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {

From 372f6978283903092d65101f0efb6dead2fd37b1 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 12 Nov 2018 15:09:20 -0800
Subject: [PATCH 12/93] remove moving mkldnn dynamic lib

---
 Jenkinsfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 600e8448c13b..655135f5ad59 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -32,8 +32,8 @@ mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3r
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.a'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.a, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/lenet, build/cpp-package/example/alexnet, build/cpp-package/example/googlenet, build/cpp-package/example/lenet_with_mxdataiter, build/cpp-package/example/resnet, build/cpp-package/example/mlp, build/cpp-package/example/mlp_cpu, build/cpp-package/example/mlp_gpu, build/cpp-package/example/test_score, build/cpp-package/example/test_optimizer'
 mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/mlp_cpu'

From 67e4dff4c718401d1552d8bfb54f83654ea4aaf7 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 12 Nov 2018 16:06:03 -0800
Subject: [PATCH 13/93] retrigger


From 150b324572fd445bfcbf6863c60e727b4f42bd4f Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 12 Nov 2018 16:50:50 -0800
Subject: [PATCH 14/93] remove commented code

---
 ci/docker/runtime_functions.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index f571a54e4c29..7ac8b0297990 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -628,9 +628,6 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
-    # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
-#    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
-#    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {

From 89b11c6e8094ccaba3a441a3e433c0d03307992b Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Tue, 13 Nov 2018 13:25:49 -0800
Subject: [PATCH 15/93] retrigger


From 40fd0ac94e00d91b05f041da1c2ddd9236b7f7a4 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Wed, 14 Nov 2018 12:25:48 -0800
Subject: [PATCH 16/93] remove mkldnn dynamic for unittest

---
 tests/cpp/unittest.mk | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk
index 746ee2f096f1..665ce6982874 100644
--- a/tests/cpp/unittest.mk
+++ b/tests/cpp/unittest.mk
@@ -41,22 +41,22 @@ gtest-all.o : $(GTEST_SRCS_)
 gtest.a : gtest-all.o
 	$(AR) $(ARFLAGS) $@ $^
 
-build/tests/cpp/%.o : tests/cpp/%.cc | mkldnn
+build/tests/cpp/%.o : tests/cpp/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc | mkldnn
+build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/operator/$* $< > build/tests/cpp/operator/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/operator/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc | mkldnn
+build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/storage/$* $< > build/tests/cpp/storage/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/storage/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc | mkldnn
+build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^)

From cb095c6594148a650c4c689592d71c1af251580b Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Wed, 21 Nov 2018 14:25:02 -0800
Subject: [PATCH 17/93] retrigger


From c08f6face171389b51a7ac55cb7a930a8623b503 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Wed, 21 Nov 2018 16:35:19 -0800
Subject: [PATCH 18/93] retrigger


From 0302290659bb7da3fbd46e4ac85f0c266b3aa014 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Thu, 22 Nov 2018 11:52:12 -0800
Subject: [PATCH 19/93] force static for mkldnn lib

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ba76569c7101..6b1c858326fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -225,7 +225,7 @@ endif()
 
 if(USE_MKLDNN)
   include(cmake/DownloadMKLML.cmake)
-  SET(MKLDNN_LIBRARY_TYPE STATIC)
+  set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE INTERNAL "" FORCE)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(NOT MSVC)
     set(ARCH_OPT_FLAGS "-mtune=generic")

From bf78666c88e27624fa4d971afbdcf6ca5175fc8f Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Thu, 22 Nov 2018 11:55:27 -0800
Subject: [PATCH 20/93] turn off mkldnn on arm builds

---
 ci/docker/runtime_functions.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 45eb09f59d10..6b5397c7ff69 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -139,6 +139,7 @@ build_armv6() {
         -DCMAKE_BUILD_TYPE=Release \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_LAPACK=OFF \
+        -DUSE_MKLDNN=0FF \
         -DBUILD_CPP_EXAMPLES=OFF \
         -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
@@ -170,6 +171,7 @@ build_armv7() {
         -DCMAKE_BUILD_TYPE=Release \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_LAPACK=OFF \
+        -DUSE_MKLDNN=0FF \
         -DBUILD_CPP_EXAMPLES=OFF \
         -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
@@ -190,6 +192,7 @@ build_armv8() {
         -DUSE_LAPACK=OFF\
         -DUSE_SIGNAL_HANDLER=ON\
         -DCMAKE_BUILD_TYPE=Release\
+        -DUSE_MKLDNN=0FF \
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
     ninja -v
@@ -215,6 +218,7 @@ build_android_armv7() {
         -DUSE_OPENCV=OFF\
         -DUSE_OPENMP=OFF\
         -DUSE_SIGNAL_HANDLER=ON\
+        -DUSE_MKLDNN=0FF \
         -DCMAKE_BUILD_TYPE=RelWithDebInfo\
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
@@ -234,6 +238,7 @@ build_android_armv8() {
         -DUSE_SIGNAL_HANDLER=ON\
         -DCMAKE_BUILD_TYPE=RelWithDebInfo\
         -DUSE_MKL_IF_AVAILABLE=OFF\
+        -DUSE_MKLDNN=0FF \
         -G Ninja /work/mxnet
     ninja -v
 }

From d103ec85e84e3a4eb0538b88dcb36788289777a0 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Tue, 27 Nov 2018 17:38:03 -0800
Subject: [PATCH 21/93] remove dynamic mkldnn bind

---
 ci/jenkins/Jenkins_steps.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index f48a26737308..e12b4ed04110 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -34,7 +34,7 @@ mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3r
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so
 mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*'

From 4b07dcf120419cd90b176a0a0a6cc0badee865ef Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 12:07:50 -0800
Subject: [PATCH 22/93] update jenkins to use only mkldnn

---
 ci/docker/runtime_functions.sh | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 4f2f0f0d8994..babb73158b1f 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -140,7 +140,6 @@ build_armv6() {
         -DCMAKE_BUILD_TYPE=Release \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_LAPACK=OFF \
-        -DUSE_MKLDNN=0FF \
         -DBUILD_CPP_EXAMPLES=OFF \
         -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
@@ -172,7 +171,6 @@ build_armv7() {
         -DCMAKE_BUILD_TYPE=Release \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_LAPACK=OFF \
-        -DUSE_MKLDNN=0FF \
         -DBUILD_CPP_EXAMPLES=OFF \
         -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
@@ -219,7 +217,6 @@ build_android_armv7() {
         -DUSE_OPENCV=OFF\
         -DUSE_OPENMP=OFF\
         -DUSE_SIGNAL_HANDLER=ON\
-        -DUSE_MKLDNN=0FF \
         -DCMAKE_BUILD_TYPE=RelWithDebInfo\
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
@@ -239,7 +236,6 @@ build_android_armv8() {
         -DUSE_SIGNAL_HANDLER=ON\
         -DCMAKE_BUILD_TYPE=RelWithDebInfo\
         -DUSE_MKL_IF_AVAILABLE=OFF\
-        -DUSE_MKLDNN=0FF \
         -G Ninja /work/mxnet
     ninja -v
 }
@@ -257,7 +253,6 @@ build_centos7_cpu() {
         USE_LAPACK_PATH=/usr/lib64/liblapack.so \
         USE_BLAS=openblas \
         USE_DIST_KVSTORE=1 \
-        USE_MKLDNN=0 \
         -j$(nproc)
 }
 
@@ -291,7 +286,6 @@ build_centos7_mkldnn() {
         ENABLE_TESTCOVERAGE=1 \
         USE_LAPACK=1 \
         USE_LAPACK_PATH=/usr/lib64/liblapack.so \
-        USE_MKLDNN=1 \
         USE_BLAS=openblas \
         -j$(nproc)
 }
@@ -310,7 +304,6 @@ build_centos7_gpu() {
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=1                               \
-        USE_MKLDNN=0                              \
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         -j$(nproc)
@@ -326,7 +319,6 @@ build_ubuntu_cpu_openblas() {
     export CXX="ccache g++"
     make \
         DEV=1                         \
-        USE_MKLDNN=0                  \
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
@@ -394,7 +386,6 @@ build_ubuntu_cpu_clang39() {
         USE_BLAS=openblas             \
         USE_OPENMP=0                  \
         USE_DIST_KVSTORE=1            \
-        USE_MKLDNN=0                  \
         -j$(nproc)
 }
 
@@ -412,7 +403,6 @@ build_ubuntu_cpu_clang60() {
         USE_BLAS=openblas             \
         USE_OPENMP=1                  \
         USE_DIST_KVSTORE=1            \
-        USE_MKLDNN=0                  \
         -j$(nproc)
 }
 
@@ -454,7 +444,6 @@ build_ubuntu_cpu_clang39_mkldnn() {
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
-        USE_MKLDNN=1                  \
         USE_OPENMP=0                  \
         -j$(nproc)
 }
@@ -471,7 +460,6 @@ build_ubuntu_cpu_clang60_mkldnn() {
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
-        USE_MKLDNN=1                  \
         USE_OPENMP=1                  \
         -j$(nproc)
 }
@@ -486,7 +474,6 @@ build_ubuntu_cpu_mkldnn() {
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
-        USE_MKLDNN=1                  \
         -j$(nproc)
 }
 
@@ -545,7 +532,6 @@ build_ubuntu_gpu_tensorrt() {
         USE_TENSORRT=1                                       \
         USE_JEMALLOC=0                                       \
         USE_GPERFTOOLS=0                                     \
-        USE_MKLDNN=0                                         \
         ONNX_NAMESPACE=onnx                                  \
         CUDA_ARCH="-gencode arch=compute_70,code=compute_70" \
         -j$(nproc)
@@ -561,7 +547,6 @@ build_ubuntu_gpu_mkldnn() {
         ENABLE_TESTCOVERAGE=1                     \
         USE_CPP_PACKAGE=1                         \
         USE_BLAS=openblas                         \
-        USE_MKLDNN=1                              \
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=1                               \
@@ -578,7 +563,6 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
         DEV=1                                     \
         ENABLE_TESTCOVERAGE=1                     \
         USE_BLAS=openblas                         \
-        USE_MKLDNN=1                              \
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=0                               \
@@ -600,7 +584,6 @@ build_ubuntu_gpu_cuda91_cudnn7() {
         USE_CPP_PACKAGE=1                         \
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
-        USE_MKLDNN=0                              \
         -j$(nproc)
 }
 
@@ -656,7 +639,6 @@ build_ubuntu_gpu_cmake() {
         -DUSE_CUDA=1                            \
         -DUSE_CUDNN=1                           \
         -DUSE_MKLML_MKL=0                       \
-        -DUSE_MKLDNN=0                          \
         -DUSE_DIST_KVSTORE=1                    \
         -DCMAKE_BUILD_TYPE=Release              \
         -DCUDA_ARCH_NAME=Manual                 \

From 6536cda6a87ab2ef804c0e320dfcbe2a8e56eb8d Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 12:30:25 -0800
Subject: [PATCH 23/93] remove last flag

---
 ci/docker/runtime_functions.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index babb73158b1f..081a1475b5c4 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -191,7 +191,6 @@ build_armv8() {
         -DUSE_LAPACK=OFF\
         -DUSE_SIGNAL_HANDLER=ON\
         -DCMAKE_BUILD_TYPE=Release\
-        -DUSE_MKLDNN=0FF \
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
     ninja -v

From de4ff3112739012fac2fcc617665699b36d4b82b Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 12:41:06 -0800
Subject: [PATCH 24/93] turn mkldnn on by default on mac

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b8bbd2e0272..f09a9c29e9d8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,7 +20,7 @@ mxnet_option(USE_F16C             "Build with x86 F16C instruction support" ON)
 mxnet_option(USE_LAPACK           "Build with lapack support" ON)
 mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
 mxnet_option(USE_MKLML_MKL        "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE))
-mxnet_option(USE_MKLDNN           "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE))
+mxnet_option(USE_MKLDNN           "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE)
 mxnet_option(USE_OPERATOR_TUNING  "Enable auto-tuning of operators" ON IF NOT MSVC)
 mxnet_option(USE_GPERFTOOLS       "Build with GPerfTools support (if found)" ON)
 mxnet_option(USE_JEMALLOC         "Build with Jemalloc support"   ON)

From 0e5a36286220d4917b0daae8c7431ea290aadf58 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 15:11:25 -0800
Subject: [PATCH 25/93] move mkldnn files for GPU MKLDNN build

---
 Jenkinsfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 015ca81bad76..a05cfa98fc6a 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -372,7 +372,7 @@ core_logic: {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.init_git()
             utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake', false)
-            utils.pack_lib('cmake_gpu', mx_cmake_lib, true)
+            utils.pack_lib('cmake_gpu', mx_cmake_mkldnn_lib, true)
           }
         }
       }
@@ -772,7 +772,7 @@ core_logic: {
       node(NODE_LINUX_GPU) {
         ws('workspace/ut-cpp-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cmake_gpu', mx_cmake_lib, true)
+            utils.unpack_and_init('cmake_gpu', mx_cmake_mkldnn_lib, true)
             utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true)
             utils.publish_test_coverage()
           }

From 6eadfae419422fb631965958041ecec14f2e514e Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 17:10:28 -0800
Subject: [PATCH 26/93] copy lib mxnet in gpu build

---
 ci/docker/runtime_functions.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 081a1475b5c4..e7f2f62d120c 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -646,6 +646,9 @@ build_ubuntu_gpu_cmake() {
         /work/mxnet
 
     ninja -v
+    # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
+    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_blc() {

From 0d0f40723388211b50357f97abe1458fc5887649 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 17:36:53 -0800
Subject: [PATCH 27/93] only link windows

---
 CMakeLists.txt | 3 ++-
 Makefile       | 8 +++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 205262b85125..5f35cd8934bc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -225,11 +225,12 @@ endif()
 
 if(USE_MKLDNN)
   include(cmake/DownloadMKLML.cmake)
-  set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE INTERNAL "" FORCE)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(NOT MSVC)
+    set(MKLDNN_LIBRARY_TYPE "STATIC")
     set(ARCH_OPT_FLAGS "-mtune=generic")
   else()
+    set(MKLDNN_LIBRARY_TYPE "SHARED")
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /EHsc /Gy")
   endif()
diff --git a/Makefile b/Makefile
index 30c21636a6ea..dd05e2157822 100644
--- a/Makefile
+++ b/Makefile
@@ -131,7 +131,13 @@ ifeq ($(USE_MKLDNN), 1)
 		CFLAGS += -I$(MKLROOT)/include
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
+
+	ifneq ($(UNAME_S), Windows)
+	    LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
+    else
+        CFLAGS += -I$(MKLDNNROOT)/include
+        LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+    endif
 endif
 
 # setup opencv

From b300b88bea2fef4a4a7d2030c3ff2b3384886c87 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 17:43:36 -0800
Subject: [PATCH 28/93] add mkldnn.mk

---
 mkldnn.mk | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mkldnn.mk b/mkldnn.mk
index cd5bd8556dfe..c9ca55c975ae 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -19,10 +19,16 @@ ifeq ($(USE_MKLDNN), 1)
 	MKLDNN_SUBMODDIR = $(ROOTDIR)/3rdparty/mkldnn
 	MKLDNN_BUILDDIR = $(MKLDNN_SUBMODDIR)/build
 	MXNET_LIBDIR = $(ROOTDIR)/lib
+	MKLDNN_LIBRARY_TYPE=STATIC
 ifeq ($(UNAME_S), Darwin)
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.dylib
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml.dylib
 	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
+else ifeq ($(UNAME_S), Windows)
+	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
+    MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
+    MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so
+    MKLDNN_LIBRARY_TYPE=SHARED
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
@@ -37,7 +43,7 @@ mkldnn_build: $(MKLDNN_LIBFILE)
 $(MKLDNN_LIBFILE):
 	mkdir -p $(MKLDNNROOT)
 	cd $(MKLDNN_SUBMODDIR) && rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. && cp -a external/*/* $(MKLDNNROOT)/.
-	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF -DMKLDNN_LIBRARY_TYPE=STATIC
+	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF -DMKLDNN_LIBRARY_TYPE=$(MKLDNN_LIBRARY_TYPE)
 	$(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1
 	$(MAKE) -C $(MKLDNN_BUILDDIR) install
 	mkdir -p $(MXNET_LIBDIR)

From b336ef029db0a730819c5b1e20409c0f01a2d669 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 18:23:59 -0800
Subject: [PATCH 29/93] try force linking

---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5f35cd8934bc..161705643194 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,10 +227,9 @@ if(USE_MKLDNN)
   include(cmake/DownloadMKLML.cmake)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(NOT MSVC)
-    set(MKLDNN_LIBRARY_TYPE "STATIC")
+    set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE INTERNAL "" FORCE)
     set(ARCH_OPT_FLAGS "-mtune=generic")
   else()
-    set(MKLDNN_LIBRARY_TYPE "SHARED")
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /EHsc /Gy")
   endif()

From b9be8236739801ac08033bc6fe5d58242e12081b Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 18:56:49 -0800
Subject: [PATCH 30/93] retrigger


From fdcee0d4bec144cf4b80acec3a24a79286b4d87c Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 19:35:24 -0800
Subject: [PATCH 31/93] retrigger


From 46ee0bd935dc51c7e3e018db263f3b5f642d141e Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 20:26:16 -0800
Subject: [PATCH 32/93] remove mkldnn dynamic check

---
 tests/python/mkl/test_mkldnn.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index c6c0a0832f1f..d9d3abfc3ced 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -27,7 +27,6 @@
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet.test_utils import *
-import test_mkldnn_install as install
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.append(os.path.join(curr_path, '../unittest/'))
 from common import with_seed
@@ -441,7 +440,4 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
     custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom')
     exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])})
     exec1.forward()[0].wait_to_read()
-
-
-if __name__ == '__main__':
-    install.test_mkldnn_install()
+    

From 7772cdd34a773ef27ce792c7970659ea337e7e1f Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 21:05:00 -0800
Subject: [PATCH 33/93] use ifndef

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index e9d1f7942a70..3c7ca0c3f25c 100644
--- a/Makefile
+++ b/Makefile
@@ -60,7 +60,7 @@ endif
 # use customized config file
 include $(config)
 
-ifneq ($(USE_MKLDNN), 0)
+ifndef $(USE_MKLDNN)
     USE_MKLDNN = 1
 endif
 

From 45e8cd89ad11fde9f56d33a04534b99841a4f9d1 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 29 Nov 2018 21:27:44 -0800
Subject: [PATCH 34/93] remove test mkldnn install

---
 tests/python/mkl/test_mkldnn_install.py | 56 -------------------------
 1 file changed, 56 deletions(-)
 delete mode 100644 tests/python/mkl/test_mkldnn_install.py

diff --git a/tests/python/mkl/test_mkldnn_install.py b/tests/python/mkl/test_mkldnn_install.py
deleted file mode 100644
index c2f26df72f2e..000000000000
--- a/tests/python/mkl/test_mkldnn_install.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-MKL-DNN related test cases
-"""
-
-import sys
-import os
-import logging
-
-
-def test_mkldnn_install():
-    """
-    This test will verify that MXNet is built/installed correctly when
-    compiled with Intel MKL-DNN library. The method will try to import
-    the mxnet module and see if the mkldnn library is mapped to this
-    process's address space.
-    """
-    logging.basicConfig(level=logging.INFO)
-
-    if not sys.platform.startswith('linux'):
-        logging.info("Bypass mkldnn install test for non-Linux OS")
-        return
-
-    try:
-        #pylint: disable=unused-variable
-        import mxnet as mx
-    except (ImportError, OSError) as e:
-        assert 0, "Import mxnet error: %s. Please double check your build/" \
-            "install steps or environment variable settings" % str(e)
-
-    pid = os.getpid()
-    rc = os.system("cat /proc/" + str(pid) +
-                   "/maps | grep libmkldnn > /dev/null")
-
-    if rc == 0:
-        logging.info("MXNet is built/installed correctly with MKL-DNN")
-    else:
-        assert 0, "MXNet is built/installed incorrectly with MKL-DNN, please " \
-            "double check your build/install steps or environment " \
-            "variable settings"

From ed31e12cb6b8e9f13a7d957589705a1c4f4aea81 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Fri, 30 Nov 2018 11:07:23 -0800
Subject: [PATCH 35/93] fix spacing

---
 Makefile  | 8 ++++----
 mkldnn.mk | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index dd05e2157822..88a5459f0b2a 100644
--- a/Makefile
+++ b/Makefile
@@ -133,10 +133,10 @@ ifeq ($(USE_MKLDNN), 1)
 	endif
 
 	ifneq ($(UNAME_S), Windows)
-	    LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
-    else
-        CFLAGS += -I$(MKLDNNROOT)/include
-        LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+		LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
+	else
+		CFLAGS += -I$(MKLDNNROOT)/include
+		LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
     endif
 endif
 
diff --git a/mkldnn.mk b/mkldnn.mk
index c9ca55c975ae..5af3e9b1d741 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -26,9 +26,9 @@ ifeq ($(UNAME_S), Darwin)
 	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
 else ifeq ($(UNAME_S), Windows)
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
-    MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-    MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so
-    MKLDNN_LIBRARY_TYPE=SHARED
+	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so
+	MKLDNN_LIBRARY_TYPE=SHARED
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so

From 6a029493c4df9d473b1f56a3b418760f709f0cb9 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Fri, 30 Nov 2018 13:23:04 -0800
Subject: [PATCH 36/93] fix index

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 88a5459f0b2a..2cf3526abc24 100644
--- a/Makefile
+++ b/Makefile
@@ -137,7 +137,7 @@ ifeq ($(USE_MKLDNN), 1)
 	else
 		CFLAGS += -I$(MKLDNNROOT)/include
 		LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
-    endif
+	endif
 endif
 
 # setup opencv

From 3cc21f37aaa96c11ea249d69182f95669536f8f9 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Fri, 30 Nov 2018 13:29:04 -0800
Subject: [PATCH 37/93] remove cp of mkldnn since statically linked

---
 ci/docker/runtime_functions.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 0036ce798e24..94143ed0521a 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -643,9 +643,6 @@ build_ubuntu_gpu_cmake() {
         /work/mxnet
 
     ninja -v
-    # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
-    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
-    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_blc() {

From 0b894a0e8b896924cbc8bcaf203d944abbcaf345 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Fri, 30 Nov 2018 14:22:34 -0800
Subject: [PATCH 38/93] add libmkldnn.a to list of files to pack

---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index f329e48dc761..63bf6d25fc42 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -53,7 +53,7 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdpart
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmkldnn.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.

From 6b4db54d48aada5168b772d66a84fe13e4b2094b Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Fri, 30 Nov 2018 14:25:57 -0800
Subject: [PATCH 39/93] include mkl_ml

---
 Jenkinsfile                     | 2 +-
 ci/jenkins/Jenkins_steps.groovy | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 63bf6d25fc42..70a899b6112e 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -47,7 +47,7 @@
 
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 
 // Python wheels
 mx_pip = 'build/*.whl'
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index e12b4ed04110..e9456f4c05bc 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,13 +23,13 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 
 // Python wheels
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmkldnn.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.

From ccf9855651605f7bc6a3eed705e690f4b0c32df3 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 10 Dec 2018 15:49:16 -0800
Subject: [PATCH 40/93] add mkldnn to pack

---
 ci/jenkins/Jenkins_steps.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index f48a26737308..8e50ca5b81c7 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -29,7 +29,7 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdpart
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmklml_intel.so, lib/libmkldnn.so.0'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.

From 225b446cdcf97642740fddcbc665c8037648546f Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 10 Dec 2018 16:01:28 -0800
Subject: [PATCH 41/93] add libiomp to ci pack

---
 ci/jenkins/Jenkins_steps.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 8e50ca5b81c7..f2026cc0e74e 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -29,7 +29,7 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdpart
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmklml_intel.so, lib/libmkldnn.so.0'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.

From 54231a9fb8c6909a0750947b3db0a4093001ca75 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 10 Dec 2018 16:54:03 -0800
Subject: [PATCH 42/93] move static libs

---
 ci/jenkins/Jenkins_steps.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index f2026cc0e74e..3ff0f0865481 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,7 +23,7 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a. lib/libmklml_intel.so, lib/libmkldnn.so.0 lib/libiomp5.so'
 
 // Python wheels
 mx_pip = 'build/*.whl'

From 2d2a0f9686b99e56a6a89da2032fe404de728d2d Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 10 Dec 2018 17:31:48 -0800
Subject: [PATCH 43/93] fix typo

---
 ci/jenkins/Jenkins_steps.groovy | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 3ff0f0865481..e39d1e366dfe 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,7 +23,7 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a. lib/libmklml_intel.so, lib/libmkldnn.so.0 lib/libiomp5.so'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, lib/libmklml_intel.so, lib/libmkldnn.so.0 lib/libiomp5.so'
 
 // Python wheels
 mx_pip = 'build/*.whl'
@@ -248,7 +248,7 @@ def compile_centos7_cpu_mkldnn() {
 }
 
 def compile_centos7_gpu() {
-    return ['GPU: CentOS 7': {
+    return ['vGPU: CentOS 7': {
       node(NODE_LINUX_CPU) {
         ws('workspace/build-centos7-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {

From 884d955aa57c1c0d3857736c69f195eba0eb5eaa Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 10 Dec 2018 17:55:32 -0800
Subject: [PATCH 44/93] pack mkldnn

---
 ci/jenkins/Jenkins_steps.groovy | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index e39d1e366dfe..e46b17f02032 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -37,8 +37,8 @@ mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-c
 mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
 mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
-mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*'
-mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*'
+mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0 lib/libiomp5.so'
+mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0 lib/libiomp5.so'
 
 // Python unittest for CPU
 // Python 2

From 7dfa87e5a83f4506c2dbb2c33fca069f339c0aef Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 10 Dec 2018 18:23:17 -0800
Subject: [PATCH 45/93] retrigger


From 409acd0fe52ecbe888b20f8ca3a563c65e979ae0 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 10 Dec 2018 19:38:16 -0800
Subject: [PATCH 46/93] add linux artifacts

---
 ci/jenkins/Jenkins_steps.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index e46b17f02032..2925518c454f 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -31,7 +31,7 @@ mx_pip = 'build/*.whl'
 // for scala build, need to pass extra libs when run with dist_kvstore
 mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
-mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
+mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
 mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'

From 842228787ba30dc792c3c911fe15bb02fb1c01fc Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 10 Dec 2018 21:42:42 -0800
Subject: [PATCH 47/93] move libmkldnn in gpu cmake build

---
 ci/docker/runtime_functions.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 081a1475b5c4..e7f2f62d120c 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -646,6 +646,9 @@ build_ubuntu_gpu_cmake() {
         /work/mxnet
 
     ninja -v
+    # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
+    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_blc() {

From 346a602d105599fc7467614a37ffe3cac4fbd944 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Mon, 10 Dec 2018 21:51:22 -0800
Subject: [PATCH 48/93] move libmkldnn and libiomp5 on gpu workspace

---
 ci/docker/runtime_functions.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index e7f2f62d120c..1ddec377242b 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -584,6 +584,10 @@ build_ubuntu_gpu_cuda91_cudnn7() {
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         -j$(nproc)
+    cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
+    mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
+    cp lib/libiomp5.so.0 lib/libiomp5.so.0.tmp
+    mv lib/libiomp5.so.0.tmp lib/libiomp5.so.0
 }
 
 build_ubuntu_amalgamation() {

From 20741c4b2bad3c7447dee6995ebcd8ff920bfdb2 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Mon, 10 Dec 2018 22:08:38 -0800
Subject: [PATCH 49/93] move linked files

---
 ci/docker/runtime_functions.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 1ddec377242b..8c381169e8b0 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -306,6 +306,10 @@ build_centos7_gpu() {
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         -j$(nproc)
+    cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
+    mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
+    cp lib/libiomp5.so lib/libiomp5.so.tmp
+    mv lib/libiomp5.so.tmp lib/libiomp5.so
 }
 
 build_ubuntu_cpu() {
@@ -586,8 +590,8 @@ build_ubuntu_gpu_cuda91_cudnn7() {
         -j$(nproc)
     cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
     mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
-    cp lib/libiomp5.so.0 lib/libiomp5.so.0.tmp
-    mv lib/libiomp5.so.0.tmp lib/libiomp5.so.0
+    cp lib/libiomp5.so lib/libiomp5.so.tmp
+    mv lib/libiomp5.so.tmp lib/libiomp5.so
 }
 
 build_ubuntu_amalgamation() {

From 4e6b2caa4765a2c2f58b6f156e95461f58b6c231 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Mon, 10 Dec 2018 22:45:09 -0800
Subject: [PATCH 50/93] fix typo

---
 ci/jenkins/Jenkins_steps.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 2925518c454f..06183a01c01b 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,7 +23,7 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, lib/libmklml_intel.so, lib/libmkldnn.so.0 lib/libiomp5.so'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 
 // Python wheels
 mx_pip = 'build/*.whl'

From 992a2a0c2a9ad6d0c3e1e0ece22aba84809dd002 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Mon, 10 Dec 2018 23:31:06 -0800
Subject: [PATCH 51/93] fix typo

---
 ci/jenkins/Jenkins_steps.groovy | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 06183a01c01b..c3e5cb54e450 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -37,8 +37,8 @@ mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-c
 mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
 mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
-mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0 lib/libiomp5.so'
-mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0 lib/libiomp5.so'
+mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
+mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 
 // Python unittest for CPU
 // Python 2

From 45fd008af65d2986d6ad8844125b2ab37187ff72 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Tue, 11 Dec 2018 00:11:47 -0800
Subject: [PATCH 52/93] add artifacts for tensorrt

---
 ci/jenkins/Jenkins_steps.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index c3e5cb54e450..28f757dac81c 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -36,7 +36,7 @@ mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/li
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
 mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
 mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
-mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
+mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 

From 7934cf16a190c58708e49b7161f47c7b755fce13 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Tue, 11 Dec 2018 02:51:46 -0800
Subject: [PATCH 53/93] move mkldnn lib in scala build

---
 ci/docker/runtime_functions.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 8c381169e8b0..1e0dae52d540 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -920,6 +920,7 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
 
 integrationtest_ubuntu_gpu_scala() {
     set -ex
+    cp lib/lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so.0
     make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
     make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
 }

From b40e996800f06d71bef76de1da55047109cb7024 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Tue, 11 Dec 2018 02:55:20 -0800
Subject: [PATCH 54/93] move mkldnn lib on cpu scala

---
 ci/docker/runtime_functions.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 1e0dae52d540..c766f800285d 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -768,6 +768,7 @@ unittest_ubuntu_cpu_scala() {
 
 unittest_centos7_cpu_scala() {
     set -ex
+    cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so.0
     cd /work/mxnet
     make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
     make scalaunittest USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
@@ -920,7 +921,7 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
 
 integrationtest_ubuntu_gpu_scala() {
     set -ex
-    cp lib/lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so.0
+    cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so.0
     make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
     make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
 }

From 74e86e6a1971e6d6cdaa36cd0d2a091b6cbb6628 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Tue, 11 Dec 2018 09:51:46 -0800
Subject: [PATCH 55/93] create dir for binding

---
 ci/docker/runtime_functions.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index c766f800285d..7b170e1ed87a 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -768,6 +768,7 @@ unittest_ubuntu_cpu_scala() {
 
 unittest_centos7_cpu_scala() {
     set -ex
+    mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so.0
     cd /work/mxnet
     make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
@@ -921,6 +922,7 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
 
 integrationtest_ubuntu_gpu_scala() {
     set -ex
+    mkdir /work/mxnet/3rdparty/mkldnn/build/install/lib/
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so.0
     make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
     make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1

From 5a18a8fcda6fa3724cfdcbdb42db7a8e4957d147 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Tue, 11 Dec 2018 10:07:24 -0800
Subject: [PATCH 56/93] rename libmkldnn in scala

---
 ci/docker/runtime_functions.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 7b170e1ed87a..44754a9a501b 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -769,7 +769,7 @@ unittest_ubuntu_cpu_scala() {
 unittest_centos7_cpu_scala() {
     set -ex
     mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
-    cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so.0
+    cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
     cd /work/mxnet
     make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
     make scalaunittest USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
@@ -923,7 +923,7 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
 integrationtest_ubuntu_gpu_scala() {
     set -ex
     mkdir /work/mxnet/3rdparty/mkldnn/build/install/lib/
-    cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so.0
+    cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
     make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
     make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
 }

From c0bd964b10d55e1a72b520fa484736dc224bd460 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Tue, 11 Dec 2018 10:25:34 -0800
Subject: [PATCH 57/93] move mklml dep in scala builds

---
 ci/docker/runtime_functions.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 44754a9a501b..26ee9409a368 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -769,7 +769,9 @@ unittest_ubuntu_cpu_scala() {
 unittest_centos7_cpu_scala() {
     set -ex
     mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
+    mkdir -p /work/mxnet/deps/include
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
+    cp lib/libmklml_intel.so /work/mxnet/deps/include/libmklml_intel.so
     cd /work/mxnet
     make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
     make scalaunittest USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
@@ -922,8 +924,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
 
 integrationtest_ubuntu_gpu_scala() {
     set -ex
-    mkdir /work/mxnet/3rdparty/mkldnn/build/install/lib/
+    mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
+    mkdir -p /work/mxnet/deps/include
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
+    cp lib/libmklml_intel.so /work/mxnet/deps/include/libmklml_intel.so
     make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
     make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
 }

From 2dc1683316d27f30d6eb0473a34eebe5cd92abf7 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Tue, 11 Dec 2018 10:44:40 -0800
Subject: [PATCH 58/93] move mkl to another linked folder

---
 ci/docker/runtime_functions.sh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 26ee9409a368..faa22e1c21ba 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -769,9 +769,8 @@ unittest_ubuntu_cpu_scala() {
 unittest_centos7_cpu_scala() {
     set -ex
     mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
-    mkdir -p /work/mxnet/deps/include
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
-    cp lib/libmklml_intel.so /work/mxnet/deps/include/libmklml_intel.so
+    cp lib/libmklml_intel.so /work/mxnet/3rdparty/mkldnn/build/install/lib/libmklml_intel.so
     cd /work/mxnet
     make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
     make scalaunittest USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
@@ -925,9 +924,8 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
 integrationtest_ubuntu_gpu_scala() {
     set -ex
     mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
-    mkdir -p /work/mxnet/deps/include
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
-    cp lib/libmklml_intel.so /work/mxnet/deps/include/libmklml_intel.so
+    cp lib/libmklml_intel.so /work/mxnet/3rdparty/mkldnn/build/install/lib/libmklml_intel.so
     make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
     make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
 }

From 4eb65df07b0f3c2d0ba6e2d1210fc8da836ecfc8 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Tue, 11 Dec 2018 23:00:34 -0800
Subject: [PATCH 59/93] move libmkl to another dir

---
 ci/docker/runtime_functions.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index faa22e1c21ba..2938b20a6c6b 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -770,7 +770,7 @@ unittest_centos7_cpu_scala() {
     set -ex
     mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
-    cp lib/libmklml_intel.so /work/mxnet/3rdparty/mkldnn/build/install/lib/libmklml_intel.so
+    cp lib/libmklml_intel.so /work/mxnet/lib/libmklml_intel.so
     cd /work/mxnet
     make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
     make scalaunittest USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
@@ -925,7 +925,7 @@ integrationtest_ubuntu_gpu_scala() {
     set -ex
     mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
-    cp lib/libmklml_intel.so /work/mxnet/3rdparty/mkldnn/build/install/lib/libmklml_intel.so
+    cp lib/libmklml_intel.so /work/mxnet/lib/libmklml_intel.so
     make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
     make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
 }

From ec5421dd97ad4e9582955a1fab3998cb1c2459dc Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Wed, 12 Dec 2018 00:30:25 -0800
Subject: [PATCH 60/93] add libmklml

---
 ci/docker/runtime_functions.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 2938b20a6c6b..faa22e1c21ba 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -770,7 +770,7 @@ unittest_centos7_cpu_scala() {
     set -ex
     mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
-    cp lib/libmklml_intel.so /work/mxnet/lib/libmklml_intel.so
+    cp lib/libmklml_intel.so /work/mxnet/3rdparty/mkldnn/build/install/lib/libmklml_intel.so
     cd /work/mxnet
     make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
     make scalaunittest USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
@@ -925,7 +925,7 @@ integrationtest_ubuntu_gpu_scala() {
     set -ex
     mkdir -p /work/mxnet/3rdparty/mkldnn/build/install/lib/
     cp lib/libmkldnn.so.0 /work/mxnet/3rdparty/mkldnn/build/install/lib/libmkldnn.so
-    cp lib/libmklml_intel.so /work/mxnet/lib/libmklml_intel.so
+    cp lib/libmklml_intel.so /work/mxnet/3rdparty/mkldnn/build/install/lib/libmklml_intel.so
     make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
     make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
 }

From 75dd532697f9a94bf34fc30e481602215b300330 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Wed, 12 Dec 2018 00:57:04 -0800
Subject: [PATCH 61/93] move mkldnn

---
 ci/docker/runtime_functions.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index faa22e1c21ba..847015b2196e 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -308,6 +308,8 @@ build_centos7_gpu() {
         -j$(nproc)
     cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
     mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
+    cp lib/libmklml_intel.so lib/libmklml_intel.so.tmp
+    mv lib/libmklml_intel.so.tmp lib/libmklml_intel.so
     cp lib/libiomp5.so lib/libiomp5.so.tmp
     mv lib/libiomp5.so.tmp lib/libiomp5.so
 }

From 2af09a896641c0ea75bc2021a060d87e7c782853 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Wed, 12 Dec 2018 00:59:47 -0800
Subject: [PATCH 62/93] move mkldnn on centos

---
 ci/docker/runtime_functions.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 847015b2196e..4d21fb31ab54 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -253,6 +253,12 @@ build_centos7_cpu() {
         USE_BLAS=openblas \
         USE_DIST_KVSTORE=1 \
         -j$(nproc)
+    cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
+    mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
+    cp lib/libmklml_intel.so lib/libmklml_intel.so.tmp
+    mv lib/libmklml_intel.so.tmp lib/libmklml_intel.so
+    cp lib/libiomp5.so lib/libiomp5.so.tmp
+    mv lib/libiomp5.so.tmp lib/libiomp5.so
 }
 
 build_amzn_linux_cpu() {

From 76d842f9817ed6d3b8e994549deca2f049f81c07 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Wed, 12 Dec 2018 11:12:08 -0800
Subject: [PATCH 63/93] specify new dynamic path

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 3c7ca0c3f25c..4bd127b6c479 100644
--- a/Makefile
+++ b/Makefile
@@ -136,7 +136,7 @@ ifeq ($(USE_MKLDNN), 1)
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
 	CFLAGS += -I$(MKLDNNROOT)/include
-	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,$(ROOTDIR)/lib
 endif
 
 # setup opencv

From 103c9d1cfab0c818d3badbd05cc9000e9d0c0211 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Wed, 12 Dec 2018 17:42:03 -0800
Subject: [PATCH 64/93] retrigger


From d51043622d4ef7fcb95aff6a3e84d91ab71b48c9 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 13 Dec 2018 15:02:05 -0800
Subject: [PATCH 65/93] remove mkldnn dynamic lib

---
 ci/jenkins/Jenkins_steps.groovy | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index a187e0c9f259..77735d68da60 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,22 +23,22 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, lib/libmklml_intel.so lib/libiomp5.so'
 
 // Python wheels
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmklml_intel.so, lib/libiomp5.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
-mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
-mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
-mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
-mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so, lib/libmklml_intel.so, lib/libiomp5.so'
+mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libiomp5.so'
+mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libiomp5.so'
 
 // Python unittest for CPU
 // Python 2

From 16cca196e9e1ad92db74f4e8a01b3b052076d268 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 13 Dec 2018 15:12:15 -0800
Subject: [PATCH 66/93] remove moving mkldnn artifact

---
 ci/docker/runtime_functions.sh | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 07f2484144dc..4d46f5af8001 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -253,8 +253,6 @@ build_centos7_cpu() {
         USE_BLAS=openblas \
         USE_DIST_KVSTORE=1 \
         -j$(nproc)
-    cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
-    mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
     cp lib/libmklml_intel.so lib/libmklml_intel.so.tmp
     mv lib/libmklml_intel.so.tmp lib/libmklml_intel.so
     cp lib/libiomp5.so lib/libiomp5.so.tmp
@@ -312,8 +310,6 @@ build_centos7_gpu() {
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         -j$(nproc)
-    cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
-    mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
     cp lib/libmklml_intel.so lib/libmklml_intel.so.tmp
     mv lib/libmklml_intel.so.tmp lib/libmklml_intel.so
     cp lib/libiomp5.so lib/libiomp5.so.tmp
@@ -624,8 +620,6 @@ build_ubuntu_gpu_cuda91_cudnn7() {
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         -j$(nproc)
-    cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
-    mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
     cp lib/libiomp5.so lib/libiomp5.so.tmp
     mv lib/libiomp5.so.tmp lib/libiomp5.so
 }

From ab753738fb7978053681de1c900b0d7fc75baeb4 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 13 Dec 2018 16:19:57 -0800
Subject: [PATCH 67/93] add ld path

---
 ci/docker/runtime_functions.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 4d46f5af8001..90285a0355a1 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -867,6 +867,7 @@ unittest_ubuntu_cpu_julia06() {
 
     # FIXME
     export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so'
+    export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
 
     # use the prebuilt binary from $MXNET_HOME/lib
     julia -e 'Pkg.build("MXNet")'
@@ -1196,6 +1197,7 @@ deploy_jl_docs() {
 
     # FIXME
     export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so'
+    export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
 
     # use the prebuilt binary from $MXNET_HOME/lib
     julia -e 'Pkg.build("MXNet")'

From 63821d4c15ec50ed649205c3b0d6d1e58da3d2fb Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 13 Dec 2018 22:34:44 -0800
Subject: [PATCH 68/93] retrigger


From 13bf7ae0bac9e364a6498c47ce734eea9a87e583 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 13 Dec 2018 22:43:46 -0800
Subject: [PATCH 69/93] Revert "remove moving mkldnn artifact"

This reverts commit 16cca196e9e1ad92db74f4e8a01b3b052076d268.
---
 ci/docker/runtime_functions.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 90285a0355a1..4e6622c6e3a4 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -253,6 +253,8 @@ build_centos7_cpu() {
         USE_BLAS=openblas \
         USE_DIST_KVSTORE=1 \
         -j$(nproc)
+    cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
+    mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
     cp lib/libmklml_intel.so lib/libmklml_intel.so.tmp
     mv lib/libmklml_intel.so.tmp lib/libmklml_intel.so
     cp lib/libiomp5.so lib/libiomp5.so.tmp
@@ -310,6 +312,8 @@ build_centos7_gpu() {
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         -j$(nproc)
+    cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
+    mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
     cp lib/libmklml_intel.so lib/libmklml_intel.so.tmp
     mv lib/libmklml_intel.so.tmp lib/libmklml_intel.so
     cp lib/libiomp5.so lib/libiomp5.so.tmp
@@ -620,6 +624,8 @@ build_ubuntu_gpu_cuda91_cudnn7() {
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         -j$(nproc)
+    cp lib/libmkldnn.so.0 lib/libmkldnn.so.0.tmp
+    mv lib/libmkldnn.so.0.tmp lib/libmkldnn.so.0
     cp lib/libiomp5.so lib/libiomp5.so.tmp
     mv lib/libiomp5.so.tmp lib/libiomp5.so
 }

From 935b1dc89b1e56383aa7bc5ad35916dba13fb6da Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 13 Dec 2018 22:43:53 -0800
Subject: [PATCH 70/93] Revert "remove mkldnn dynamic lib"

This reverts commit d51043622d4ef7fcb95aff6a3e84d91ab71b48c9.
---
 ci/jenkins/Jenkins_steps.groovy | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 77735d68da60..a187e0c9f259 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,22 +23,22 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, lib/libmklml_intel.so lib/libiomp5.so'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 
 // Python wheels
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmklml_intel.so, lib/libiomp5.so'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
-mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
+mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
-mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so, lib/libmklml_intel.so, lib/libiomp5.so'
-mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libiomp5.so'
-mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libiomp5.so'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
+mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
+mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*, lib/libmklml_intel.so, lib/libmkldnn.so.0, lib/libiomp5.so'
 
 // Python unittest for CPU
 // Python 2

From 59bae57a038f147d95d740475423d0b69a2b3759 Mon Sep 17 00:00:00 2001
From: Alexander Zai <alexzai@amazon.com>
Date: Thu, 13 Dec 2018 23:02:15 -0800
Subject: [PATCH 71/93] update makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 1faa9e039467..dc17fcae7bdf 100644
--- a/Makefile
+++ b/Makefile
@@ -136,7 +136,7 @@ ifeq ($(USE_MKLDNN), 1)
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
 	CFLAGS += -I$(MKLDNNROOT)/include
-	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,$(ROOTDIR)/lib
 endif
 
 # setup opencv

From ce55f1bc88cc789d121071cd28fd31d6101f29eb Mon Sep 17 00:00:00 2001
From: Steffen Rochel <steffenrochel@gmail.com>
Date: Fri, 14 Dec 2018 08:35:42 -0800
Subject: [PATCH 72/93] updated reference to Apache MXNet (#13645)

---
 julia/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/julia/README.md b/julia/README.md
index a4299575f95e..2ff7553063f3 100644
--- a/julia/README.md
+++ b/julia/README.md
@@ -3,7 +3,7 @@
 [![MXNet](http://pkg.julialang.org/badges/MXNet_0.6.svg)](http://pkg.julialang.org/?pkg=MXNet)
 
 
-MXNet.jl is the [dmlc/mxnet](https://github.com/apache/incubator-mxnet) [Julia](http://julialang.org/) package. MXNet.jl brings flexible and efficient GPU computing and state-of-art deep learning to Julia. Some highlight of its features include:
+MXNet.jl is the [Apache MXNet](https://github.com/apache/incubator-mxnet) [Julia](http://julialang.org/) package. MXNet.jl brings flexible and efficient GPU computing and state-of-the-art deep learning to Julia. Some highlights of its features include:
 
 * Efficient tensor/matrix computation across multiple devices, including multiple CPUs, GPUs and distributed server nodes.
 * Flexible symbolic manipulation to composite and construction of state-of-the-art deep learning models.

From 8f569338432e852b971869984a7dbe8a117ef46f Mon Sep 17 00:00:00 2001
From: "Joshua Z. Zhang" <cheungchih@gmail.com>
Date: Fri, 14 Dec 2018 10:16:32 -0800
Subject: [PATCH 73/93] Complimentary gluon DataLoader improvements (#13606)

* init

* add tests

* doc

* lint

* fix openmp
---
 docs/faq/env_var.md                      | 18 +++++++----
 python/mxnet/gluon/data/dataloader.py    | 40 ++++++++++++++++++------
 src/initialize.cc                        |  8 +++--
 tests/python/unittest/test_gluon_data.py |  7 +++--
 4 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index 0f0c82f4b599..98057d0d76d6 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -37,6 +37,12 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
 * MXNET_CPU_NNPACK_NTHREADS
   - Values: Int ```(default=4)```
   - The number of threads used for NNPACK. NNPACK package aims to provide high-performance implementations of some layers for multi-core CPUs. Checkout [NNPACK](http://mxnet.io/faq/nnpack.html) to know more about it.
+* MXNET_MP_WORKER_NTHREADS
+  - Values: Int ```(default=1)```
+  - The number of scheduling threads on CPU given to multiprocess workers. Enlarging this number allows more operators to run in parallel in individual workers, but please consider reducing the overall `num_workers` to avoid thread contention (not available on Windows).
+* MXNET_MP_OPENCV_NUM_THREADS
+  - Values: Int ```(default=0)```
+  - The number of OpenCV execution threads given to multiprocess workers. OpenCV multithreading is disabled if `MXNET_MP_OPENCV_NUM_THREADS` < 1 (default). Enlarging this number may boost the performance of individual workers when executing underlying OpenCV functions, but please consider reducing the overall `num_workers` to avoid thread contention (not available on Windows).
 
 ## Memory Options
 
@@ -99,10 +105,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
 * MXNET_KVSTORE_REDUCTION_NTHREADS
   - Values: Int ```(default=4)```
   - The number of CPU threads used for summing up big arrays on a single machine
-  - This will also be used for `dist_sync` kvstore to sum up arrays from different contexts on a single machine. 
-  - This does not affect summing up of arrays from different machines on servers. 
+  - This will also be used for `dist_sync` kvstore to sum up arrays from different contexts on a single machine.
+  - This does not affect summing up of arrays from different machines on servers.
   - Summing up of arrays for `dist_sync_device` kvstore is also unaffected as that happens on GPUs.
-  
+
 * MXNET_KVSTORE_BIGARRAY_BOUND
   - Values: Int ```(default=1000000)```
   - The minimum size of a "big array".
@@ -166,7 +172,7 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
 
 * MXNET_CUDNN_AUTOTUNE_DEFAULT
   - Values: 0, 1, or 2 ```(default=1)```
-  - The default value of cudnn auto tuning for convolution layers. 
+  - The default value of cudnn auto tuning for convolution layers.
   - Value of 0 means there is no auto tuning to pick the convolution algo
   - Performance tests are run to pick the convolution algo when value is 1 or 2
   - Value of 1 chooses the best algo in a limited workspace
@@ -190,12 +196,12 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
 * MXNET_HOME
   - Data directory in the filesystem for storage, for example when downloading gluon models.
   - Default in *nix is .mxnet APPDATA/mxnet in windows.
-  
+
 * MXNET_MKLDNN_ENABLED
   - Values: 0, 1 ```(default=1)```
   - Flag to enable or disable MKLDNN accelerator. On by default.
   - Only applies to mxnet that has been compiled with MKLDNN (```pip install mxnet-mkl``` or built from source with ```USE_MKLDNN=1```)
-  
+
 * MXNET_MKLDNN_CACHE_NUM
   - Values: Int ```(default=-1)```
   - Flag to set num of elements that MKLDNN cache can hold. Default is -1 which means cache size is unbounded. Should only be set if your model has variable input shapes, as cache size may grow unbounded. The number represents the number of items in the cache and is proportional to the number of layers that use MKLDNN and different input shape.
diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py
index 586e620470d3..9d762745a407 100644
--- a/python/mxnet/gluon/data/dataloader.py
+++ b/python/mxnet/gluon/data/dataloader.py
@@ -26,6 +26,7 @@
 import multiprocessing
 import multiprocessing.queues
 from multiprocessing.reduction import ForkingPickler
+from multiprocessing.pool import ThreadPool
 import threading
 import numpy as np
 
@@ -384,8 +385,9 @@ def _worker_initializer(dataset):
     global _worker_dataset
     _worker_dataset = dataset
 
-def _worker_fn(samples, batchify_fn):
+def _worker_fn(samples, batchify_fn, dataset=None):
     """Function for processing data in worker process."""
+    # pylint: disable=unused-argument
     # it is required that each worker process has to fork a new MXIndexedRecordIO handle
     # preserving dataset as global variable can save tons of overhead and is safe in new process
     global _worker_dataset
@@ -394,10 +396,14 @@ def _worker_fn(samples, batchify_fn):
     ForkingPickler(buf, pickle.HIGHEST_PROTOCOL).dump(batch)
     return buf.getvalue()
 
+def _thread_worker_fn(samples, batchify_fn, dataset):
+    """Threadpool worker function for processing data."""
+    return batchify_fn([dataset[i] for i in samples])
+
 class _MultiWorkerIter(object):
     """Internal multi-worker iterator for DataLoader."""
     def __init__(self, worker_pool, batchify_fn, batch_sampler, pin_memory=False,
-                 worker_fn=_worker_fn, prefetch=0):
+                 worker_fn=_worker_fn, prefetch=0, dataset=None):
         self._worker_pool = worker_pool
         self._batchify_fn = batchify_fn
         self._batch_sampler = batch_sampler
@@ -407,6 +413,7 @@ def __init__(self, worker_pool, batchify_fn, batch_sampler, pin_memory=False,
         self._iter = iter(self._batch_sampler)
         self._worker_fn = worker_fn
         self._pin_memory = pin_memory
+        self._dataset = dataset
         # pre-fetch
         for _ in range(prefetch):
             self._push_next()
@@ -419,7 +426,8 @@ def _push_next(self):
         r = next(self._iter, None)
         if r is None:
             return
-        async_ret = self._worker_pool.apply_async(self._worker_fn, (r, self._batchify_fn))
+        async_ret = self._worker_pool.apply_async(
+            self._worker_fn, (r, self._batchify_fn, self._dataset))
         self._data_buffer[self._sent_idx] = async_ret
         self._sent_idx += 1
 
@@ -432,7 +440,7 @@ def __next__(self):
         assert self._rcvd_idx < self._sent_idx, "rcvd_idx must be smaller than sent_idx"
         assert self._rcvd_idx in self._data_buffer, "fatal error with _push_next, rcvd_idx missing"
         ret = self._data_buffer.pop(self._rcvd_idx)
-        batch = pickle.loads(ret.get())
+        batch = pickle.loads(ret.get()) if self._dataset is None else ret.get()
         if self._pin_memory:
             batch = _as_in_context(batch, context.cpu_pinned())
         batch = batch[0] if len(batch) == 1 else batch
@@ -498,12 +506,18 @@ def default_batchify_fn(data):
         but will consume more shared_memory. Using smaller number may forfeit the purpose of using
         multiple worker processes, try reduce `num_workers` in this case.
         By default it defaults to `num_workers * 2`.
+    thread_pool : bool, default False
+        If ``True``, use a thread pool instead of a multiprocessing pool. Using a thread pool
+        can avoid shared memory usage. If `DataLoader` is mostly IO bound, or the GIL is not a
+        major bottleneck, the thread pool version may perform better than multiprocessing.
+
     """
     def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
                  last_batch=None, batch_sampler=None, batchify_fn=None,
-                 num_workers=0, pin_memory=False, prefetch=None):
+                 num_workers=0, pin_memory=False, prefetch=None, thread_pool=False):
         self._dataset = dataset
         self._pin_memory = pin_memory
+        self._thread_pool = thread_pool
 
         if batch_sampler is None:
             if batch_size is None:
@@ -529,8 +543,11 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
         self._worker_pool = None
         self._prefetch = max(0, int(prefetch) if prefetch is not None else 2 * self._num_workers)
         if self._num_workers > 0:
-            self._worker_pool = multiprocessing.Pool(
-                self._num_workers, initializer=_worker_initializer, initargs=[self._dataset])
+            if self._thread_pool:
+                self._worker_pool = ThreadPool(self._num_workers)
+            else:
+                self._worker_pool = multiprocessing.Pool(
+                    self._num_workers, initializer=_worker_initializer, initargs=[self._dataset])
         if batchify_fn is None:
             if num_workers > 0:
                 self._batchify_fn = default_mp_batchify_fn
@@ -551,14 +568,17 @@ def same_process_iter():
 
         # multi-worker
         return _MultiWorkerIter(self._worker_pool, self._batchify_fn, self._batch_sampler,
-                                pin_memory=self._pin_memory, worker_fn=_worker_fn,
-                                prefetch=self._prefetch)
+                                pin_memory=self._pin_memory,
+                                worker_fn=_thread_worker_fn if self._thread_pool else _worker_fn,
+                                prefetch=self._prefetch,
+                                dataset=self._dataset if self._thread_pool else None)
 
     def __len__(self):
         return len(self._batch_sampler)
 
     def __del__(self):
         if self._worker_pool:
-            # manually terminate due to a bug that pool is not automatically terminated on linux
+            # manually terminate due to a bug that pool is not automatically terminated
+            # https://bugs.python.org/issue34172
             assert isinstance(self._worker_pool, multiprocessing.pool.Pool)
             self._worker_pool.terminate()
diff --git a/src/initialize.cc b/src/initialize.cc
index ddda3f18a3ae..de7edd1b1455 100644
--- a/src/initialize.cc
+++ b/src/initialize.cc
@@ -57,11 +57,13 @@ class LibraryInitializer {
         Engine::Get()->Start();
       },
       []() {
-        // Make children single threaded since they are typically workers
-        dmlc::SetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
+        // Conservative thread management for multiprocess workers
+        const size_t mp_worker_threads = dmlc::GetEnv("MXNET_MP_WORKER_NTHREADS", 1);
+        dmlc::SetEnv("MXNET_CPU_WORKER_NTHREADS", mp_worker_threads);
         dmlc::SetEnv("OMP_NUM_THREADS", 1);
 #if MXNET_USE_OPENCV && !__APPLE__
-        cv::setNumThreads(0);  // disable opencv threading
+        const size_t mp_cv_num_threads = dmlc::GetEnv("MXNET_MP_OPENCV_NUM_THREADS", 0);
+        cv::setNumThreads(mp_cv_num_threads);  // 0 (the default) disables opencv threading
 #endif  // MXNET_USE_OPENCV
         engine::OpenMP::Get()->set_enabled(false);
         Engine::Get()->Start();
diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index a3ba222c71d8..6a5322616e20 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -156,9 +156,10 @@ def __getitem__(self, key):
 @with_seed()
 def test_multi_worker():
     data = Dataset()
-    loader = gluon.data.DataLoader(data, batch_size=1, num_workers=5)
-    for i, batch in enumerate(loader):
-        assert (batch.asnumpy() == i).all()
+    for thread_pool in [True, False]:
+        loader = gluon.data.DataLoader(data, batch_size=1, num_workers=5, thread_pool=thread_pool)
+        for i, batch in enumerate(loader):
+            assert (batch.asnumpy() == i).all()
 
 class _Dummy(Dataset):
     """Dummy dataset for randomized shape arrays."""

From d59dae615475acc80be04747fe19a559de0bb511 Mon Sep 17 00:00:00 2001
From: Marco de Abreu <marcoabreu@users.noreply.github.com>
Date: Fri, 14 Dec 2018 20:54:51 +0100
Subject: [PATCH 74/93] Improve CCache handling (#13456)

* Remove gitignore entries

* Modify Makefile

* Modify user permissions

* Add new ccache wrapper function

* Change PATH rewrite to a different one to resolve CUDA issues

* Add ccache to gpu cmake

* Enable ccache for every build

* Set permissions for arm dockerfiles

* Disable ccache for ASAN

* Remove g++-8 ccache redirect

* Update Android Dockerfiles for user permissions

* Fix ASAN compiler typo

* Remove sanity for speed

* Move build dir creation in android armv8

* Revert "Remove sanity for speed"

This reverts commit e8386a774dafe96337930b9cac36cb24fc36585e.

* Add ccache for NVCC in Makefile
---
 .gitignore                               |   4 -
 Makefile                                 |   6 +-
 ci/docker/Dockerfile.build.android_armv7 |   5 ++
 ci/docker/Dockerfile.build.android_armv8 |   8 +-
 ci/docker/Dockerfile.build.armv6         |   5 ++
 ci/docker/Dockerfile.build.armv7         |   5 ++
 ci/docker/Dockerfile.build.armv8         |   5 ++
 ci/docker/Dockerfile.build.jetson        |   5 ++
 ci/docker/install/centos7_adduser.sh     |   5 ++
 ci/docker/install/ubuntu_adduser.sh      |   5 ++
 ci/docker/runtime_functions.sh           | 108 +++++++++++++++++------
 11 files changed, 124 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index c7530ab69c6a..7eb8e7d6e777 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,10 +167,6 @@ python/.eggs
 tests/Makefile
 tests/mxnet_unit_tests
 
-# generated wrappers for ccache
-cc
-cxx
-
 # Code coverage related
 .coverage
 *.gcov
diff --git a/Makefile b/Makefile
index dc17fcae7bdf..f82bbf3684a3 100644
--- a/Makefile
+++ b/Makefile
@@ -467,7 +467,7 @@ build/src/%.o: src/%.cc | mkldnn
 
 build/src/%_gpu.o: src/%.cu | mkldnn
 	@mkdir -p $(@D)
-	$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" -M -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d
+	$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" --generate-dependencies -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d
 	$(NVCC) -c -o $@ $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" $<
 
 # A nvcc bug cause it to generate "generic/xxx.h" dependencies from torch headers.
@@ -483,7 +483,7 @@ build/plugin/%.o: plugin/%.cc
 
 %_gpu.o: %.cu
 	@mkdir -p $(@D)
-	$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS) -Isrc/operator" -M -MT $*_gpu.o $< >$*_gpu.d
+	$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS) -Isrc/operator" --generate-dependencies -MT $*_gpu.o $< >$*_gpu.d
 	$(NVCC) -c -o $@ $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS) -Isrc/operator" $<
 
 %.o: %.cc $(CORE_INC)
@@ -690,7 +690,7 @@ rclean:
 
 ifneq ($(EXTRA_OPERATORS),)
 clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN)
-	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ 
+	$(RM) -r build lib bin deps *~ */*~ */*/*~ */*/*/*~ 
 	cd $(DMLC_CORE); $(MAKE) clean; cd -
 	cd $(PS_PATH); $(MAKE) clean; cd -
 	cd $(NNVM_PATH); $(MAKE) clean; cd -
diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7
index c601fc5e5ff7..a2e98cd2efe1 100644
--- a/ci/docker/Dockerfile.build.android_armv7
+++ b/ci/docker/Dockerfile.build.android_armv7
@@ -75,6 +75,11 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
 
 WORKDIR /work
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
 
diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8
index 60376b8efda2..f7de86763457 100644
--- a/ci/docker/Dockerfile.build.android_armv8
+++ b/ci/docker/Dockerfile.build.android_armv8
@@ -74,6 +74,12 @@ ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++
 COPY install/android_arm64_openblas.sh /work/
 RUN /work/android_arm64_openblas.sh
 ENV CPLUS_INCLUDE_PATH /work/deps/OpenBLAS
-WORKDIR /work/build
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
 
 COPY runtime_functions.sh /work/
+
+WORKDIR /work/build
\ No newline at end of file
diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6
index 6f16d8c77a0a..60e223b7a60f 100644
--- a/ci/docker/Dockerfile.build.armv6
+++ b/ci/docker/Dockerfile.build.armv6
@@ -38,5 +38,10 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
 COPY install/deb_ubuntu_ccache.sh /work/
 RUN /work/deb_ubuntu_ccache.sh
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7
index 5f0223448f12..0b557d5839e9 100644
--- a/ci/docker/Dockerfile.build.armv7
+++ b/ci/docker/Dockerfile.build.armv7
@@ -38,5 +38,10 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
 COPY install/deb_ubuntu_ccache.sh /work/
 RUN /work/deb_ubuntu_ccache.sh
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8
index 27bd425ae9b7..ef9c95865590 100644
--- a/ci/docker/Dockerfile.build.armv8
+++ b/ci/docker/Dockerfile.build.armv8
@@ -42,5 +42,10 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
 COPY install/deb_ubuntu_ccache.sh /work/
 RUN /work/deb_ubuntu_ccache.sh
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/build
diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index d128ebc7e2a7..30b9b7e37507 100644
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -82,5 +82,10 @@ ENV NVCCFLAGS "-m64"
 ENV CUDA_ARCH "-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62"
 ENV NVCC /usr/local/cuda/bin/nvcc
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
diff --git a/ci/docker/install/centos7_adduser.sh b/ci/docker/install/centos7_adduser.sh
index ba72c9b92281..f9d2402c9554 100755
--- a/ci/docker/install/centos7_adduser.sh
+++ b/ci/docker/install/centos7_adduser.sh
@@ -34,4 +34,9 @@ then
     mkdir /work/mxnet
     mkdir /work/build
     chown -R jenkins_slave /work/
+
+    # Later on, we have to override the links because underlying build systems ignore our compiler settings. Thus,
+    # we have to give the process the proper permission to these files. This is hacky, but unfortunately 
+    # there's no better way to do this without patching all our submodules.
+    chown -R jenkins_slave /usr/local/bin
 fi
diff --git a/ci/docker/install/ubuntu_adduser.sh b/ci/docker/install/ubuntu_adduser.sh
index 515a80f63b07..a7668bac2ab6 100755
--- a/ci/docker/install/ubuntu_adduser.sh
+++ b/ci/docker/install/ubuntu_adduser.sh
@@ -40,4 +40,9 @@ then
     mkdir /work/mxnet
     mkdir /work/build
     chown -R jenkins_slave /work/
+
+    # Later on, we have to override the links because underlying build systems ignore our compiler settings. Thus,
+    # we have to give the process the proper permission to these files. This is hacky, but unfortunately 
+    # there's no better way to do this without patching all our submodules.
+    chown -R jenkins_slave /usr/local/bin
 fi
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 5f60738ee5fa..d073962acf40 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -39,32 +39,59 @@ clean_repo() {
 build_ccache_wrappers() {
     set -ex
 
-    rm -f cc
-    rm -f cxx
-
-    touch cc
-    touch cxx
-
     if [ -z ${CC+x} ]; then
         echo "No \$CC set, defaulting to gcc";
         export CC=gcc
     fi
-
-    if [ -z ${CXX+x} ]; then
+     if [ -z ${CXX+x} ]; then
        echo "No \$CXX set, defaulting to g++";
        export CXX=g++
     fi
 
-    # this function is nessesary for cuda enabled make based builds, since nvcc needs just an executable for -ccbin
-
-    echo -e "#!/bin/sh\n/usr/local/bin/ccache ${CC} \"\$@\"\n" >> cc
-    echo -e "#!/bin/sh\n/usr/local/bin/ccache ${CXX} \"\$@\"\n" >> cxx
-
-    chmod +x cc
-    chmod +x cxx
-
-    export CC=`pwd`/cc
-    export CXX=`pwd`/cxx
+    # Recommended by CCache: https://ccache.samba.org/manual.html#_run_modes
+    # Add to the beginning of path to ensure this redirection is picked up instead
+    # of the original ones. Especially CUDA/NVCC appends itself to the beginning of the
+    # path and thus this redirect is ignored. This change fixes this problem
+    # This hacky approach with symbolic links is required because underlying build
+    # systems of our submodules ignore our CMake settings. If they use Makefile,
+    # we can't influence them at all in general and NVCC also prefers to hardcode their
+    # compiler instead of respecting the settings. Thus, we take this brutal approach
+    # and just redirect everything if this installer has been called.
+    # In future, we could do these links during image build time of the container.
+    # But in the beginning, we'll make this opt-in. In future, loads of processes like
+    # the scala make step or numpy compilation and other pip package generations
+    # could be heavily sped up by using ccache as well.
+    mkdir /tmp/ccache-redirects  # absolute link targets below: a relative "ccache" would resolve inside this dir
+    export PATH=/tmp/ccache-redirects:$PATH
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/gcc
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/gcc-8
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/g++
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/g++-8
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/nvcc
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/clang++-3.9
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/clang-3.9
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/clang++-5.0
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/clang-5.0
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/clang++-6.0
+    ln -s /usr/local/bin/ccache /tmp/ccache-redirects/clang-6.0
+    ln -s ccache /usr/local/bin/gcc
+    ln -s ccache /usr/local/bin/gcc-8
+    ln -s ccache /usr/local/bin/g++
+    ln -s ccache /usr/local/bin/g++-8
+    ln -s ccache /usr/local/bin/nvcc
+    ln -s ccache /usr/local/bin/clang++-3.9
+    ln -s ccache /usr/local/bin/clang-3.9
+    ln -s ccache /usr/local/bin/clang++-5.0
+    ln -s ccache /usr/local/bin/clang-5.0
+    ln -s ccache /usr/local/bin/clang++-6.0
+    ln -s ccache /usr/local/bin/clang-6.0
+
+    export NVCC=ccache
+
+    # Uncomment if you would like to debug CCache hit rates.
+    # You can monitor using tail -f ccache-log
+    # export CCACHE_LOGFILE=/work/mxnet/ccache-log
+    # export CCACHE_DEBUG=1
 }
 
 build_wheel() {
@@ -106,6 +133,8 @@ build_jetson() {
     set -ex
     pushd .
 
+    build_ccache_wrappers
+
     cp make/crosscompile.jetson.mk ./config.mk
     make -j$(nproc)
 
@@ -129,6 +158,7 @@ build_armv6() {
 
     # We do not need OpenMP, since most armv6 systems have only 1 core
 
+    build_ccache_wrappers
     cmake \
         -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -159,6 +189,7 @@ build_armv7() {
     # file tries to add -llapack. Lapack functionality though, requires -lgfortran
     # to be linked additionally.
 
+    build_ccache_wrappers
     cmake \
         -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
         -DCMAKE_CROSSCOMPILING=ON \
@@ -181,6 +212,7 @@ build_armv7() {
 }
 
 build_armv8() {
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -205,6 +237,7 @@ build_armv8() {
 build_android_armv7() {
     set -ex
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DANDROID=ON\
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -225,6 +258,7 @@ build_android_armv7() {
 build_android_armv8() {
     set -ex
     cd /work/build
+    build_ccache_wrappers
     cmake\
         -DANDROID=ON \
         -DUSE_CUDA=OFF\
@@ -244,7 +278,7 @@ build_centos7_cpu() {
     cd /work/mxnet
     export CC="ccache gcc"
     export CXX="ccache g++"
-
+    build_ccache_wrappers
     make \
         DEV=1 \
         USE_LAPACK=1 \
@@ -263,6 +297,7 @@ build_centos7_cpu() {
 
 build_amzn_linux_cpu() {
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -285,7 +320,7 @@ build_centos7_mkldnn() {
     cd /work/mxnet
     export CC="ccache gcc"
     export CXX="ccache g++"
-
+    build_ccache_wrappers
     make \
         DEV=1 \
         ENABLE_TESTCOVERAGE=1 \
@@ -299,7 +334,7 @@ build_centos7_gpu() {
     set -ex
     cd /work/mxnet
     # unfortunately this build has problems in 3rdparty dependencies with ccache and make
-    # build_ccache_wrappers
+    build_ccache_wrappers
     make \
         DEV=1                                     \
         ENABLE_TESTCOVERAGE=1                     \
@@ -326,8 +361,9 @@ build_ubuntu_cpu() {
 
 build_ubuntu_cpu_openblas() {
     set -ex
-    export CC="ccache gcc"
-    export CXX="ccache g++"
+    export CC="gcc"
+    export CXX="g++"
+    build_ccache_wrappers
     make \
         DEV=1                         \
         ENABLE_TESTCOVERAGE=1         \
@@ -355,6 +391,7 @@ build_ubuntu_cpu_cmake_debug() {
     set -ex
     pushd .
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -376,11 +413,12 @@ build_ubuntu_cpu_cmake_asan() {
 
     pushd .
     cd /work/build
+    export CXX=g++-8
+    export CC=gcc-8
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-        -DCMAKE_CXX_COMPILER=/usr/bin/g++-8 \
-        -DCMAKE_C_COMPILER=/usr/bin/gcc-8 \
         -DUSE_CUDA=OFF \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_OPENMP=OFF \
@@ -402,10 +440,10 @@ build_ubuntu_cpu_cmake_asan() {
 
 build_ubuntu_cpu_clang39() {
     set -ex
-     export CXX=clang++-3.9
+    export CXX=clang++-3.9
     export CC=clang-3.9
-     build_ccache_wrappers
-     make \
+    build_ccache_wrappers
+    make \
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
@@ -440,6 +478,7 @@ build_ubuntu_cpu_clang_tidy() {
 
     pushd .
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -534,6 +573,8 @@ build_ubuntu_gpu_tensorrt() {
     mkdir -p build
     cd build
     cmake \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DCMAKE_CXX_FLAGS=-I/usr/include/python${PYVER}\
         -DBUILD_SHARED_LIBS=ON ..\
         -G Ninja
@@ -548,7 +589,10 @@ build_ubuntu_gpu_tensorrt() {
     cd 3rdparty/onnx-tensorrt/
     mkdir -p build
     cd build
-    cmake ..
+    cmake \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        ..
     make -j$(nproc)
     export LIBRARY_PATH=`pwd`:$LIBRARY_PATH
     popd
@@ -633,6 +677,7 @@ build_ubuntu_gpu_cuda91_cudnn7() {
 build_ubuntu_amalgamation() {
     set -ex
     # Amalgamation can not be run with -j nproc
+    build_ccache_wrappers
     make -C amalgamation/ clean
     make -C amalgamation/     \
         USE_BLAS=openblas     \
@@ -642,6 +687,7 @@ build_ubuntu_amalgamation() {
 build_ubuntu_amalgamation_min() {
     set -ex
     # Amalgamation can not be run with -j nproc
+    build_ccache_wrappers
     make -C amalgamation/ clean
     make -C amalgamation/     \
         USE_BLAS=openblas     \
@@ -652,9 +698,11 @@ build_ubuntu_amalgamation_min() {
 build_ubuntu_gpu_cmake_mkldnn() {
     set -ex
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache    \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache      \
+        -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache   \
         -DENABLE_TESTCOVERAGE=ON                \
         -DUSE_CUDA=1                            \
         -DUSE_CUDNN=1                           \
@@ -675,9 +723,11 @@ build_ubuntu_gpu_cmake_mkldnn() {
 build_ubuntu_gpu_cmake() {
     set -ex
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache    \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache      \
+        -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache   \
         -DENABLE_TESTCOVERAGE=ON                \
         -DUSE_CUDA=1                            \
         -DUSE_CUDNN=1                           \

From 948ea016f7fe574699330ff8f3363d8faf651224 Mon Sep 17 00:00:00 2001
From: mathieu <matdpro@gmail.com>
Date: Fri, 14 Dec 2018 23:22:06 +0100
Subject: [PATCH 75/93] [MXNET-918] Random module (#13039)

* introduce random API

* revert useless changes

* shorter types in APIDoc gen code

* fix after merge from master

* Trigger CI

* temp code / diag on CI

* cleanup type-class code

* cleanup type-class code

* fix scalastyle
---
 .../main/scala/org/apache/mxnet/Base.scala    |  18 ++
 .../main/scala/org/apache/mxnet/NDArray.scala |   1 +
 .../scala/org/apache/mxnet/NDArrayAPI.scala   |  13 +-
 .../main/scala/org/apache/mxnet/Symbol.scala  |   1 +
 .../scala/org/apache/mxnet/SymbolAPI.scala    |  12 +-
 .../scala/org/apache/mxnet/NDArraySuite.scala |  17 ++
 .../scala/org/apache/mxnet/SymbolSuite.scala  |  22 +++
 .../org/apache/mxnet/APIDocGenerator.scala    |  43 ++++-
 .../org/apache/mxnet/GeneratorBase.scala      |  75 +++++++-
 .../scala/org/apache/mxnet/NDArrayMacro.scala | 171 ++++++++++++++----
 .../scala/org/apache/mxnet/SymbolMacro.scala  | 147 +++++++++++----
 11 files changed, 435 insertions(+), 85 deletions(-)

diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Base.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Base.scala
index b2a53fd9f2dd..bb9518d51f1e 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Base.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Base.scala
@@ -153,3 +153,21 @@ private[mxnet] object Base {
 }
 
 class MXNetError(val err: String) extends Exception(err)
+
+// Some type-classes to ease the work in Symbol.random and NDArray.random modules
+
+class SymbolOrScalar[T](val isScalar: Boolean)
+object SymbolOrScalar {
+  def apply[T](implicit ev: SymbolOrScalar[T]): SymbolOrScalar[T] = ev
+  implicit object FloatWitness extends SymbolOrScalar[Float](true)
+  implicit object IntWitness extends SymbolOrScalar[Int](true)
+  implicit object SymbolWitness extends SymbolOrScalar[Symbol](false)
+}
+
+class NDArrayOrScalar[T](val isScalar: Boolean)
+object NDArrayOrScalar {
+  def apply[T](implicit ev: NDArrayOrScalar[T]): NDArrayOrScalar[T] = ev
+  implicit object FloatWitness extends NDArrayOrScalar[Float](true)
+  implicit object IntWitness extends NDArrayOrScalar[Int](true)
+  implicit object NDArrayWitness extends NDArrayOrScalar[NDArray](false)
+}
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
index 3a0c3c11f16a..125958150b72 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
@@ -40,6 +40,7 @@ object NDArray extends NDArrayBase {
   private val functions: Map[String, NDArrayFunction] = initNDArrayModule()
 
   val api = NDArrayAPI
+  val random = NDArrayRandomAPI
 
   private def addDependency(froms: Array[NDArray], tos: Array[NDArray]): Unit = {
     froms.foreach { from =>
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala
index 1d8551c1b1e5..024fed1c4ba6 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala
@@ -15,11 +15,22 @@
  * limitations under the License.
  */
 package org.apache.mxnet
-@AddNDArrayAPIs(false)
+
 /**
   * typesafe NDArray API: NDArray.api._
   * Main code will be generated during compile time through Macros
   */
+@AddNDArrayAPIs(false)
 object NDArrayAPI extends NDArrayAPIBase {
   // TODO: Implement CustomOp for NDArray
 }
+
+/**
+  * typesafe NDArray random module: NDArray.random._
+  * Main code will be generated during compile time through Macros
+  */
+@AddNDArrayRandomAPIs(false)
+object NDArrayRandomAPI extends NDArrayRandomAPIBase {
+
+}
+
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
index 01349a689b6c..29885fc723cd 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
@@ -842,6 +842,7 @@ object Symbol extends SymbolBase {
   private val bindReqMap = Map("null" -> 0, "write" -> 1, "add" -> 3)
 
   val api = SymbolAPI
+  val random = SymbolRandomAPI
 
   def pow(sym1: Symbol, sym2: Symbol): Symbol = {
     Symbol.createFromListedSymbols("_Power")(Array(sym1, sym2))
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala b/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala
index 1bfb0559cf96..f166de11ea52 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala
@@ -19,11 +19,11 @@ package org.apache.mxnet
 import scala.collection.mutable
 
 
-@AddSymbolAPIs(false)
 /**
   * typesafe Symbol API: Symbol.api._
   * Main code will be generated during compile time through Macros
   */
+@AddSymbolAPIs(false)
 object SymbolAPI extends SymbolAPIBase {
   def Custom (op_type : String, kwargs : mutable.Map[String, Any],
              name : String = null, attr : Map[String, String] = null) : Symbol = {
@@ -32,3 +32,13 @@ object SymbolAPI extends SymbolAPIBase {
     Symbol.createSymbolGeneral("Custom", name, attr, Seq(), map.toMap)
   }
 }
+
+/**
+  * typesafe Symbol random module: Symbol.random._
+  * Main code will be generated during compile time through Macros
+  */
+@AddSymbolRandomAPIs(false)
+object SymbolRandomAPI extends SymbolRandomAPIBase {
+
+}
+
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala
index 5d88bb39e502..7992a0ed867b 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala
@@ -576,4 +576,21 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     assert(arr.internal.toDoubleArray === Array(2d, 2d))
     assert(arr.internal.toByteArray === Array(2.toByte, 2.toByte))
   }
+
+  test("NDArray random module is generated properly") {
+    val lam = NDArray.ones(1, 2)
+    val rnd = NDArray.random.poisson(lam = Some(lam), shape = Some(Shape(3, 4)))
+    val rnd2 = NDArray.random.poisson(lam = Some(1f), shape = Some(Shape(3, 4)))
+    assert(rnd.shape === Shape(1, 2, 3, 4))
+    assert(rnd2.shape === Shape(3, 4))
+  }
+
+  test("NDArray random module is generated properly - special case of 'normal'") {
+    val mu = NDArray.ones(1, 2)
+    val sigma = NDArray.ones(1, 2) * 2
+    val rnd = NDArray.random.normal(mu = Some(mu), sigma = Some(sigma), shape = Some(Shape(3, 4)))
+    val rnd2 = NDArray.random.normal(mu = Some(1f), sigma = Some(2f), shape = Some(Shape(3, 4)))
+    assert(rnd.shape === Shape(1, 2, 3, 4))
+    assert(rnd2.shape === Shape(3, 4))
+  }
 }
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala
index ebb61d7d4bfb..d134c83ff7e7 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala
@@ -20,6 +20,7 @@ package org.apache.mxnet
 import org.scalatest.{BeforeAndAfterAll, FunSuite}
 
 class SymbolSuite extends FunSuite with BeforeAndAfterAll {
+
   test("symbol compose") {
     val data = Symbol.Variable("data")
 
@@ -71,4 +72,25 @@ class SymbolSuite extends FunSuite with BeforeAndAfterAll {
     val data2 = data.clone()
     assert(data.toJson === data2.toJson)
   }
+
+  test("Symbol random module is generated properly") {
+    val lam = Symbol.Variable("lam")
+    val rnd = Symbol.random.poisson(lam = Some(lam), shape = Some(Shape(2, 2)))
+    val rnd2 = Symbol.random.poisson(lam = Some(1f), shape = Some(Shape(2, 2)))
+    // scalastyle:off println
+    println(s"Symbol.random.poisson debug info: ${rnd.debugStr}")
+    println(s"Symbol.random.poisson debug info: ${rnd2.debugStr}")
+    // scalastyle:on println
+  }
+
+  test("Symbol random module is generated properly - special case of 'normal'") {
+    val loc = Symbol.Variable("loc")
+    val scale = Symbol.Variable("scale")
+    val rnd = Symbol.random.normal(mu = Some(loc), sigma = Some(scale), shape = Some(Shape(2, 2)))
+    val rnd2 = Symbol.random.normal(mu = Some(1f), sigma = Some(2f), shape = Some(Shape(2, 2)))
+    // scalastyle:off println
+    println(s"Symbol.random.sample_normal debug info: ${rnd.debugStr}")
+    println(s"Symbol.random.random_normal debug info: ${rnd2.debugStr}")
+    // scalastyle:on println
+  }
 }
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
index ce12dc7cd5a0..97cd18a5b337 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
@@ -27,13 +27,15 @@ import scala.collection.mutable.ListBuffer
   * Two file namely: SymbolAPIBase.scala and NDArrayAPIBase.scala
   * The code will be executed during Macros stage and file live in Core stage
   */
-private[mxnet] object APIDocGenerator extends GeneratorBase {
+private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers {
 
   def main(args: Array[String]): Unit = {
     val FILE_PATH = args(0)
     val hashCollector = ListBuffer[String]()
     hashCollector += typeSafeClassGen(FILE_PATH, true)
     hashCollector += typeSafeClassGen(FILE_PATH, false)
+    hashCollector += typeSafeRandomClassGen(FILE_PATH, true)
+    hashCollector += typeSafeRandomClassGen(FILE_PATH, false)
     hashCollector += nonTypeSafeClassGen(FILE_PATH, true)
     hashCollector += nonTypeSafeClassGen(FILE_PATH, false)
     hashCollector += javaClassGen(FILE_PATH)
@@ -57,8 +59,27 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
 
     writeFile(
       FILE_PATH,
+      "package org.apache.mxnet",
       if (isSymbol) "SymbolAPIBase" else "NDArrayAPIBase",
+      "import org.apache.mxnet.annotation.Experimental",
+      generated)
+  }
+
+  def typeSafeRandomClassGen(FILE_PATH: String, isSymbol: Boolean): String = {
+    val generated = typeSafeRandomFunctionsToGenerate(isSymbol)
+      .map { func =>
+        val scalaDoc = generateAPIDocFromBackend(func)
+        val typeParameter = randomGenericTypeSpec(isSymbol, false)
+        val decl = generateAPISignature(func, isSymbol, typeParameter)
+        s"$scalaDoc\n$decl"
+      }
+
+    writeFile(
+      FILE_PATH,
       "package org.apache.mxnet",
+      if (isSymbol) "SymbolRandomAPIBase" else "NDArrayRandomAPIBase",
+      """import org.apache.mxnet.annotation.Experimental
+        |import scala.reflect.ClassTag""".stripMargin,
       generated)
   }
 
@@ -85,8 +106,9 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
 
     writeFile(
       FILE_PATH,
-      if (isSymbol) "SymbolBase" else "NDArrayBase",
       "package org.apache.mxnet",
+      if (isSymbol) "SymbolBase" else "NDArrayBase",
+      "import org.apache.mxnet.annotation.Experimental",
       absFuncs)
   }
 
@@ -110,7 +132,12 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
       }).toSeq
     val packageName = "NDArrayBase"
     val packageDef = "package org.apache.mxnet.javaapi"
-    writeFile(filePath + "javaapi/", packageName, packageDef, absFuncs)
+    writeFile(
+      filePath + "javaapi/",
+      packageDef,
+      packageName,
+      "import org.apache.mxnet.annotation.Experimental",
+      absFuncs)
   }
 
   def generateAPIDocFromBackend(func: Func, withParam: Boolean = true): String = {
@@ -146,7 +173,7 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
     }
   }
 
-  def generateAPISignature(func: Func, isSymbol: Boolean): String = {
+  def generateAPISignature(func: Func, isSymbol: Boolean, typeParameter: String = ""): String = {
     val argDef = ListBuffer[String]()
 
     argDef ++= typedFunctionCommonArgDef(func)
@@ -162,7 +189,7 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
     val returnType = func.returnType
 
     s"""@Experimental
-       |def ${func.name} (${argDef.mkString(", ")}): $returnType""".stripMargin
+       |def ${func.name}$typeParameter (${argDef.mkString(", ")}): $returnType""".stripMargin
   }
 
   def generateJavaAPISignature(func : Func) : String = {
@@ -223,8 +250,8 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
     }
   }
 
-  def writeFile(FILE_PATH: String, className: String, packageDef: String,
-                absFuncs: Seq[String]): String = {
+  def writeFile(FILE_PATH: String, packageDef: String, className: String,
+                imports: String, absFuncs: Seq[String]): String = {
 
     val finalStr =
       s"""/*
@@ -246,7 +273,7 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
          |
          |$packageDef
          |
-         |import org.apache.mxnet.annotation.Experimental
+         |$imports
          |
          |// scalastyle:off
          |abstract class $className {
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala
index 9245ef1b437f..1c2c4fd704b3 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala
@@ -23,7 +23,7 @@ import org.apache.mxnet.utils.{CToScalaUtils, OperatorBuildUtils}
 import scala.collection.mutable.ListBuffer
 import scala.reflect.macros.blackbox
 
-abstract class GeneratorBase {
+private[mxnet] abstract class GeneratorBase {
   type Handle = Long
 
   case class Arg(argName: String, argType: String, argDesc: String, isOptional: Boolean) {
@@ -46,7 +46,8 @@ abstract class GeneratorBase {
     }
   }
 
-  def typeSafeFunctionsToGenerate(isSymbol: Boolean, isContrib: Boolean): List[Func] = {
+  // filter the operators to generate in the type-safe Symbol.api and NDArray.api
+  protected def typeSafeFunctionsToGenerate(isSymbol: Boolean, isContrib: Boolean): List[Func] = {
     // Operators that should not be generated
     val notGenerated = Set("Custom")
 
@@ -144,8 +145,8 @@ abstract class GeneratorBase {
     result
   }
 
+  // build function argument definition, with optionality, and safe names
   protected def typedFunctionCommonArgDef(func: Func): List[String] = {
-    // build function argument definition, with optionality, and safe names
     func.listOfArgs.map(arg =>
       if (arg.isOptional) {
         // let's avoid a stupid Option[Array[...]]
@@ -161,3 +162,71 @@ abstract class GeneratorBase {
     )
   }
 }
+
+// mixin shared by the doc generator and the macros to generate the Random modules
+private[mxnet] trait RandomHelpers {
+  self: GeneratorBase =>
+
+  // generic type spec (context bounds on T) for the Symbol.random and NDArray.random modules
+  protected def randomGenericTypeSpec(isSymbol: Boolean, fullPackageSpec: Boolean): String = {
+    val classTag = if (fullPackageSpec) "scala.reflect.ClassTag" else "ClassTag"
+    if (isSymbol) s"[T: SymbolOrScalar : $classTag]"
+    else s"[T: NDArrayOrScalar : $classTag]"
+  }
+
+  // filter the operators to generate in the type-safe Symbol.random and NDArray.random
+  protected def typeSafeRandomFunctionsToGenerate(isSymbol: Boolean): List[Func] = {
+    getBackEndFunctions(isSymbol)
+      .filter(f => f.name.startsWith("_sample_") || f.name.startsWith("_random_"))
+      .map(f => f.copy(name = f.name.stripPrefix("_")))
+      // unify _random and _sample
+      .map(f => unifyRandom(f, isSymbol))
+      // deduplicate: random_xyz and sample_xyz now share the name xyz, keep one Func per name
+      .groupBy(_.name)
+      .mapValues(_.head)
+      .values
+      .toList
+  }
+
+  // unify call targets (random_xyz and sample_xyz) and unify their argument types
+  private def unifyRandom(func: Func, isSymbol: Boolean): Func = {
+    val typeConv = Set("org.apache.mxnet.NDArray", "org.apache.mxnet.Symbol",
+      "java.lang.Float", "java.lang.Integer")
+
+    func.copy(
+      // strip the leading prefix only: the macros rebuild the target as random_/sample_ + name
+      name = func.name.replaceFirst("^(random|sample)_", ""),
+      listOfArgs = func.listOfArgs
+        .map(hackNormalFunc)
+        .map(arg =>
+          if (typeConv(arg.argType)) arg.copy(argType = "T")
+          else arg
+        )
+      // TODO: some functions are non consistent in random_ vs sample_ regarding optionality
+      // we may try to unify that as well here.
+    )
+  }
+
+  // hacks to manage the fact that random_normal and sample_normal have
+  // non-consistent parameter naming in the back-end
+  // this first one, merge loc/scale and mu/sigma
+  protected def hackNormalFunc(arg: Arg): Arg = {
+    if (arg.argName == "loc") arg.copy(argName = "mu")
+    else if (arg.argName == "scale") arg.copy(argName = "sigma")
+    else arg
+  }
+
+  // this second one reverts this merge prior to back-end call
+  // it returns Scala source spliced into the generated function body ("" when not needed)
+  protected def unhackNormalFunc(func: Func): String = {
+    if (func.name == "normal") {
+      s"""if(target.equals("random_normal")) {
+         |  if(map.contains("mu")) { map("loc") = map("mu"); map.remove("mu")  }
+         |  if(map.contains("sigma")) { map("scale") = map("sigma"); map.remove("sigma") }
+         |}
+       """.stripMargin
+    } else {
+      ""
+    }
+  }
+}
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
index d85abe1ecc4f..c18694b59bf6 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
@@ -18,7 +18,6 @@
 package org.apache.mxnet
 
 import scala.annotation.StaticAnnotation
-import scala.collection.mutable.ListBuffer
 import scala.language.experimental.macros
 import scala.reflect.macros.blackbox
 
@@ -30,6 +29,14 @@ private[mxnet] class AddNDArrayAPIs(isContrib: Boolean) extends StaticAnnotation
   private[mxnet] def macroTransform(annottees: Any*) = macro TypedNDArrayAPIMacro.typeSafeAPIDefs
 }
 
+private[mxnet] class AddNDArrayRandomAPIs(isContrib: Boolean) extends StaticAnnotation {
+  private[mxnet] def macroTransform(annottees: Any*) =
+  macro TypedNDArrayRandomAPIMacro.typeSafeAPIDefs
+}
+
+/**
+  * For non-typed NDArray API
+  */
 private[mxnet] object NDArrayMacro extends GeneratorBase {
 
   def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
@@ -70,6 +77,9 @@ private[mxnet] object NDArrayMacro extends GeneratorBase {
   }
 }
 
+/**
+  * NDArray.api code generation
+  */
 private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase {
 
   def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
@@ -78,9 +88,9 @@ private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase {
       case q"new AddNDArrayAPIs($b)" => c.eval[Boolean](c.Expr(b))
     }
 
-    val functions = typeSafeFunctionsToGenerate(isSymbol = false, isContrib)
+    val functionDefs = typeSafeFunctionsToGenerate(isSymbol = false, isContrib)
+      .map(f => buildTypedFunction(c)(f))
 
-    val functionDefs = functions.map(f => buildTypedFunction(c)(f))
     structGeneration(c)(functionDefs, annottees: _*)
   }
 
@@ -89,49 +99,136 @@ private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase {
     import c.universe._
 
     val returnType = "org.apache.mxnet.NDArrayFuncReturn"
-    val ndarrayType = "org.apache.mxnet.NDArray"
-
-    // Construct argument field
-    val argDef = ListBuffer[String]()
-    argDef ++= typedFunctionCommonArgDef(function)
-    argDef += "out : Option[NDArray] = None"
-
-    // Construct Implementation field
-    var impl = ListBuffer[String]()
-    impl += "val map = scala.collection.mutable.Map[String, Any]()"
-    impl += s"val args = scala.collection.mutable.ArrayBuffer.empty[$ndarrayType]"
-
-    // NDArray arg implementation
-    impl ++= function.listOfArgs.map { arg =>
-      if (arg.argType.equals(s"Array[$ndarrayType]")) {
-        s"args ++= ${arg.safeArgName}"
-      } else {
-        val base =
-          if (arg.argType.equals(ndarrayType)) {
-            // ndarrays go to args
+
+    // Construct API arguments declaration
+    val argDecl = super.typedFunctionCommonArgDef(function) :+ "out : Option[NDArray] = None"
+
+    // Map API input args to backend args
+    val backendArgsMapping =
+      function.listOfArgs.map { arg =>
+        // ndarrays go to args, other types go to kwargs
+        if (arg.argType.equals(s"Array[org.apache.mxnet.NDArray]")) {
+          s"args ++= ${arg.safeArgName}.toSeq"
+        } else {
+          val base = if (arg.argType.equals("org.apache.mxnet.NDArray")) {
             s"args += ${arg.safeArgName}"
           } else {
-            // other types go to kwargs
             s"""map("${arg.argName}") = ${arg.safeArgName}"""
           }
-        if (arg.isOptional) s"if (!${arg.safeArgName}.isEmpty) $base.get"
-        else base
+          if (arg.isOptional) s"if (!${arg.safeArgName}.isEmpty) $base.get"
+          else base
+        }
       }
-    }
 
-    impl +=
-      s"""if (!out.isEmpty) map("out") = out.get
-         |org.apache.mxnet.NDArray.genericNDArrayFunctionInvoke(
-         |  "${function.name}", args.toSeq, map.toMap)
+    val impl =
+      s"""
+         |def ${function.name}
+         |  (${argDecl.mkString(",")}): $returnType = {
+         |
+         |  val map = scala.collection.mutable.Map[String, Any]()
+         |  val args = scala.collection.mutable.ArrayBuffer.empty[org.apache.mxnet.NDArray]
+         |
+         |  if (!out.isEmpty) map("out") = out.get
+         |
+         |  ${backendArgsMapping.mkString("\n")}
+         |
+         |  org.apache.mxnet.NDArray.genericNDArrayFunctionInvoke(
+         |    "${function.name}", args.toSeq, map.toMap)
+         |}
        """.stripMargin
 
-    // Combine and build the function string
-    val finalStr =
-      s"""def ${function.name}
-         |   (${argDef.mkString(",")}) : $returnType
-         | = {${impl.mkString("\n")}}
+    c.parse(impl).asInstanceOf[DefDef]
+  }
+}
+
+
+/**
+  * NDArray.random code generation
+  */
+private[mxnet] object TypedNDArrayRandomAPIMacro extends GeneratorBase
+  with RandomHelpers {
+
+  def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
+    // Note: no contrib managed in this module
+
+    val functionDefs = typeSafeRandomFunctionsToGenerate(isSymbol = false)
+      .map(f => buildTypedFunction(c)(f))
+
+    structGeneration(c)(functionDefs, annottees: _*)
+  }
+
+  protected def buildTypedFunction(c: blackbox.Context)
+                                  (function: Func): c.universe.DefDef = {
+    import c.universe._
+
+    val returnType = "org.apache.mxnet.NDArrayFuncReturn"
+
+    // Construct API arguments declaration
+    val argDecl = super.typedFunctionCommonArgDef(function) :+ "out : Option[NDArray] = None"
+
+    // Map API input args to backend args
+    val backendArgsMapping =
+      function.listOfArgs.map { arg =>
+        // ndarrays go to args, other types go to kwargs
+        if (arg.argType.equals("Array[org.apache.mxnet.NDArray]")) {
+          s"args ++= ${arg.safeArgName}.toSeq"
+        } else {
+          if (arg.argType.equals("T")) {
+            if (arg.isOptional) {
+              s"""if(${arg.safeArgName}.isDefined) {
+                 |  if(isScalar) {
+                 |    map("${arg.argName}") = ${arg.safeArgName}.get
+                 |  } else {
+                 |    args += ${arg.safeArgName}.get.asInstanceOf[org.apache.mxnet.NDArray]
+                 |  }
+                 |}
+             """.stripMargin
+            } else {
+              s"""if(isScalar) {
+                 |  map("${arg.argName}") = ${arg.safeArgName}
+                 |} else {
+                 |  args += ${arg.safeArgName}.asInstanceOf[org.apache.mxnet.NDArray]
+                 |}
+             """.stripMargin
+            }
+          } else {
+            if (arg.isOptional) {
+              s"""if (${arg.safeArgName}.isDefined) map("${arg.argName}")=${arg.safeArgName}.get"""
+            } else {
+              s"""map("${arg.argName}") = ${arg.safeArgName}"""
+            }
+          }
+        }
+      }
+
+    val impl =
+      s"""
+         |def ${function.name}${randomGenericTypeSpec(false, true)}
+         |  (${argDecl.mkString(",")}): $returnType = {
+         |
+         |  val map = scala.collection.mutable.Map[String, Any]()
+         |  val args = scala.collection.mutable.ArrayBuffer.empty[org.apache.mxnet.NDArray]
+         |  val isScalar = NDArrayOrScalar[T].isScalar
+         |
+         |  if(out.isDefined) map("out") = out.get
+         |
+         |  ${backendArgsMapping.mkString("\n")}
+         |
+         |  val target = if(isScalar) {
+         |    "random_${function.name}"
+         |  } else {
+         |    "sample_${function.name}"
+         |  }
+         |
+         |  ${unhackNormalFunc(function)}
+         |
+         |  org.apache.mxnet.NDArray.genericNDArrayFunctionInvoke(
+         |    target, args.toSeq, map.toMap)
+         |}
        """.stripMargin
 
-    c.parse(finalStr).asInstanceOf[DefDef]
+    c.parse(impl).asInstanceOf[DefDef]
   }
+
+
 }
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
index ab864e1ef195..7ec80b9c066c 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
@@ -17,8 +17,8 @@
 
 package org.apache.mxnet
 
+
 import scala.annotation.StaticAnnotation
-import scala.collection.mutable.ListBuffer
 import scala.language.experimental.macros
 import scala.reflect.macros.blackbox
 
@@ -30,6 +30,14 @@ private[mxnet] class AddSymbolAPIs(isContrib: Boolean) extends StaticAnnotation
   private[mxnet] def macroTransform(annottees: Any*) = macro TypedSymbolAPIMacro.typeSafeAPIDefs
 }
 
+private[mxnet] class AddSymbolRandomAPIs(isContrib: Boolean) extends StaticAnnotation {
+  private[mxnet] def macroTransform(annottees: Any*) =
+  macro TypedSymbolRandomAPIMacro.typeSafeAPIDefs
+}
+
+/**
+  * For non-typed Symbol API
+  */
 private[mxnet] object SymbolMacro extends GeneratorBase {
 
   def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
@@ -63,6 +71,9 @@ private[mxnet] object SymbolMacro extends GeneratorBase {
   }
 }
 
+/**
+  * Symbol.api code generation
+  */
 private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase {
 
   def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
@@ -71,9 +82,9 @@ private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase {
       case q"new AddSymbolAPIs($b)" => c.eval[Boolean](c.Expr(b))
     }
 
-    val functions = typeSafeFunctionsToGenerate(isSymbol = true, isContrib)
+    val functionDefs = typeSafeFunctionsToGenerate(isSymbol = true, isContrib)
+      .map(f => buildTypedFunction(c)(f))
 
-    val functionDefs = functions.map(f => buildTypedFunction(c)(f))
     structGeneration(c)(functionDefs, annottees: _*)
   }
 
@@ -82,45 +93,111 @@ private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase {
     import c.universe._
 
     val returnType = "org.apache.mxnet.Symbol"
-    val symbolType = "org.apache.mxnet.Symbol"
-
-    // Construct argument field
-    val argDef = ListBuffer[String]()
-    argDef ++= typedFunctionCommonArgDef(function)
-    argDef += "name : String = null"
-    argDef += "attr : Map[String, String] = null"
-
-    // Construct Implementation field
-    val impl = ListBuffer[String]()
-    impl += "val map = scala.collection.mutable.Map[String, Any]()"
-    impl += s"var args = scala.collection.Seq[$symbolType]()"
-
-    // Symbol arg implementation
-    impl ++= function.listOfArgs.map { arg =>
-      if (arg.argType.equals(s"Array[$symbolType]")) {
-        s"if (!${arg.safeArgName}.isEmpty) args = ${arg.safeArgName}.toSeq"
-      } else {
-        // all go in kwargs
-        if (arg.isOptional) {
-          s"""if (!${arg.safeArgName}.isEmpty) map("${arg.argName}") = ${arg.safeArgName}.get"""
+
+    // Construct API arguments declaration
+    val argDecl = super.typedFunctionCommonArgDef(function) :+
+      "name : String = null" :+
+      "attr : Map[String, String] = null"
+
+    // Map API input args to backend args
+    val backendArgsMapping =
+      function.listOfArgs.map { arg =>
+        if (arg.argType.equals(s"Array[org.apache.mxnet.Symbol]")) {
+          s"args = ${arg.safeArgName}.toSeq"
         } else {
-          s"""map("${arg.argName}") = ${arg.safeArgName}"""
+          // all go in kwargs
+          if (arg.isOptional) {
+            s"""if (!${arg.safeArgName}.isEmpty) map("${arg.argName}") = ${arg.safeArgName}.get"""
+          } else {
+            s"""map("${arg.argName}") = ${arg.safeArgName}"""
+          }
         }
       }
-    }
 
-    impl +=
-      s"""org.apache.mxnet.Symbol.createSymbolGeneral(
-         |  "${function.name}", name, attr, args, map.toMap)
+    val impl =
+      s"""
+         |def ${function.name}
+         |  (${argDecl.mkString(",")}): $returnType = {
+         |
+         |  val map = scala.collection.mutable.Map[String, Any]()
+         |  var args = scala.collection.Seq[org.apache.mxnet.Symbol]()
+         |
+         |  ${backendArgsMapping.mkString("\n")}
+         |
+         |  org.apache.mxnet.Symbol.createSymbolGeneral(
+         |    "${function.name}", name, attr, args, map.toMap)
+         |}
        """.stripMargin
 
-    // Combine and build the function string
-    val finalStr =
-      s"""def ${function.name}
-         |   (${argDef.mkString(",")}) : $returnType
-         | = {${impl.mkString("\n")}}
+    c.parse(impl).asInstanceOf[DefDef]
+  }
+}
+
+
+/**
+  * Symbol.random code generation
+  */
+private[mxnet] object TypedSymbolRandomAPIMacro extends GeneratorBase
+  with RandomHelpers {
+
+  def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
+    val functionDefs = typeSafeRandomFunctionsToGenerate(isSymbol = true)
+      .map(f => buildTypedFunction(c)(f))
+
+    structGeneration(c)(functionDefs, annottees: _*)
+  }
+
+  protected def buildTypedFunction(c: blackbox.Context)
+                                  (function: Func): c.universe.DefDef = {
+    import c.universe._
+
+    val returnType = "org.apache.mxnet.Symbol"
+
+    // Construct API arguments declaration
+    val argDecl = super.typedFunctionCommonArgDef(function) :+
+      "name : String = null" :+
+      "attr : Map[String, String] = null"
+
+    // Map API input args to backend args
+    val backendArgsMapping =
+      function.listOfArgs.map { arg =>
+        if (arg.argType.equals(s"Array[org.apache.mxnet.Symbol]")) {
+          s"args = ${arg.safeArgName}.toSeq"
+        } else {
+          // all go in kwargs
+          if (arg.isOptional) {
+            s"""if (${arg.safeArgName}.isDefined) map("${arg.argName}") = ${arg.safeArgName}.get"""
+          } else {
+            s"""map("${arg.argName}") = ${arg.safeArgName}"""
+          }
+        }
+      }
+
+    val impl =
+      s"""
+         |def ${function.name}${randomGenericTypeSpec(true, true)}
+         |  (${argDecl.mkString(",")}): $returnType = {
+         |
+         |  val map = scala.collection.mutable.Map[String, Any]()
+         |  var args = scala.collection.Seq[org.apache.mxnet.Symbol]()
+         |  val isScalar = SymbolOrScalar[T].isScalar
+         |
+         |  ${backendArgsMapping.mkString("\n")}
+         |
+         |  val target = if(isScalar) {
+         |    "random_${function.name}"
+         |  } else {
+         |    "sample_${function.name}"
+         |  }
+         |
+         |  ${unhackNormalFunc(function)}
+         |
+         |  org.apache.mxnet.Symbol.createSymbolGeneral(
+         |    target, name, attr, args, map.toMap)
+         |}
        """.stripMargin
 
-    c.parse(finalStr).asInstanceOf[DefDef]
+    c.parse(impl).asInstanceOf[DefDef]
   }
 }
+

From ae377f865c82e0ff295596a8914893a517e43c2e Mon Sep 17 00:00:00 2001
From: Taliesin Beynon <taliesinb@gmail.com>
Date: Sat, 15 Dec 2018 01:36:39 +0200
Subject: [PATCH 76/93] Fix incorrect delete in MXExecutorReshape exception
 handling (#13376)

* Fix bad delete.

Delete the pointed-to handle on cleanup, not the location of the handle itself. Also don't delete it if we didn't set it in the first place.

* Remove unused 'exec' var from MXExecutorBindEX.
---
 src/c_api/c_api_executor.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc
index 1f936b164326..e2e53c7261fa 100644
--- a/src/c_api/c_api_executor.cc
+++ b/src/c_api/c_api_executor.cc
@@ -148,8 +148,6 @@ int MXExecutorBindEX(SymbolHandle symbol_handle,
                      NDArrayHandle *aux_states,
                      ExecutorHandle shared_exec,
                      ExecutorHandle *out) {
-  Executor* exec = nullptr;
-
   API_BEGIN();
   nnvm::Symbol *symb = static_cast<nnvm::Symbol*>(symbol_handle);
   Context ctx = Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id);
@@ -181,7 +179,7 @@ int MXExecutorBindEX(SymbolHandle symbol_handle,
   *out = Executor::Bind(*symb, ctx, ctx_map, in_args_vec,
                         arg_grad_vec, grad_req_vec, aux_states_vec,
                         reinterpret_cast<Executor*>(shared_exec));
-  API_END_HANDLE_ERROR(delete exec);
+  API_END();
 }
 
 /*!
@@ -558,8 +556,11 @@ int MXExecutorReshape(int partial_shaping,
                       NDArrayHandle** aux_states,
                       ExecutorHandle shared_exec,
                       ExecutorHandle *out) {
+  Executor* new_exec = nullptr;
+
   MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
   API_BEGIN();
+  *out = nullptr;  // ensure we can know whether to free executor on early abort
   // create shape map for in_args and aux_states
   std::unordered_map<std::string, TShape> kwargs(num_provided_arg_shapes);
   for (mx_uint i = 0; i < num_provided_arg_shapes; ++i) {
@@ -581,8 +582,9 @@ int MXExecutorReshape(int partial_shaping,
   std::vector<NDArray> aux_state_vec;
 
   Executor* exec = static_cast<Executor*>(shared_exec);
-  *out = exec->Reshape(partial_shaping, allow_up_sizing, ctx, ctx_map, kwargs,
+  new_exec = exec->Reshape(partial_shaping, allow_up_sizing, ctx, ctx_map, kwargs,
                        &in_arg_vec, &arg_grad_vec, &aux_state_vec);
+  *out = new_exec;
 
   ret->ret_handles.clear();
   ret->ret_handles.reserve(in_arg_vec.size()+arg_grad_vec.size()+aux_state_vec.size());
@@ -623,7 +625,7 @@ int MXExecutorReshape(int partial_shaping,
     *aux_states = &(ret->ret_handles[nd_idx]);
     nd_idx = ret->ret_handles.size();
   }
-  API_END_HANDLE_ERROR(delete out);
+  API_END_HANDLE_ERROR(delete new_exec);
 }
 
 int MXExecutorGetOptimizedSymbol(ExecutorHandle handle,

From 73c72d1ff5ae6db188d0b0805da19c9153a73145 Mon Sep 17 00:00:00 2001
From: Lanking <lanking520@live.com>
Date: Fri, 14 Dec 2018 16:24:48 -0800
Subject: [PATCH 77/93] [MXNET-1251] Basic configuration to do static-linking
 (#13621)

* Basic configuration to do static-linking

* update build script and place it in the install part

* clean up the code further

* revert maven into build-from-source

* add curl to deps
---
 ci/docker/install/ubuntu_publish.sh | 48 +++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 ci/docker/install/ubuntu_publish.sh

diff --git a/ci/docker/install/ubuntu_publish.sh b/ci/docker/install/ubuntu_publish.sh
new file mode 100644
index 000000000000..bc3513dd13e5
--- /dev/null
+++ b/ci/docker/install/ubuntu_publish.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Build on Ubuntu 14.04 LTS for LINUX CPU/GPU
+set -e  # abort on the first failing step instead of leaving a half-provisioned image
+apt-get update
+apt-get install -y software-properties-common
+add-apt-repository ppa:ubuntu-toolchain-r/test -y
+add-apt-repository ppa:openjdk-r/ppa -y # Java lib
+apt-get update
+apt-get install -y git \
+    cmake3 \
+    libcurl4-openssl-dev \
+    unzip \
+    gcc-4.8 \
+    g++-4.8 \
+    gfortran \
+    gfortran-4.8 \
+    binutils \
+    nasm \
+    libtool \
+    curl \
+    pandoc \
+    python3-pip \
+    automake \
+    pkg-config \
+    openjdk-8-jdk
+curl -fL -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
+mkdir -p /usr/local/maven
+tar xzf apache-maven-3.3.9-bin.tar.gz -C /usr/local/maven  # unpacks to /usr/local/maven/apache-maven-3.3.9
+update-alternatives --install /usr/bin/mvn mvn /usr/local/maven/apache-maven-3.3.9/bin/mvn 1
+update-ca-certificates -f

From 0072d82cfb1224c5c454d5c83a37dfdb81adf841 Mon Sep 17 00:00:00 2001
From: Piyush Ghai <ghai.8@osu.edu>
Date: Fri, 14 Dec 2018 16:46:27 -0800
Subject: [PATCH 78/93] [MXNET-1195] Cleanup Scala README file (#13582)

* Updated the Scala-Readme with upto-date information

* Updated the header

* Removed redundant build status

* Minor formatting changes

* Addressed the PR feedback

* Added section on Scala training APIs

* Removed mention of deprecated Model API
---
 scala-package/README.md | 254 +++++++++++++++++++---------------------
 1 file changed, 119 insertions(+), 135 deletions(-)

diff --git a/scala-package/README.md b/scala-package/README.md
index 20fbee2469b0..06ce6930fa46 100644
--- a/scala-package/README.md
+++ b/scala-package/README.md
@@ -1,67 +1,136 @@
-<img src=https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/logo-m/mxnet2.png width=135/> Deep Learning for Scala/Java
+MXNet Package for Scala/Java
 =====
 
-[![Build Status](http://jenkins.mxnet-ci.amazon-ml.com/job/incubator-mxnet/job/master/badge/icon)](http://jenkins.mxnet-ci.amazon-ml.com/job/incubator-mxnet/job/master/)
-[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
-
-Here you find the MXNet Scala Package!
-It brings flexible and efficient GPU/CPU computing and state-of-art deep learning to JVM.
+The MXNet Scala/Java Package brings flexible and efficient GPU/CPU computing and state-of-art deep learning to JVM.
 
 - It enables you to write seamless tensor/matrix computation with multiple GPUs
   in Scala, Java and other languages built on JVM.
 - It also enables you to construct and customize the state-of-art deep learning models in JVM languages,
   and apply them to tasks such as image classification and data science challenges.
+- The Scala/Java Inference APIs provide an easy, out-of-the-box solution for loading pre-trained MXNet models and running inference on them.
   
-Install
+Pre-Built Maven Packages
 ------------
- 
-Technically, all you need is the `mxnet-full_2.11-{arch}-{xpu}-{version}.jar` in your classpath.
-It will automatically extract the native library to a tempfile and load it.
-You can find the pre-built jar file in [here](https://search.maven.org/search?q=g:org.apache.mxnet)
- and also our nightly build package [here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~)
 
-Currently we provide `linux-x86_64-gpu`, `linux-x86_64-cpu` and `osx-x86_64-cpu`. Support for Windows will come soon.
-Use the following dependency in maven, change the artifactId according to your own architecture, e.g., `mxnet-full_2.11-osx-x86_64-cpu` for OSX (and cpu-only).
+### Stable ###
+
+The MXNet Scala/Java packages can be easily included in your Maven managed project.
+The stable jar files for the packages are available on the [MXNet Maven Package Repository](https://search.maven.org/search?q=g:org.apache.mxnet).
+Currently we provide packages for Linux (Ubuntu 16.04) (CPU and GPU) and macOS (CPU only). Stable packages for Windows and CentOS will come soon. For now, if you have a CentOS machine, follow the ```Build From Source``` section below. 
+
+To add MXNet Scala/Java package to your project, add the dependency as shown below corresponding to your platform, under the ```dependencies``` tag in your project's ```pom.xml``` :
+
+**Linux GPU**
+
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu"><img src="https://img.shields.io/badge/org.apache.mxnet-linux gpu-green.svg" alt="maven badge"/></a>
+
+```HTML
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
+  <version>[1.3.1,)</version>
+</dependency>
+```
+
+**Linux CPU**
+
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu"><img src="https://img.shields.io/badge/org.apache.mxnet-linux cpu-green.svg" alt="maven badge"/></a>
+
+```HTML
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
+  <version>[1.3.1,)</version>
+</dependency>
+```
+
+**macOS CPU**
+
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu"><img src="https://img.shields.io/badge/org.apache.mxnet-macOS cpu-green.svg" alt="maven badge"/></a>
+
+```HTML
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
+  <version>[1.3.1,)</version>
+</dependency>
+```
+
+**Note:** ```<version>[1.3.1,)</version>``` indicates that we will fetch packages with version 1.3.1 or higher. This will always ensure that the pom.xml is able to fetch the latest and greatest jar files from Maven.
+
+### Nightly ###
+
+Apart from these, the nightly builds representing the bleeding-edge development of the Scala/Java packages are also available on the [MXNet Maven Nexus Package Repository](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~~~).
+Currently we provide nightly packages for Linux (CPU and GPU) and MacOS (CPU only). The Linux nightly jar files also work on CentOS. Nightly packages for Windows will come soon.
+
+Add the following ```repository``` to your project's ```pom.xml``` file : 
+
+````html
+<repositories>
+    <repository>
+      <id>Apache Snapshot</id>
+      <url>https://repository.apache.org/content/groups/snapshots</url>
+    </repository>
+</repositories>
+````
+
+Also, add the dependency which corresponds to your platform to the ```dependencies``` tag :
+
+**Linux GPU**
+
+<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~mxnet-full_2.11-linux-x86_64-gpu~~~"><img src="https://img.shields.io/badge/org.apache.mxnet-linux gpu-green.svg" alt="maven badge"/></a>
+
+```HTML
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
+  <version>[1.5.0,)</version>
+</dependency>
+```
+
+**Linux CPU**
+
+<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~mxnet-full_2.11-linux-x86_64-cpu~~~"><img src="https://img.shields.io/badge/org.apache.mxnet-linux cpu-green.svg" alt="maven badge"/></a>
 
 ```HTML
 <dependency>
   <groupId>org.apache.mxnet</groupId>
-  <artifactId>mxnet-full_2.10-linux-x86_64-gpu</artifactId>
-  <version>0.1.1</version>
+  <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
+  <version>[1.5.0,)</version>
 </dependency>
 ```
 
-You can also use `mxnet-core_2.10-0.1.1.jar` and put the compiled native library somewhere in your load path.
+**macOS CPU**
 
+<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~mxnet-full_2.11-osx-x86_64-cpu~~~"><img src="https://img.shields.io/badge/org.apache.mxnet-macOS cpu-green.svg" alt="maven badge"/></a>
 ```HTML
 <dependency>
   <groupId>org.apache.mxnet</groupId>
-  <artifactId>mxnet-core_2.10</artifactId>
-  <version>0.1.1</version>
+  <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
+  <version>[1.5.0,)</version>
 </dependency>
 ```
 
-If you have some native libraries conflict with the ones in the provided 'full' jar (e.g., you use openblas instead of atlas), this is a recommended way.
-Refer to the next section for how to build it from the very source.
+**Note:** ```<version>[1.5.0,)</version>``` indicates that we will fetch packages with version 1.5.0 or higher. This will always ensure that the pom.xml is able to fetch the latest and greatest jar files from Maven Snapshot repository.
 
-Build
+Build From Source
 ------------
 
-Checkout the [Installation Guide](http://mxnet.incubator.apache.org/install/index.html) contains instructions to install mxnet.
-Then you can compile the Scala Package by
+Check out the [Installation Guide](http://mxnet.incubator.apache.org/install/index.html), which contains instructions to install the MXNet package and build it from source.
+If you have built MXNet from source and are looking to set up Scala from that point, you may simply run the following from the MXNet source root:
 
 ```bash
 make scalapkg
 ```
 
-(Optional) run unit/integration tests by
+You can also run the unit tests and integration tests on the Scala Package by :
 
 ```bash
 make scalaunittest
 make scalaintegrationtest
 ```
 
-Or run a subset of unit tests by, e.g.,
+Or run a subset of unit tests, e.g.,
 
 ```bash
 make SCALA_TEST_ARGS=-Dsuites=org.apache.mxnet.NDArraySuite scalaunittest
@@ -70,123 +139,38 @@ make SCALA_TEST_ARGS=-Dsuites=org.apache.mxnet.NDArraySuite scalaunittest
 If everything goes well, you will find jars for `assembly`, `core` and `example` modules.
 Also it produces the native library in `native/{your-architecture}/target`, which you can use to cooperate with the `core` module.
 
-Once you've downloaded and unpacked MNIST dataset to `./data/`, run the training example by
-
-```bash
-java -Xmx4G -cp \
-  scala-package/assembly/{your-architecture}/target/*:scala-package/examples/target/*:scala-package/examples/target/classes/lib/* \
-  org.apache.mxnet.examples.imclassification.TrainMnist \
-  --data-dir=./data/ \
-  --num-epochs=10 \
-  --network=mlp \
-  --cpus=0,1,2,3
-```
+Examples & Usage
+-------
+- To set up the Scala Project using IntelliJ IDE on macOS follow the instructions [here](https://mxnet.incubator.apache.org/tutorials/scala/mxnet_scala_on_intellij.html).
+- Several examples on using the Scala APIs are provided in the [Scala Examples Folder](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/)
 
-If you've compiled with `USE_DIST_KVSTORE` enabled, the python tools in `mxnet/tracker` can be used to launch distributed training.
-The following command runs the above example using 2 worker nodes (and 2 server nodes) in local. Refer to [Distributed Training](http://mxnet.incubator.apache.org/how_to/multi_devices.html) for more details.
+Scala Training APIs
+-------
+- Module API :
+[The Module API](https://mxnet.incubator.apache.org/api/scala/module.html) provides an intermediate and high-level interface for performing computation with neural networks in MXNet. Modules provide high-level APIs for training, predicting, and evaluating.
 
-```bash
-tracker/dmlc_local.py -n 2 -s 2 \
-  java -Xmx4G -cp \
-  scala-package/assembly/{your-architecture}/target/*:scala-package/examples/target/*:scala-package/examples/target/classes/lib/* \
-  org.apache.mxnet.examples.imclassification.TrainMnist \
-  --data-dir=./data/ \
-  --num-epochs=10 \
-  --network=mlp \
-  --cpus=0 \
-  --kv-store=dist_sync
-```
+- KVStore API : 
+To run training over multiple GPUs and multiple hosts, one can use the [KVStore API](https://mxnet.incubator.apache.org/api/scala/kvstore.html).
 
-Change the arguments and have fun!
+- IO/Data Loading : 
+MXNet Scala provides APIs for preparing data to feed as an input to models. Check out [Data Loading API](https://mxnet.incubator.apache.org/api/scala/io.html) for more info.
+ 
+Other available Scala APIs for training can be found [here](https://mxnet.incubator.apache.org/api/scala/index.html).  
+ 
 
-Usage
+Scala Inference APIs
 -------
-Here is a Scala example of what training a simple 3-layer multilayer perceptron on MNIST looks like. You can download the MNIST dataset using [get_mnist_data script](https://github.com/apache/incubator-mxnet/blob/master/scala-package/core/scripts/get_mnist_data.sh).
-
-```scala
-import org.apache.mxnet._
-import org.apache.mxnet.optimizer.SGD
-
-// model definition
-val data = Symbol.Variable("data")
-val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 128))
-val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> fc1, "act_type" -> "relu"))
-val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> act1, "num_hidden" -> 64))
-val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> fc2, "act_type" -> "relu"))
-val fc3 = Symbol.FullyConnected(name = "fc3")()(Map("data" -> act2, "num_hidden" -> 10))
-val mlp = Symbol.SoftmaxOutput(name = "sm")()(Map("data" -> fc3))
-
-// load MNIST dataset
-val trainDataIter = IO.MNISTIter(Map(
-  "image" -> "data/train-images-idx3-ubyte",
-  "label" -> "data/train-labels-idx1-ubyte",
-  "data_shape" -> "(1, 28, 28)",
-  "label_name" -> "sm_label",
-  "batch_size" -> "50",
-  "shuffle" -> "1",
-  "flat" -> "0",
-  "silent" -> "0",
-  "seed" -> "10"))
-
-val valDataIter = IO.MNISTIter(Map(
-  "image" -> "data/t10k-images-idx3-ubyte",
-  "label" -> "data/t10k-labels-idx1-ubyte",
-  "data_shape" -> "(1, 28, 28)",
-  "label_name" -> "sm_label",
-  "batch_size" -> "50",
-  "shuffle" -> "1",
-  "flat" -> "0", "silent" -> "0"))
-
-// setup model and fit the training data
-val model = FeedForward.newBuilder(mlp)
-      .setContext(Context.cpu())
-      .setNumEpoch(10)
-      .setOptimizer(new SGD(learningRate = 0.1f, momentum = 0.9f, wd = 0.0001f))
-      .setTrainData(trainDataIter)
-      .setEvalData(valDataIter)
-      .build()
-```
+The [Scala Inference APIs](https://mxnet.incubator.apache.org/api/scala/infer.html) provide an easy, out of the box solution to load a pre-trained MXNet model and run inference on it. The Inference APIs are present in the [Infer Package](https://github.com/apache/incubator-mxnet/tree/master/scala-package/infer) under the MXNet Scala Package repository, while the documentation for the Infer API is available [here](https://mxnet.incubator.apache.org/api/scala/docs/index.html#org.apache.mxnet.infer.package).  
 
-Predict using the model in the following way:
-
-```scala
-val probArrays = model.predict(valDataIter)
-// in this case, we do not have multiple outputs
-require(probArrays.length == 1)
-val prob = probArrays(0)
-
-// get real labels
-import scala.collection.mutable.ListBuffer
-valDataIter.reset()
-val labels = ListBuffer.empty[NDArray]
-while (valDataIter.hasNext) {
-  val evalData = valDataIter.next()
-  labels += evalData.label(0).copy()
-}
-val y = NDArray.concatenate(labels)
-
-// get predicted labels
-val py = NDArray.argmax_channel(prob)
-require(y.shape == py.shape)
-
-// calculate accuracy
-var numCorrect = 0
-var numInst = 0
-for ((labelElem, predElem) <- y.toArray zip py.toArray) {
-  if (labelElem == predElem) {
-    numCorrect += 1
-  }
-  numInst += 1
-}
-val acc = numCorrect.toFloat / numInst
-println(s"Final accuracy = $acc")
-```
+Java Inference APIs
+-------
+The [Java Inference APIs](http://mxnet.incubator.apache.org/api/java/index.html) also provide an easy, out of the box solution to load a pre-trained MXNet model and run inference on it. The Inference APIs are present in the [Infer Package](https://github.com/apache/incubator-mxnet/tree/master/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi) under the MXNet Scala Package repository, while the documentation for the Infer API is available [here](https://mxnet.incubator.apache.org/api/java/docs/index.html#org.apache.mxnet.infer.package).
+More APIs will be added to the Java Inference APIs soon.
 
-Release
+JVM Memory Management
 -------
-- Version 0.1.1, March 24, 2016.
-  - Bug fix for MAE & MSE metrics.
-- Version 0.1.0, March 22, 2016.
+The Scala/Java APIs also provide an automated resource management system, thus making it easy to manage the native memory footprint without any degradation in performance.
+More details about JVM Memory Management are available [here](https://github.com/apache/incubator-mxnet/blob/master/scala-package/memory-management.md).
 
 License
 -------

From 47e2c48cdc1e594ab699f6e4c4b732db1ff54235 Mon Sep 17 00:00:00 2001
From: Sheng Zha <szha@users.noreply.github.com>
Date: Fri, 14 Dec 2018 18:17:41 -0800
Subject: [PATCH 79/93] scripts for building libmxnet binary and wheel (#13648)

* add script for making all dependencies

* tools for building pip package

* build scripts for lib and wheel
---
 tools/build/build_lib.sh                      |  80 +++++++
 .../MANIFEST.in => build/build_wheel.sh}      |  17 +-
 .../dependencies/make_shared_dependencies.sh  |  40 ++++
 tools/dependencies/opencv.sh                  |   3 +-
 tools/dependencies/patch/opencv_lapack.h      |  23 +++
 tools/dependencies/protobuf.sh                |   2 +
 tools/pip/MANIFEST.in                         |  11 +
 tools/pip/sanity_test.py                      |  32 +++
 tools/pip/setup.py                            | 195 ++++++++++++++++++
 tools/pip_package/README.md                   |   9 -
 tools/pip_package/make_pip_package.sh         | 179 ----------------
 tools/pip_package/setup.py                    |  60 ------
 12 files changed, 399 insertions(+), 252 deletions(-)
 create mode 100755 tools/build/build_lib.sh
 rename tools/{pip_package/MANIFEST.in => build/build_wheel.sh} (69%)
 mode change 100644 => 100755
 create mode 100755 tools/dependencies/make_shared_dependencies.sh
 create mode 100644 tools/dependencies/patch/opencv_lapack.h
 create mode 100644 tools/pip/MANIFEST.in
 create mode 100644 tools/pip/sanity_test.py
 create mode 100644 tools/pip/setup.py
 delete mode 100644 tools/pip_package/README.md
 delete mode 100755 tools/pip_package/make_pip_package.sh
 delete mode 100644 tools/pip_package/setup.py

diff --git a/tools/build/build_lib.sh b/tools/build/build_lib.sh
new file mode 100755
index 000000000000..032fcb92045f
--- /dev/null
+++ b/tools/build/build_lib.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script builds the libraries of mxnet.
+# It reads PLATFORM, VARIANT and DEPS_PATH from the environment.
+make_config=config/pip_${PLATFORM}_${VARIANT}.mk
+if [[ ! -f $make_config ]]; then
+    >&2 echo "Couldn't find make config $make_config for the current settings."
+    exit 1
+fi
+
+git clone --recursive https://github.com/apache/incubator-mxnet mxnet-build
+
+>&2 echo "Now building mxnet modules..."
+# Stop here if the checkout is unusable, so the builds below never run in the caller's directory.
+cp "$make_config" mxnet-build/config.mk || exit 1
+
+cd mxnet-build || exit 1
+
+make DEPS_PATH="$DEPS_PATH" DMLCCORE || exit 1
+make DEPS_PATH="$DEPS_PATH" "$PWD/3rdparty/tvm/nnvm/lib/libnnvm.a" || exit 1
+make DEPS_PATH="$DEPS_PATH" PSLITE || exit 1
+
+if [[ $VARIANT == *mkl ]]; then
+    MKLDNN_LICENSE='license.txt'
+    if [[ $PLATFORM == 'linux' ]]; then
+        IOMP_LIBFILE='libiomp5.so'
+        MKLML_LIBFILE='libmklml_intel.so'
+        MKLDNN_LIBFILE='libmkldnn.so.0'
+    else
+        IOMP_LIBFILE='libiomp5.dylib'
+        MKLML_LIBFILE='libmklml.dylib'
+        MKLDNN_LIBFILE='libmkldnn.0.dylib'
+    fi
+    make DEPS_PATH="$DEPS_PATH" mkldnn || exit 1
+    >&2 echo "Copying MKL license."
+    cp 3rdparty/mkldnn/LICENSE ./MKLML_LICENSE
+    # Only one platform's files exist, so -f keeps the other platform's patterns from raising errors.
+    rm -f lib/libmkldnn.{so,dylib}
+    rm -f lib/libmkldnn.0.*.dylib
+    rm -f lib/libmkldnn.so.0.*
+fi
+
+>&2 echo "Now building mxnet..."
+make DEPS_PATH="$DEPS_PATH" || exit 1
+
+if [[ $PLATFORM == 'linux' ]]; then
+    cp -L /usr/lib/gcc/x86_64-linux-gnu/4.8/libgfortran.so lib/libgfortran.so.3
+    cp -L /usr/lib/x86_64-linux-gnu/libquadmath.so.0 lib/libquadmath.so.0
+fi
+
+# Print the linked objects on libmxnet.so, then strip it with whichever toolchain is present
+>&2 echo "Checking linked objects on libmxnet.so..."
+if command -v readelf >/dev/null; then
+    readelf -d lib/libmxnet.so
+    strip --strip-unneeded lib/libmxnet.so
+elif command -v otool >/dev/null; then
+    otool -L lib/libmxnet.so
+    strip -u -r -x lib/libmxnet.so
+else
+    >&2 echo "Not available"
+fi
+
+cd ../
diff --git a/tools/pip_package/MANIFEST.in b/tools/build/build_wheel.sh
old mode 100644
new mode 100755
similarity index 69%
rename from tools/pip_package/MANIFEST.in
rename to tools/build/build_wheel.sh
index 5c6a72377e9f..a79634117c21
--- a/tools/pip_package/MANIFEST.in
+++ b/tools/build/build_wheel.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -15,6 +17,15 @@
 # specific language governing permissions and limitations
 # under the License.
 
-include README
-recursive-include * *.py
-recursive-include * *.so
+# This script builds the wheel for binary distribution and performs sanity check.
+
+# Record the commit being packaged; the subshell leaves the caller's working directory untouched.
+(cd mxnet-build && git rev-parse HEAD >> python/mxnet/COMMIT_HASH)
+
+# Make wheel for testing; stop on failure so a stale wheel in dist/ is never installed instead
+python setup.py bdist_wheel || exit 1
+
+# Install the most recently modified wheel in dist/ and run the sanity check against it
+wheel_name=$(ls -t dist | head -n 1)
+pip install -U --user --force-reinstall "dist/$wheel_name"
+python sanity_test.py
diff --git a/tools/dependencies/make_shared_dependencies.sh b/tools/dependencies/make_shared_dependencies.sh
new file mode 100755
index 000000000000..d678fddcc02d
--- /dev/null
+++ b/tools/dependencies/make_shared_dependencies.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This is a convenience script for calling the build scripts of all dependency libraries.
+# Environment variables (PLATFORM here; the sourced scripts read others such as DEPS_PATH) should be set beforehand.
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"  # absolute directory of this script
+
+# Each build script is sourced into the current shell; openblas is skipped when PLATFORM is darwin.
+if [[ ! $PLATFORM == 'darwin' ]]; then
+    source $DIR/openblas.sh
+fi
+source $DIR/libz.sh
+source $DIR/libturbojpeg.sh
+source $DIR/libpng.sh
+source $DIR/libtiff.sh
+source $DIR/openssl.sh
+source $DIR/curl.sh
+source $DIR/eigen.sh
+source $DIR/opencv.sh
+source $DIR/protobuf.sh
+source $DIR/cityhash.sh
+source $DIR/zmq.sh
+source $DIR/lz4.sh
diff --git a/tools/dependencies/opencv.sh b/tools/dependencies/opencv.sh
index 98ff115f1765..99d0ecb71c36 100755
--- a/tools/dependencies/opencv.sh
+++ b/tools/dependencies/opencv.sh
@@ -20,6 +20,7 @@
 # This script builds the static library of opencv that can be used as dependency of mxnet.
 # It expects openblas, libjpeg, libpng, libtiff, eigen, etc., to be in $DEPS_PATH.
 OPENCV_VERSION=3.4.2
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 if [[ $PLATFORM == 'linux' ]]; then
     OPENCV_LAPACK_OPTIONS=" \
           -D OpenBLAS_HOME=$DEPS_PATH \
@@ -181,7 +182,7 @@ if [[ ! -f $DEPS_PATH/lib/libopencv_core.a ]] || [[ ! -f $DEPS_PATH/lib/libopenc
           -D CMAKE_BUILD_TYPE=RELEASE \
           -D CMAKE_INSTALL_PREFIX=$DEPS_PATH ..
     if [[ $PLATFORM == 'linux' ]]; then
-        cp $DEPS_PATH/../patch/opencv_lapack.h ./
+        cp $DIR/patch/opencv_lapack.h ./
     fi
     make
     make install
diff --git a/tools/dependencies/patch/opencv_lapack.h b/tools/dependencies/patch/opencv_lapack.h
new file mode 100644
index 000000000000..97af9d67ea31
--- /dev/null
+++ b/tools/dependencies/patch/opencv_lapack.h
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+extern "C" {  // the BLAS/LAPACKE headers below are included with C linkage
+#include "cblas.h"
+#include "lapacke.h"
+}
diff --git a/tools/dependencies/protobuf.sh b/tools/dependencies/protobuf.sh
index dfa3d71f3750..1564701042af 100755
--- a/tools/dependencies/protobuf.sh
+++ b/tools/dependencies/protobuf.sh
@@ -39,3 +39,5 @@ if [[ ! -e $LIBPROTOBUF ]] || [[ ! -e $LIBPROTOC ]]; then
     make install
     cd -
 fi
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$(dirname $(find $DEPS_PATH -type f -name 'libprotoc*' | grep protobuf | head -n 1)):$DEPS_PATH/lib
diff --git a/tools/pip/MANIFEST.in b/tools/pip/MANIFEST.in
new file mode 100644
index 000000000000..5e072064193c
--- /dev/null
+++ b/tools/pip/MANIFEST.in
@@ -0,0 +1,11 @@
+include README
+include mxnet/COMMIT_HASH
+recursive-include mxnet/tools *
+recursive-include mxnet *.py
+recursive-include mxnet *.so
+recursive-include mxnet *.so.*
+recursive-include mxnet *.dylib
+recursive-include mxnet *_LICENSE
+recursive-include mxnet *.h
+recursive-include mxnet *.cuh
+recursive-include dmlc_tracker *.py
diff --git a/tools/pip/sanity_test.py b/tools/pip/sanity_test.py
new file mode 100644
index 000000000000..dc51e479906b
--- /dev/null
+++ b/tools/pip/sanity_test.py
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+"""Sanity test: import mxnet and decode a small base64-embedded image; exit with status 1 on any failure."""
+from __future__ import print_function
+import sys
+from base64 import b64decode
+
+try:
+    import mxnet as mx
+    mx.img.imdecode(b64decode('iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==')).asnumpy()
+    print('Test succeeded')
+except:
+    import traceback
+    print('Test failed')
+    traceback.print_exc()
+    sys.exit(1)
diff --git a/tools/pip/setup.py b/tools/pip/setup.py
new file mode 100644
index 000000000000..d5db6d87fc1d
--- /dev/null
+++ b/tools/pip/setup.py
@@ -0,0 +1,195 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=invalid-name, exec-used
+"""Setup mxnet package for pip."""
+from __future__ import absolute_import
+from datetime import datetime
+import os
+import sys
+import shutil
+import platform
+
+if platform.system() == 'Linux':
+    sys.argv.append('--universal')
+    sys.argv.append('--plat-name=manylinux1_x86_64')
+
+from setuptools import setup, find_packages
+from setuptools.dist import Distribution
+
+# We cannot import `mxnet/libinfo.py` in setup.py directly, since mxnet/__init__.py
+# would be invoked, which introduces dependencies.
+CURRENT_DIR = os.path.dirname(__file__)
+libinfo_py = os.path.join(CURRENT_DIR, 'mxnet-build/python/mxnet/libinfo.py')
+libinfo = {'__file__': libinfo_py}
+exec(compile(open(libinfo_py, "rb").read(), libinfo_py, 'exec'), libinfo, libinfo)
+
+LIB_PATH = libinfo['find_lib_path']()
+__version__ = libinfo['__version__']
+if 'TRAVIS_TAG' not in os.environ or not os.environ['TRAVIS_TAG'].strip():
+    __version__ += 'b{0}'.format(datetime.today().strftime('%Y%m%d'))
+elif 'TRAVIS_TAG' in os.environ and os.environ['TRAVIS_TAG'].startswith('patch-'):
+    __version__ = os.environ['TRAVIS_TAG'].split('-')[1]
+
+class BinaryDistribution(Distribution):
+    def has_ext_modules(self):
+        return platform.system() == 'Darwin'
+
+
+DEPENDENCIES = [
+    'numpy<1.15.0,>=1.8.2',
+    'requests>=2.20.0',
+    'graphviz<0.9.0,>=0.8.1'
+]
+
+shutil.rmtree(os.path.join(CURRENT_DIR, 'mxnet'), ignore_errors=True)
+shutil.rmtree(os.path.join(CURRENT_DIR, 'dmlc_tracker'), ignore_errors=True)
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/python/mxnet'),
+                os.path.join(CURRENT_DIR, 'mxnet'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/dmlc-core/tracker/dmlc_tracker'),
+                os.path.join(CURRENT_DIR, 'dmlc_tracker'))
+shutil.copy(LIB_PATH[0], os.path.join(CURRENT_DIR, 'mxnet'))
+
+# copy tools to mxnet package
+shutil.rmtree(os.path.join(CURRENT_DIR, 'mxnet/tools'), ignore_errors=True)
+os.mkdir(os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/launch.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/im2rec.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/kill-mxnet.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/parse_log.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/diagnose.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/tools/caffe_converter'), os.path.join(CURRENT_DIR, 'mxnet/tools/caffe_converter'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/tools/bandwidth'), os.path.join(CURRENT_DIR, 'mxnet/tools/bandwidth'))
+
+# copy headers to mxnet package
+shutil.rmtree(os.path.join(CURRENT_DIR, 'mxnet/include'), ignore_errors=True)
+os.mkdir(os.path.join(CURRENT_DIR, 'mxnet/include'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/include/mxnet'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/mxnet'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/dlpack/include/dlpack'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/dlpack'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/dmlc-core/include/dmlc'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/dmlc'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/mshadow/mshadow'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/mshadow'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/tvm/nnvm/include/nnvm'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/nnvm'))
+
+package_name = 'mxnet'
+
+variant = os.environ['mxnet_variant'].upper()
+if variant != 'CPU':
+    package_name = 'mxnet_{0}'.format(variant.lower())
+
+with open('doc/PYPI_README.md') as readme_file:
+    long_description = readme_file.read()
+
+with open('doc/{0}_ADDITIONAL.md'.format(variant)) as variant_doc:
+    long_description = long_description + variant_doc.read()
+
+# pypi only supports rst, so use pandoc to convert
+import pypandoc
+if platform.system() == 'Darwin':
+    pypandoc.download_pandoc()
+long_description = pypandoc.convert_text(long_description, 'rst', 'md')
+short_description = 'MXNet is an ultra-scalable deep learning framework.'
+libraries = []
+if variant == 'CPU':
+    libraries.append('openblas')
+else:
+    if variant.startswith('CU92'):
+        libraries.append('CUDA-9.2')
+    elif variant.startswith('CU91'):
+        libraries.append('CUDA-9.1')
+    elif variant.startswith('CU90'):
+        libraries.append('CUDA-9.0')
+    elif variant.startswith('CU80'):
+        libraries.append('CUDA-8.0')
+    elif variant.startswith('CU75'):
+        libraries.append('CUDA-7.5')
+    if variant.endswith('MKL'):
+        libraries.append('MKLDNN')
+
+short_description += ' This version uses {0}.'.format(' and '.join(libraries))
+
+package_data = {'mxnet': [os.path.join('mxnet', os.path.basename(LIB_PATH[0]))],
+                'dmlc_tracker': []}
+if variant.endswith('MKL'):
+    if platform.system() == 'Darwin':
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmklml.dylib'), os.path.join(CURRENT_DIR, 'mxnet'))
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libiomp5.dylib'), os.path.join(CURRENT_DIR, 'mxnet'))
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmkldnn.0.dylib'), os.path.join(CURRENT_DIR, 'mxnet'))
+        package_data['mxnet'].append('mxnet/libmklml.dylib')
+        package_data['mxnet'].append('mxnet/libiomp5.dylib')
+        package_data['mxnet'].append('mxnet/libmkldnn.0.dylib')
+    else:
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmklml_intel.so'), os.path.join(CURRENT_DIR, 'mxnet'))
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libiomp5.so'), os.path.join(CURRENT_DIR, 'mxnet'))
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmkldnn.so.0'), os.path.join(CURRENT_DIR, 'mxnet'))
+        package_data['mxnet'].append('mxnet/libmklml_intel.so')
+        package_data['mxnet'].append('mxnet/libiomp5.so')
+        package_data['mxnet'].append('mxnet/libmkldnn.so.0')
+    shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), '../MKLML_LICENSE'), os.path.join(CURRENT_DIR, 'mxnet'))
+    package_data['mxnet'].append('mxnet/MKLML_LICENSE')
+if platform.system() == 'Linux':
+    shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libgfortran.so.3'), os.path.join(CURRENT_DIR, 'mxnet'))
+    package_data['mxnet'].append('mxnet/libgfortran.so.3')
+    shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libquadmath.so.0'), os.path.join(CURRENT_DIR, 'mxnet'))
+    package_data['mxnet'].append('mxnet/libquadmath.so.0')
+
+from mxnet.base import _generate_op_module_signature
+from mxnet.ndarray.register import _generate_ndarray_function_code
+from mxnet.symbol.register import _generate_symbol_function_code
+_generate_op_module_signature('mxnet', 'symbol', _generate_symbol_function_code)
+_generate_op_module_signature('mxnet', 'ndarray', _generate_ndarray_function_code)
+
+setup(name=package_name,
+      version=__version__,
+      long_description=long_description,
+      description=short_description,
+      zip_safe=False,
+      packages=find_packages(),
+      package_data=package_data,
+      include_package_data=True,
+      install_requires=DEPENDENCIES,
+      distclass=BinaryDistribution,
+      license='Apache 2.0',
+      classifiers=[ # https://pypi.org/pypi?%3Aaction=list_classifiers
+          'Development Status :: 5 - Production/Stable',
+          'Intended Audience :: Developers',
+          'Intended Audience :: Education',
+          'Intended Audience :: Science/Research',
+          'License :: OSI Approved :: Apache Software License',
+          'Programming Language :: C++',
+          'Programming Language :: Cython',
+          'Programming Language :: Other',  # R, Scala
+          'Programming Language :: Perl',
+          'Programming Language :: Python',
+          'Programming Language :: Python :: 2.7',
+          'Programming Language :: Python :: 3.4',
+          'Programming Language :: Python :: 3.5',
+          'Programming Language :: Python :: 3.6',
+          'Programming Language :: Python :: Implementation :: CPython',
+          'Topic :: Scientific/Engineering',
+          'Topic :: Scientific/Engineering :: Artificial Intelligence',
+          'Topic :: Scientific/Engineering :: Mathematics',
+          'Topic :: Software Development',
+          'Topic :: Software Development :: Libraries',
+          'Topic :: Software Development :: Libraries :: Python Modules',
+      ],
+      url='https://github.com/apache/incubator-mxnet')
diff --git a/tools/pip_package/README.md b/tools/pip_package/README.md
deleted file mode 100644
index f289c98b7155..000000000000
--- a/tools/pip_package/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-MXNet Python Package
-====================
-MXNet is a deep learning framework designed for both *efficiency* and *flexibility*.
-It allows you to mix the flavours of deep learning programs together to maximize the efficiency and your productivity.
-
-
-Installation
-------------
-To install, check [Build Instruction](http://mxnet.io/get_started/setup.html)
diff --git a/tools/pip_package/make_pip_package.sh b/tools/pip_package/make_pip_package.sh
deleted file mode 100755
index 46b4938b0785..000000000000
--- a/tools/pip_package/make_pip_package.sh
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-# Assuming the script is run at mxnet/tools/pip_package
-# This script builds from scratch the dependencies of mxnet into static
-# librareis and statically links them to produce a (mostly) standalone
-# libmxnet.so, then packages it into the python wheel.
-# It assumes the build environment to be a sandbox that doesn't have the .so
-# objects for the dependencies, i.e. zlib, openblas, libjpeg, libpng, libtiff
-# and opencv.
-
-# Install necessary build tools
-if [ -n "$(command -v apt-get)" ]; then
-    sudo apt-get update;
-    sudo apt-get install -y build-essential git python-pip zip pkg-config cmake
-elif [ -n "$(command -v yum)" ]; then
-    sudo yum install -y cmake
-    sudo yum groupinstall -y "Development Tools"
-    sudo yum install -y python27 python27-setuptools python27-tools python-pip
-else
-    echo "Need a package manager to install build tools, e.g. apt/yum"
-    exit 1
-fi
-sudo pip install -U pip setuptools wheel
-
-# Set up path as temporary working directory
-DEPS_PATH=$PWD/../../deps
-mkdir $DEPS_PATH
-
-# Dependencies can be updated here. Be sure to verify the download link before
-# changing. The dependencies are:
-ZLIB_VERSION=1.2.6
-OPENBLAS_VERSION=0.2.19
-JPEG_VERSION=8.4.0
-PNG_VERSION=1.5.10
-TIFF_VERSION=3.8.2
-OPENCV_VERSION=2.4.13
-
-# Setup path to dependencies
-export PKG_CONFIG_PATH=$DEPS_PATH/lib/pkgconfig:$DEPS_PATH/lib64/pkgconfig:$PKG_CONFIG_PATH
-export CPATH=$DEPS_PATH/include:$CPATH
-
-# Position Independent code must be turned on for statically linking .a
-export CC="gcc -fPIC"
-export CXX="g++ -fPIC"
-
-# Download and build zlib
-curl -L https://github.com/LuaDist/zlib/archive/$ZLIB_VERSION.zip -o $DEPS_PATH/zlib.zip
-unzip $DEPS_PATH/zlib.zip -d $DEPS_PATH
-mkdir $DEPS_PATH/zlib-$ZLIB_VERSION/build
-cd $DEPS_PATH/zlib-$ZLIB_VERSION/build
-cmake -D CMAKE_BUILD_TYPE=RELEASE \
-      -D CMAKE_INSTALL_PREFIX=$DEPS_PATH \
-      -D BUILD_SHARED_LIBS=OFF ..
-make -j$(nproc)
-make install
-cd -
-
-# download and build openblas
-curl -L https://github.com/xianyi/OpenBLAS/archive/v$OPENBLAS_VERSION.zip -o $DEPS_PATH/openblas.zip
-unzip $DEPS_PATH/openblas.zip -d $DEPS_PATH
-cd $DEPS_PATH/OpenBLAS-$OPENBLAS_VERSION
-make FC=gfortran -j $(($(nproc) + 1))
-make PREFIX=$DEPS_PATH install
-cd -
-ln -s $DEPS_PATH/lib/libopenblas_haswellp-r0.2.19.a $DEPS_PATH/lib/libcblas.a
-
-# download and build libjpeg
-curl -L https://github.com/LuaDist/libjpeg/archive/$JPEG_VERSION.zip -o $DEPS_PATH/libjpeg.zip
-unzip $DEPS_PATH/libjpeg.zip -d $DEPS_PATH
-cd $DEPS_PATH/libjpeg-$JPEG_VERSION
-./configure --disable-shared --prefix=$DEPS_PATH
-make -j$(nproc)
-make test
-make install
-cd -
-
-# download and build libpng
-curl -L https://github.com/LuaDist/libpng/archive/$PNG_VERSION.zip -o $DEPS_PATH/libpng.zip
-unzip $DEPS_PATH/libpng.zip -d $DEPS_PATH
-mkdir $DEPS_PATH/libpng-$PNG_VERSION/build
-cd $DEPS_PATH/libpng-$PNG_VERSION/build
-cmake -D CMAKE_BUILD_TYPE=RELEASE \
-      -D CMAKE_INSTALL_PREFIX=$DEPS_PATH \
-      -D PNG_CONFIGURE_LIBPNG=-fPIC \
-      -D BUILD_SHARED_LIBS=OFF ..
-make -j$(nproc)
-make install
-cd -
-
-# download and build libtiff
-curl -L https://github.com/LuaDist/libtiff/archive/$TIFF_VERSION.zip -o $DEPS_PATH/libtiff.zip
-unzip $DEPS_PATH/libtiff.zip -d $DEPS_PATH
-cd $DEPS_PATH/libtiff-$TIFF_VERSION
-./configure --disable-shared --prefix=$DEPS_PATH
-make -j$(nproc)
-make install
-cd -
-
-# download and build opencv since we need the static library
-curl -L https://github.com/Itseez/opencv/archive/$OPENCV_VERSION.zip -o $DEPS_PATH/opencv.zip
-unzip $DEPS_PATH/opencv.zip -d $DEPS_PATH
-mkdir $DEPS_PATH/opencv-$OPENCV_VERSION/build
-cd $DEPS_PATH/opencv-$OPENCV_VERSION/build
-cmake -D WITH_1394=OFF \
-      -D WITH_AVFOUNDATION=OFF \
-      -D WITH_CUDA=OFF \
-      -D WITH_VTK=OFF \
-      -D WITH_CUFFT=OFF \
-      -D WITH_CUBLAS=OFF \
-      -D WITH_NVCUVID=OFF \
-      -D WITH_EIGEN=ON \
-      -D WITH_VFW=OFF \
-      -D WITH_FFMPEG=OFF \
-      -D WITH_GSTREAMER=OFF \
-      -D WITH_GTK=OFF \
-      -D WITH_JASPER=OFF \
-      -D WITH_JPEG=ON \
-      -D WITH_PNG=ON \
-      -D WITH_QUICKTIME=OFF \
-      -D WITH_TBB=ON \
-      -D WITH_TIFF=OFF \
-      -D WITH_V4L=OFF \
-      -D WITH_LIBV4L=OFF \
-      -D WITH_DSHOW=OFF \
-      -D WITH_MSMF=OFF \
-      -D WITH_OPENCL=OFF \
-      -D WITH_OPENCLAMDFFT=OFF \
-      -D WITH_OPENCLAMDBLAS=OFF \
-      -D BUILD_SHARED_LIBS=OFF \
-      -D BUILD_opencv_apps=OFF \
-      -D BUILD_opencv_gpu=OFF \
-      -D BUILD_opencv_video=OFF \
-      -D BUILD_opencv_contrib=OFF \
-      -D BUILD_opencv_nonfree=OFF \
-      -D BUILD_opencv_flann=OFF \
-      -D BUILD_opencv_features2d=OFF \
-      -D BUILD_opencv_calib3d=OFF \
-      -D BUILD_opencv_objdetect=OFF \
-      -D BUILD_opencv_ml=OFF \
-      -D BUILD_opencv_photo=OFF \
-      -D BUILD_DOCS=OFF \
-      -D BUILD_PACKAGE=OFF \
-      -D CMAKE_BUILD_TYPE=RELEASE \
-      -D CMAKE_INSTALL_PREFIX=$DEPS_PATH ..
-make -j $(nproc)
-make install # user will always have access to home, so no sudo needed
-cd -
-
-# Although .so building is explicitly turned off for most libraries, sometimes
-# they still get created. So, remove them just to make sure they don't
-# interfere, or otherwise we might get libmxnet.so that is not self-contained.
-rm $DEPS_PATH/{lib,lib64}/*.{so,so.0}
-
-# Go to the parent path and build mxnet
-cd ../../
-cp make/pip_$(uname | tr '[:upper:]' '[:lower:]')_cpu.mk config.mk
-make -j $(nproc)
-
-# Generate wheel. The output is in the mxnet/tools/pip_package/dist path.
-cd tools/pip_package
-python setup.py bdist_wheel
diff --git a/tools/pip_package/setup.py b/tools/pip_package/setup.py
deleted file mode 100644
index e4bf48236bde..000000000000
--- a/tools/pip_package/setup.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name, exec-used
-"""Setup mxnet package."""
-from __future__ import absolute_import
-import os
-import shutil
-
-from setuptools import setup, find_packages
-from setuptools.dist import Distribution
-
-# We can not import `mxnet.info.py` in setup.py directly since mxnet/__init__.py
-# Will be invoked which introduces dependences
-CURRENT_DIR = os.path.dirname(__file__)
-libinfo_py = os.path.join(CURRENT_DIR, '../../python/mxnet/libinfo.py')
-libinfo = {'__file__': libinfo_py}
-exec(compile(open(libinfo_py, "rb").read(), libinfo_py, 'exec'), libinfo, libinfo)
-
-LIB_PATH = libinfo['find_lib_path']()
-__version__ = libinfo['__version__']
-
-class BinaryDistribution(Distribution):
-    def has_ext_modules(self):
-        return True
-
-
-DEPENDENCIES = [
-    'numpy',
-]
-
-shutil.rmtree(os.path.join(CURRENT_DIR, 'mxnet'), ignore_errors=True)
-shutil.copytree(os.path.join(CURRENT_DIR, '../../python/mxnet'),
-                os.path.join(CURRENT_DIR, 'mxnet'))
-shutil.copy(LIB_PATH[0], os.path.join(CURRENT_DIR, 'mxnet'))
-
-setup(name='mxnet',
-      version=__version__,
-      description=open(os.path.join(CURRENT_DIR, 'README.md')).read(),
-      zip_safe=False,
-      packages=find_packages(),
-      package_data={'mxnet': [os.path.join('mxnet', os.path.basename(LIB_PATH[0]))]},
-      include_package_data=True,
-      install_requires=DEPENDENCIES,
-      distclass=BinaryDistribution,
-      url='https://github.com/dmlc/mxnet')

From 0bbbaac6ea40dc216db1e9cf711b90062c722503 Mon Sep 17 00:00:00 2001
From: Amol Lele <19983848+leleamol@users.noreply.github.com>
Date: Fri, 14 Dec 2018 18:21:18 -0800
Subject: [PATCH 80/93] [MXNET-1083] Add the example to demonstrate the
 inference workflow using C++ API (#13294)

* [MXNET-1083] Add the example to demonstrate the inference workflow using C++ API

* [MXNET-1083] Add the example to demonstrate the inference workflow using C++ API

* Updated the code to address the review comments.

* Added the README file for the folder.

* Addressed the review comments

* Addressed the review comments to use argmax and default mean values.
---
 cpp-package/example/README.md                 |   5 +-
 cpp-package/example/inference/Makefile        |  40 ++
 cpp-package/example/inference/README.md       |  41 ++
 .../example/inference/inception_inference.cpp | 446 ++++++++++++++++++
 .../unit_test_inception_inference.sh          |  43 ++
 5 files changed, 573 insertions(+), 2 deletions(-)
 create mode 100644 cpp-package/example/inference/Makefile
 create mode 100644 cpp-package/example/inference/README.md
 create mode 100644 cpp-package/example/inference/inception_inference.cpp
 create mode 100755 cpp-package/example/inference/unit_test_inception_inference.sh

diff --git a/cpp-package/example/README.md b/cpp-package/example/README.md
index c7223e94c920..c2329330b6be 100644
--- a/cpp-package/example/README.md
+++ b/cpp-package/example/README.md
@@ -2,7 +2,8 @@
 
 ## Building C++ examples
 
-The examples are built while building the MXNet library and cpp-package from source . However, they can be built manually as follows
+The examples in this folder demonstrate the **training** workflow. Examples of the **inference** workflow can be found in the [inference](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference>) folder.
+The examples in this folder are built while building the MXNet library and cpp-package from source. However, they can also be built manually as follows:
 
 From cpp-package/examples directory
 
@@ -18,7 +19,7 @@ The examples that are built to be run on GPU may not work on the non-GPU machine
 The makefile will also download the necessary data files and store in a data folder. (The download will take couple of minutes, but will be done only once on a fresh installation.)
 
 
-## Examples
+## Examples demonstrating training workflow
 
 This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS. For example `export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/home/ubuntu/incubator-mxnet/lib` on ubuntu using gpu.
 
diff --git a/cpp-package/example/inference/Makefile b/cpp-package/example/inference/Makefile
new file mode 100644
index 000000000000..5efe6cfb68e5
--- /dev/null
+++ b/cpp-package/example/inference/Makefile
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+CPPEX_SRC = $(wildcard *.cpp)
+CPPEX_EXE = $(patsubst %.cpp, %, $(CPPEX_SRC))
+OPENCV_CFLAGS=`pkg-config --cflags opencv`
+OPENCV_LDFLAGS=`pkg-config --libs opencv`
+
+CXX=g++
+# OPENCV_CFLAGS is part of the compile flags so <opencv2/opencv.hpp> is found outside the default include paths.
+CFLAGS=$(COMMFLAGS) -I../../../3rdparty/tvm/nnvm/include -I../../../3rdparty/dmlc-core/include -I ../../include -I ../../../include $(OPENCV_CFLAGS) -Wall -O3 -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas
+CPPEX_EXTRA_LDFLAGS := -L../../../lib -lmxnet $(OPENCV_LDFLAGS)
+
+.PHONY: all debug clean
+all: $(CPPEX_EXE)
+
+# Target-specific variable: applies while building debug and its prerequisites.
+debug: CPPEX_CFLAGS += -DDEBUG -g
+debug: all
+
+$(CPPEX_EXE):% : %.cpp
+	$(CXX) -std=c++0x $(CFLAGS)  $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS)
+
+clean:
+	rm -f $(CPPEX_EXE)
diff --git a/cpp-package/example/inference/README.md b/cpp-package/example/inference/README.md
new file mode 100644
index 000000000000..79831b40b6bd
--- /dev/null
+++ b/cpp-package/example/inference/README.md
@@ -0,0 +1,41 @@
+# MXNet C++ Package Inference Workflow Examples
+
+## Building C++ Inference examples
+
+The examples in this folder demonstrate the **inference** workflow.
+To build the examples, use the following commands:
+
+-  Release: **make all**
+-  Debug: **make debug all**
+
+
+## Examples demonstrating inference workflow
+
+This directory contains the following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS-specific environment variable, viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS.
+
+### [inception_inference.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/inception_inference.cpp>)
+
+This example demonstrates an image classification workflow with pre-trained models using the MXNet C++ API. The command line parameters that the example accepts are shown below:
+
+```
+./inception_inference --help
+Usage:
+inception_inference --symbol <model symbol file in json format>
+                    --params <model params file>
+                    --image <path to the image used for prediction>
+                    --synset <file containing labels for prediction>
+                    [--input_shape <dimensions of input image e.g "3 224 224">]
+                    [--mean <file containing mean image for normalizing the input image>]
+                    [--gpu] Specify this option if workflow needs to be run in gpu context
+```
+The model json file, the params file and the synset file are required to run this example. A sample command line is as follows:
+
+```
+
+./inception_inference --symbol "./model/Inception-BN-symbol.json" --params "./model/Inception-BN-0126.params" --synset "./model/synset.txt" --mean "./model/mean_224.nd" --image "./model/dog.jpg"
+```
+Alternatively, the script [unit_test_inception_inference.sh](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/unit_test_inception_inference.sh>) downloads the pre-trained **Inception** model and a test image. Users can invoke this script as follows:
+
+```
+./unit_test_inception_inference.sh
+```
diff --git a/cpp-package/example/inference/inception_inference.cpp b/cpp-package/example/inference/inception_inference.cpp
new file mode 100644
index 000000000000..7005e745b2f4
--- /dev/null
+++ b/cpp-package/example/inference/inception_inference.cpp
@@ -0,0 +1,446 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * This example demonstrates image classification workflow with pre-trained models using MXNet C++ API.
+ * The example performs following tasks.
+ * 1. Load the pre-trained model.
+ * 2. Load the parameters of pre-trained model.
+ * 3. Load the image to be classified  in to NDArray.
+ * 4. Normalize the image using the mean of images that were used for training.
+ * 5. Run the forward pass and predict the input image.
+ */
+
+#include <sys/stat.h>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+#include <opencv2/opencv.hpp>
+
+using namespace mxnet::cpp;
+
+/*
+ * Default per-channel mean values that are subtracted from the input image when no
+ * mean image file is supplied. The values are never modified, hence they are const.
+ * NOTE(review): these look like the usual ImageNet channel means scaled to [0, 255] -
+ * confirm against the model that is being served.
+ */
+static const mx_float DEFAULT_MEAN_R = 123.675;
+static const mx_float DEFAULT_MEAN_G = 116.28;
+static const mx_float DEFAULT_MEAN_B = 103.53;
+/*
+ * class Predictor
+ *
+ * This class encapsulates the functionality to load the model, process input image and run the forward pass.
+ */
+
+class Predictor {
+ public:
+    // Creates an empty predictor: no model is loaded and no executor is bound.
+    Predictor() {}
+    /*
+     * Loads the model, its parameters, the label file and the mean image and binds an
+     * executor for the given input shape. The arguments are described next to the
+     * definition of the constructor.
+     */
+    Predictor(const std::string& model_json_file,
+              const std::string& model_params_file,
+              const Shape& input_shape,
+              bool gpu_context_type = false,
+              const std::string& synset_file = "",
+              const std::string& mean_image_file = "");
+    // The destructor deletes the raw executor pointer, so a copy of the predictor
+    // would delete the same executor twice. Copying is therefore disabled.
+    Predictor(const Predictor&) = delete;
+    Predictor& operator=(const Predictor&) = delete;
+    // Runs the forward pass on the image stored in image_file and logs the prediction.
+    void PredictImage(const std::string& image_file);
+    ~Predictor();
+
+ private:
+    void LoadModel(const std::string& model_json_file);
+    void LoadParameters(const std::string& model_parameters_file);
+    void LoadSynset(const std::string& synset_file);
+    NDArray LoadInputImage(const std::string& image_file);
+    void LoadMeanImageData();
+    void LoadDefaultMeanImageData();
+    // NOTE(review): declared but not defined anywhere in this file.
+    void NormalizeInput(const std::string& mean_image_file);
+    // Returns true when stat() succeeds for the given path.
+    inline bool FileExists(const std::string& name) {
+        struct stat buffer;
+        return (stat(name.c_str(), &buffer) == 0);
+    }
+    // NOTE(review): mean_img and std_dev_image_data are not referenced by any method
+    // defined in this file.
+    NDArray mean_img;
+    // Model arguments ("arg:" entries of the params file) plus the "data" input.
+    std::map<std::string, NDArray> args_map;
+    // Auxiliary states ("aux:" entries of the params file).
+    std::map<std::string, NDArray> aux_map;
+    // Labels read from the synset file, indexed by the class index.
+    std::vector<std::string> output_labels;
+    Symbol net;
+    // Initialised to nullptr so that the destructor is well defined for a
+    // default-constructed predictor, which never binds an executor.
+    Executor *executor = nullptr;
+    Shape input_shape;
+    // Mean image that is subtracted from the input before the forward pass.
+    NDArray mean_image_data;
+    NDArray std_dev_image_data;
+    Context global_ctx = Context::cpu();
+    std::string mean_image_file;
+};
+
+
+/*
+ * The constructor takes following parameters as input:
+ * 1. model_json_file:  The model in json formatted file.
+ * 2. model_params_file: File containing model parameters
+ * 3. input_shape: Shape of input data to the model. Since this class will be running one inference at a time,
+ *                 the input shape is required to be in format Shape(1, number_of_channels, height, width)
+ * 4. gpu_context_type: When true the model is bound in the gpu context, otherwise in the cpu context.
+ * 5. synset_file: File containing the list of image labels. When it is empty no labels are loaded
+ *                 and the prediction is reported as a class index.
+ * 6. mean_image_file: File containing the mean image used for normalizing the input. When it is empty
+ *                 the default mean values for the R, G and B channels are used.
+ * The input image will be resized to (height x width) size before running the inference.
+ * The constructor will:
+ *  1. Load the model and parameter files.
+ *  2. Load the synset file, if one is given.
+ *  3. Load the mean image data.
+ *  4. Invoke the SimpleBind to bind the input argument to the model and create an executor.
+ *
+ *  The SimpleBind is expected to be invoked only once.
+ *  A std::runtime_error is thrown by the loaders when a required file does not exist.
+ */
+Predictor::Predictor(const std::string& model_json_file,
+                     const std::string& model_params_file,
+                     const Shape& input_shape,
+                     bool gpu_context_type,
+                     const std::string& synset_file,
+                     const std::string& mean_image_file):
+                     input_shape(input_shape),
+                     mean_image_file(mean_image_file) {
+  if (gpu_context_type) {
+    global_ctx = Context::gpu();
+  }
+  // Load the model
+  LoadModel(model_json_file);
+
+  // Load the model parameters.
+  LoadParameters(model_params_file);
+
+  /*
+   * The data will be used to output the exact label that matches highest output of the model.
+   * The synset file is optional (it defaults to an empty string): without it the
+   * label list stays empty and PredictImage reports the index of the best class.
+   */
+  if (!synset_file.empty()) {
+    LoadSynset(synset_file);
+  } else {
+    LG << "Synset file is not provided. The prediction will be reported as a class index.";
+  }
+
+  /*
+   * Load the mean image data if specified.
+   */
+  if (!mean_image_file.empty()) {
+    LoadMeanImageData();
+  } else {
+    LG << "Mean image file for normalizing the input is not provided."
+       << " We will use the default mean values for R,G and B channels.";
+    LoadDefaultMeanImageData();
+  }
+
+  // Create an executor after binding the model to input parameters.
+  args_map["data"] = NDArray(input_shape, global_ctx, false);
+  executor = net.SimpleBind(global_ctx, args_map, std::map<std::string, NDArray>(),
+                              std::map<std::string, OpReqType>(), aux_map);
+}
+
+/*
+ * Reads the network definition stored in model_json_file into the net symbol.
+ * Throws std::runtime_error when the file does not exist.
+ */
+void Predictor::LoadModel(const std::string& model_json_file) {
+  const bool model_file_present = FileExists(model_json_file);
+  if (!model_file_present) {
+    LG << "Model file " << model_json_file << " does not exist";
+    throw std::runtime_error("Model file does not exist");
+  }
+
+  LG << "Loading the model from " << model_json_file << std::endl;
+  net = Symbol::Load(model_json_file);
+}
+
+
+/*
+ * Reads the parameter file and distributes its entries by key prefix:
+ * entries named "arg:<name>" go to args_map and entries named "aux:<name>" go to
+ * aux_map, both keyed by <name> and copied to the context used by the predictor.
+ * Throws std::runtime_error when the file does not exist.
+ */
+void Predictor::LoadParameters(const std::string& model_parameters_file) {
+  if (!FileExists(model_parameters_file)) {
+    LG << "Parameter file " << model_parameters_file << " does not exist";
+    throw std::runtime_error("Model parameters does not exist");
+  }
+  LG << "Loading the model parameters from " << model_parameters_file << std::endl;
+
+  std::map<std::string, NDArray> loaded_params;
+  NDArray::Load(model_parameters_file, 0, &loaded_params);
+
+  const std::string aux_prefix = "aux:";
+  const std::string arg_prefix = "arg:";
+  for (const auto &entry : loaded_params) {
+    const std::string &key = entry.first;
+    // A key can carry only one of the two prefixes, so the checks are exclusive.
+    if (key.compare(0, aux_prefix.size(), aux_prefix) == 0) {
+      const std::string name = key.substr(aux_prefix.size());
+      aux_map[name] = entry.second.Copy(global_ctx);
+    } else if (key.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+      const std::string name = key.substr(arg_prefix.size());
+      args_map[name] = entry.second.Copy(global_ctx);
+    }
+  }
+  /* Wait for the copies issued above to finish before the arrays are used. */
+  NDArray::WaitAll();
+}
+
+
+/*
+ * Reads the labels from the synset file into output_labels.
+ * For every line the first whitespace-delimited token is skipped and the rest of
+ * the line is stored as the label, so the position of a label in output_labels
+ * corresponds to the line it was read from.
+ * Throws std::runtime_error when the file does not exist or cannot be opened.
+ */
+void Predictor::LoadSynset(const std::string& synset_file) {
+  if (!FileExists(synset_file)) {
+    LG << "Synset file " << synset_file << " does not exist";
+    throw std::runtime_error("Synset file does not exist");
+  }
+  LG << "Loading the synset file.";
+
+  std::ifstream label_stream(synset_file.c_str());
+  if (!label_stream.is_open()) {
+    std::cerr << "Error opening synset file " << synset_file << std::endl;
+    throw std::runtime_error("Error in opening the synset file.");
+  }
+
+  std::string synset_id;
+  std::string description;
+  while (label_stream >> synset_id) {
+    // The remainder of the line (including its leading blank) is the label text.
+    std::getline(label_stream, description);
+    output_labels.push_back(description);
+  }
+  // The stream is closed by its destructor when it goes out of scope.
+}
+
+
+/*
+ * The following function loads the mean data from mean image file.
+ * This data will be used for normalizing the image before running the forward
+ * pass.
+ * The mean image is read from the array stored under the key "mean_img" and must
+ * contain as many elements as the input shape.
+ * Throws std::runtime_error when the file does not exist, when it has no "mean_img"
+ * entry or when the size of the entry does not match the input shape.
+ */
+void Predictor::LoadMeanImageData() {
+  // Same existence check as the other loaders, so that a wrong path is reported clearly.
+  if (!FileExists(mean_image_file)) {
+    LG << "Mean image file " << mean_image_file << " does not exist";
+    throw std::runtime_error("Mean image file does not exist");
+  }
+  LG << "Load the mean image data that will be used to normalize "
+     << "the image before running forward pass.";
+
+  // Look the entry up with find(): operator[] would silently insert an empty NDArray
+  // for a missing key and the copy below would then read from invalid memory.
+  std::map<std::string, NDArray> mean_file_contents = NDArray::LoadToMap(mean_image_file);
+  const auto mean_entry = mean_file_contents.find("mean_img");
+  if (mean_entry == mean_file_contents.end()) {
+    throw std::runtime_error("Mean image file does not contain the mean_img array");
+  }
+  // SyncCopyFromCPU reads input_shape.Size() elements from the raw pointer, so the
+  // source must hold exactly that many elements.
+  if (mean_entry->second.Size() != input_shape.Size()) {
+    throw std::runtime_error("Size of the mean image does not match the input shape");
+  }
+
+  mean_image_data = NDArray(input_shape, global_ctx, false);
+  mean_image_data.SyncCopyFromCPU(mean_entry->second.GetData(), input_shape.Size());
+  NDArray::WaitAll();
+}
+
+
+/*
+ * The following function loads the default mean values for
+ * R, G and B channels into NDArray that has the same shape as that of
+ * input image. Every element of channel c is set to the c-th default mean.
+ * Throws std::runtime_error when the input shape has more channels than there
+ * are default mean values.
+ */
+void Predictor::LoadDefaultMeanImageData() {
+  LG << "Loading the default mean image data";
+  const mx_float default_means[] = {DEFAULT_MEAN_R, DEFAULT_MEAN_G, DEFAULT_MEAN_B};
+  const int num_default_means = sizeof(default_means) / sizeof(default_means[0]);
+
+  /* The mean image has the same layout as the input: (channels, height, width). */
+  const int height = input_shape[2];
+  const int width = input_shape[3];
+  const int channels = input_shape[1];
+  // Guard the table lookup below: there is one default value per colour channel only.
+  if (channels > num_default_means) {
+    throw std::runtime_error("Default mean values are defined for at most 3 channels");
+  }
+
+  std::vector<float> array;
+  array.reserve(input_shape.Size());
+  for (int c = 0; c < channels; ++c) {
+    // One height x width plane filled with the mean of channel c.
+    array.insert(array.end(), static_cast<size_t>(height) * width, default_means[c]);
+  }
+  mean_image_data = NDArray(input_shape, global_ctx, false);
+  mean_image_data.SyncCopyFromCPU(array.data(), input_shape.Size());
+  NDArray::WaitAll();
+}
+
+
+/*
+ * The following function loads the input image into NDArray.
+ * The image is resized to the height and width given by the input shape and its
+ * interleaved pixel data is rearranged into (channel, row, column) order.
+ * Throws std::runtime_error when the file does not exist, when it cannot be decoded
+ * or when the decoded image has fewer channels than the input shape requires.
+ */
+NDArray Predictor::LoadInputImage(const std::string& image_file) {
+  if (!FileExists(image_file)) {
+    LG << "Image file " << image_file << " does not exist";
+    throw std::runtime_error("Image file does not exist");
+  }
+  LG << "Loading the image " << image_file << std::endl;
+  // cv::imread reports a failure by returning an empty matrix instead of throwing.
+  cv::Mat mat = cv::imread(image_file);
+  if (mat.empty()) {
+    LG << "Image file " << image_file << " could not be decoded";
+    throw std::runtime_error("Image file could not be decoded");
+  }
+  /* Resize the picture to the (height, width) expected by the model. */
+  const int height = input_shape[2];
+  const int width = input_shape[3];
+  const int channels = input_shape[1];
+  // cv::Size takes (width, height), in that order.
+  cv::resize(mat, mat, cv::Size(width, height));
+  const int image_channels = mat.channels();
+  if (channels > image_channels) {
+    throw std::runtime_error("Image has fewer channels than the model input");
+  }
+
+  // NOTE(review): cv::imread stores the channels in B, G, R order; confirm that this
+  // matches the channel order of the mean values and of the model.
+  std::vector<float> array;
+  array.reserve(input_shape.Size());
+  for (int c = 0; c < channels; ++c) {
+    for (int i = 0; i < height; ++i) {
+      // Row i of the image holds width pixels of image_channels interleaved values.
+      const unsigned char* row = mat.ptr<unsigned char>(i);
+      for (int j = 0; j < width; ++j) {
+        array.push_back(static_cast<float>(row[j * image_channels + c]));
+      }
+    }
+  }
+  NDArray image_data = NDArray(input_shape, global_ctx, false);
+  image_data.SyncCopyFromCPU(array.data(), input_shape.Size());
+  NDArray::WaitAll();
+  return image_data;
+}
+
+
+/*
+ * The following function runs the forward pass on the model.
+ * The executor is created in the constructor.
+ * The image is loaded, the mean image is subtracted from it and the result is copied
+ * to the "data" argument of the executor. After the forward pass the index of the
+ * highest output is logged together with its label, when a label is available for it.
+ */
+void Predictor::PredictImage(const std::string& image_file) {
+  // Load the input image
+  NDArray image_data = LoadInputImage(image_file);
+
+  // Normalize the image
+  image_data.Slice(0, 1) -= mean_image_data;
+
+  LG << "Running the forward pass on model to predict the image";
+  /*
+   * The executor->arg_arrays represent the arguments to the model.
+   *
+   * Copying the image_data that contains the NDArray of input image
+   * to the arg map of the executor. The input is stored with the key "data" in the map.
+   *
+   */
+  image_data.CopyTo(&(executor->arg_dict()["data"]));
+  NDArray::WaitAll();
+
+  // Run the forward pass.
+  executor->Forward(false);
+
+  // The output is available in executor->outputs.
+  auto array = executor->outputs[0].Copy(global_ctx);
+  NDArray::WaitAll();
+
+  /*
+   * Find out the maximum accuracy and the index associated with that accuracy.
+   * This is done by using the argmax operator on NDArray.
+   */
+  auto predicted = array.ArgmaxChannel();
+  NDArray::WaitAll();
+
+  int best_idx = predicted.At(0, 0);
+  float best_accuracy = array.At(0, best_idx);
+
+  // A label is available only if the synset was loaded and covers the predicted index;
+  // indexing output_labels without this check would read out of bounds otherwise.
+  const bool has_label =
+      best_idx >= 0 && static_cast<size_t>(best_idx) < output_labels.size();
+  if (!has_label) {
+    LG << "The model predicts the highest accuracy of " << best_accuracy << " at index "
+       << best_idx;
+  } else {
+    LG << "The model predicts the input image to be a [" << output_labels[best_idx]
+       << " ] with Accuracy = " << best_accuracy << std::endl;
+  }
+}
+
+
+/*
+ * Deletes the executor held by the predictor and then calls MXNotifyShutdown().
+ */
+Predictor::~Predictor() {
+  // delete on a null pointer is a no-op, so no explicit null check is needed.
+  delete executor;
+  MXNotifyShutdown();
+}
+
+
+/*
+ * Convert a string of whitespace separated integers, e.g. "3 224 224", into a vector
+ * holding these integers in the same order.
+ * Parsing stops at the first position where no number can be read (end of the string,
+ * trailing blanks or any other character), and the numbers read so far are returned.
+ * The caller is expected to check that the expected number of dimensions was found.
+ */
+std::vector<index_t> getShapeDimensions(const std::string& hidden_units_string) {
+    std::vector<index_t> dimensions;
+    const char *cursor = hidden_units_string.c_str();
+    while (*cursor) {
+        char *p_next = NULL;
+        const long num_unit = strtol(cursor, &p_next, 10);
+        // strtol leaves the end pointer at the start position when it could not convert
+        // anything. Without this check the loop would never advance and would keep
+        // appending zeros until the memory is exhausted.
+        if (p_next == cursor) {
+            break;
+        }
+        dimensions.push_back(static_cast<index_t>(num_unit));
+        cursor = p_next;
+    }
+    return dimensions;
+}
+
+/*
+ * Prints the command line options accepted by the example to the standard output,
+ * one option per line.
+ */
+void printUsage() {
+    static const char* const usage_lines[] = {
+        "Usage:",
+        "inception_inference --symbol <model symbol file in json format>  ",
+        "--params <model params file> ",
+        "--image <path to the image used for prediction> ",
+        "--synset <file containing labels for prediction> ",
+        "[--input_shape <dimensions of input image e.g \"3 224 224\">] ",
+        "[--mean <file containing mean image for normalizing the input image>] ",
+        "[--gpu  <Specify this option if workflow needs to be run in gpu context>]"
+    };
+    for (const char* line : usage_lines) {
+        std::cout << line << std::endl;
+    }
+}
+
+/*
+ * Parses the command line, creates the predictor and runs the inference on the image.
+ * Returns 0 when the inference has run (or when --help was requested) and 1 when the
+ * arguments are incomplete or invalid or when the inference has failed.
+ */
+int main(int argc, char** argv) {
+  std::string model_file_json;
+  std::string model_file_params;
+  std::string synset_file = "";
+  std::string mean_image = "";
+  std::string input_image = "";
+  bool gpu_context_type = false;
+
+  std::string input_shape = "3 224 224";
+  int index = 1;
+  while (index < argc) {
+    // Options that take a value consume the following argument; a missing value
+    // leaves the option empty (or, for --input_shape, at its default).
+    if (strcmp("--symbol", argv[index]) == 0) {
+      index++;
+      model_file_json = (index < argc ? argv[index]:"");
+    } else if (strcmp("--params", argv[index]) == 0) {
+      index++;
+      model_file_params = (index < argc ? argv[index]:"");
+    } else if (strcmp("--synset", argv[index]) == 0) {
+      index++;
+      synset_file = (index < argc ? argv[index]:"");
+    } else if (strcmp("--mean", argv[index]) == 0) {
+      index++;
+      mean_image = (index < argc ? argv[index]:"");
+    } else if (strcmp("--image", argv[index]) == 0) {
+      index++;
+      input_image = (index < argc ? argv[index]:"");
+    } else if (strcmp("--input_shape", argv[index]) == 0) {
+      index++;
+      input_shape = (index < argc ? argv[index]:input_shape);
+    } else if (strcmp("--gpu", argv[index]) == 0) {
+      gpu_context_type = true;
+    } else if (strcmp("--help", argv[index]) == 0) {
+      printUsage();
+      return 0;
+    }
+    index++;
+  }
+
+  if (model_file_json.empty() || model_file_params.empty() || synset_file.empty()) {
+    LG << "ERROR: Model details such as symbol, param and/or synset files are not specified";
+    printUsage();
+    return 1;
+  }
+
+  if (input_image.empty()) {
+    LG << "ERROR: Path to the input image is not specified.";
+    printUsage();
+    return 1;
+  }
+
+  std::vector<index_t> input_dimensions = getShapeDimensions(input_shape);
+
+  /*
+   * The predictor reads the channels, height and width from fixed positions of the
+   * shape, so exactly three non-zero dimensions are required here.
+   */
+  bool valid_shape = (input_dimensions.size() == 3);
+  for (size_t i = 0; valid_shape && i < input_dimensions.size(); ++i) {
+    valid_shape = (input_dimensions[i] != 0);
+  }
+  if (!valid_shape) {
+    LG << "ERROR: The input shape must consist of 3 non-zero dimensions, e.g \"3 224 224\"";
+    printUsage();
+    return 1;
+  }
+
+  /*
+   * Since we are running inference for 1 image, add 1 to the input_dimensions so that
+   * the shape of input data for the model will be
+   * {no. of images, channels, height, width}
+   */
+  input_dimensions.insert(input_dimensions.begin(), 1);
+
+  Shape input_data_shape(input_dimensions);
+
+  try {
+    // Initialize the predictor object
+    Predictor predict(model_file_json, model_file_params, input_data_shape, gpu_context_type,
+                      synset_file, mean_image);
+
+    // Run the forward pass to predict the image.
+    predict.PredictImage(input_image);
+  } catch (std::runtime_error &error) {
+    LG << "Execution failed with ERROR: " << error.what();
+    // Report the failure to the caller through the exit status as well.
+    return 1;
+  } catch (...) {
+    /*
+     * If underlying MXNet code has thrown an exception the error message is
+     * accessible through MXGetLastError() function.
+     */
+    LG << "Execution failed with following MXNet error";
+    LG << MXGetLastError();
+    return 1;
+  }
+  return 0;
+}
diff --git a/cpp-package/example/inference/unit_test_inception_inference.sh b/cpp-package/example/inference/unit_test_inception_inference.sh
new file mode 100755
index 000000000000..4f40b496bbd3
--- /dev/null
+++ b/cpp-package/example/inference/unit_test_inception_inference.sh
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Downloading the data and model
+mkdir -p model
+wget -nc http://data.dmlc.ml/mxnet/models/imagenet/inception-bn.tar.gz
+# The URL is quoted because it contains '?', which the shell treats as a glob character.
+wget -nc -O model/dog.jpg "https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/dog.jpg?raw=true"
+wget -nc -O model/mean_224.nd https://github.com/dmlc/web-data/raw/master/mxnet/example/feature_extract/mean_224.nd
+tar -xvzf inception-bn.tar.gz -C model
+
+# Building. There is nothing to test if the example cannot be built.
+make all || { echo "FAIL: unable to build inception_inference."; exit 1; }
+
+
+# Running the example with dog image.
+# Both stdout and stderr are written to the log file that is inspected below.
+# The library search path variable differs between macOS and the other platforms.
+if [ "$(uname)" = "Darwin" ]; then
+    DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:../../../lib ./inception_inference --symbol "./model/Inception-BN-symbol.json" --params "./model/Inception-BN-0126.params" --synset "./model/synset.txt" --mean "./model/mean_224.nd" --image "./model/dog.jpg" > inception_inference.log 2>&1
+else
+    LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../../../lib ./inception_inference --symbol "./model/Inception-BN-symbol.json" --params "./model/Inception-BN-0126.params" --synset "./model/synset.txt" --mean "./model/mean_224.nd" --image "./model/dog.jpg" > inception_inference.log 2>&1
+fi
+
+# The test passes when the expected label is found in the log.
+if grep -q "pug-dog" inception_inference.log; then
+    echo "PASS: inception_inference correctly identified the image."
+    exit 0
+else
+    echo "FAIL: inception_inference FAILED to identify the image."
+    exit 1
+fi

From c41d873dbefc664963985027a6febb750184fb0a Mon Sep 17 00:00:00 2001
From: Manu Seth <22492939+mseth10@users.noreply.github.com>
Date: Fri, 14 Dec 2018 22:25:17 -0700
Subject: [PATCH 81/93] Update MKLDNN_README.md (#13653)

---
 MKLDNN_README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/MKLDNN_README.md b/MKLDNN_README.md
index 2618d23388e7..6b25fee85195 100644
--- a/MKLDNN_README.md
+++ b/MKLDNN_README.md
@@ -1,9 +1,9 @@
 # Build/Install MXNet with MKL-DNN
 
-A better training and inference perforamce are expected to achieved on Intel-Architecture CPUs with MXNET built with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) on multiple operating system, including Linux, Windows and MacOS.
-In the following sections, you will find building instructions for MXNET with Intel MKL-DNN on Linux, MacOS and Windows.
+Better training and inference performance is expected on Intel-Architecture CPUs with MXNet built with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) on multiple operating systems, including Linux, Windows and MacOS.
+In the following sections, you will find build instructions for MXNet with Intel MKL-DNN on Linux, MacOS and Windows.
 
-The detailed performance data collected on Intel Xeon CPU with MXNET built with Intel MKL-DNN can be found at [here](https://mxnet.incubator.apache.org/faq/perf.html#intel-cpu).
+The detailed performance data collected on Intel Xeon CPU with MXNet built with Intel MKL-DNN can be found [here](https://mxnet.incubator.apache.org/faq/perf.html#intel-cpu).
 
 
 <h2 id="0">Contents</h2>
@@ -83,7 +83,7 @@ LIBRARY_PATH=$(brew --prefix llvm)/lib/ make -j $(sysctl -n hw.ncpu) CC=$(brew -
 
 <h2 id="3">Windows</h2>
 
-On Windows, you can use [Micrsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) and [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) to compile MXNET with Intel MKL-DNN.
+On Windows, you can use [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) and [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) to compile MXNet with Intel MKL-DNN.
 [Micrsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is recommended.
 
 **Visual Studio 2015**
@@ -123,7 +123,7 @@ cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -D
 These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
 Also ```libmkldnn.dll``` with be in the ```./build/3rdparty/mkldnn/src/Release/```
 
-6. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading mxnet.
+6. Make sure that all the dll files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convenience, you can put all of them in ```\windows\system32```. Otherwise you will come across `Not Found Dependencies` when loading MXNet.
 
 **Visual Studio 2017**
 
@@ -177,7 +177,7 @@ cmake -G "Visual Studio 15 2017 Win64" .. -T host=x64 -DUSE_CUDA=0 -DUSE_CUDNN=0
 msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount
 ```
 
-9. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading mxnet.
+9. Make sure that all the dll files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convenience, you can put all of them in ```\windows\system32```. Otherwise you will come across `Not Found Dependencies` when loading MXNet.
 
 <h2 id="4">Verify MXNet with python</h2>
 

From 8a00f1870bd3a76788c2a122237544b2c871be95 Mon Sep 17 00:00:00 2001
From: Hao Li <hao.h.li@intel.com>
Date: Sat, 15 Dec 2018 13:35:08 +0800
Subject: [PATCH 82/93] Support Quantized Fully Connected by INT8 GEMM (#12922)

* add quantized fully connect support

* disable qfc cpu case since s8u8s32 is only supported by MKL BLAS library

* retrigger to ci testing

* move implementation to cc file and add  STORAGE_TYPE_ASSIGN_CHECK

* fix typo bug

* retrigger the ci test

* fix typo bug

* retrigger ci

* retrigger the ci test

* retrigger the ci

* retrigger the ci test

* retrigger ci test

* fix indent issue

* retrigger the ci

* retrigger the ci test

* add verbose message

* update log message

* using range for loop

* using for auto range

* enable MKL BLAS ci test

* fix typo issue

* use TYPE_ASSIGN_CHECK

* retrigger the ci
---
 .../quantization/quantized_fully_connected.cc | 159 +++++++++++++++++-
 .../python/quantization/test_quantization.py  |  26 ++-
 2 files changed, 177 insertions(+), 8 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index e334fe7ec9b2..64ce73ba1cf7 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -23,11 +23,17 @@
  * \brief
  * \author Ziheng Jiang, Jun Wu
 */
+#include <vector>
+#include "quantization_utils.h"
 #include "../nn/fully_connected-inl.h"
 
 namespace mxnet {
 namespace op {
 
+namespace quantized_fc {
+enum QuantizedfcOpResource {kTempSpace};
+}
+
 bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
                                   std::vector<TShape> *in_shape,
                                   std::vector<TShape> *out_shape) {
@@ -79,6 +85,151 @@ bool QuantizedFullyConnectedType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+/*!
+ * \brief Storage type inference for the quantized fully connected operator.
+ * Selects kFComputeEx when dev_mask is the cpu device mask and kFCompute otherwise,
+ * and sets every output and input storage type to kDefaultStorage.
+ * \return false if the name of an assigned storage type is reported as "unknown",
+ *         true otherwise.
+ */
+bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
+                                        const int dev_mask,
+                                        DispatchMode* dispatch_mode,
+                                        std::vector<int> *in_attrs,
+                                        std::vector<int> *out_attrs) {
+  *dispatch_mode = (dev_mask == mshadow::cpu::kDevMask) ? DispatchMode::kFComputeEx
+                                                        : DispatchMode::kFCompute;
+
+  // Assigns the default storage type to each entry and stops at the first entry whose
+  // storage type name is "unknown".
+  // NOTE(review): the name is queried right after the assignment, so the check can
+  // only trigger if stype_string(kDefaultStorage) is "unknown" - looks unreachable,
+  // confirm against common::stype_string.
+  const auto assign_default_storage = [](std::vector<int> *stypes) -> bool {
+    for (int &stype : *stypes) {
+      stype = kDefaultStorage;
+      if (common::stype_string(stype).compare("unknown") == 0) {
+        return false;
+      }
+    }
+    return true;
+  };
+
+  // Outputs are processed before inputs; the inputs are left untouched if the
+  // outputs already fail.
+  return assign_default_storage(out_attrs) && assign_default_storage(in_attrs);
+}
+
+/*!
+ * \brief Kernel that initializes element i of the int32 output with the int8 bias
+ * rescaled from the quantization scale of the bias to the one of the output.
+ */
+struct QuantizedSumInitKernelWithBias {
+  //  init sum data with bias for matrix b (n)
+  /*!
+   * \param i index of the element to initialize
+   * \param out int32 output buffer, out[i] is written
+   * \param bias int8 bias buffer, bias[i] is read
+   * \param min_out pointer to the minimum of the float range of the output
+   * \param max_out pointer to the maximum of the float range of the output
+   * \param min_bias pointer to the minimum of the float range of the bias
+   * \param max_bias pointer to the maximum of the float range of the bias
+   */
+  MSHADOW_XINLINE static void Map(int i, int32_t *out,
+                                  const int8_t *bias, const float *min_out,
+                                  const float *max_out, const float *min_bias,
+                                  const float *max_bias) {
+    typedef int32_t T1;
+    typedef int8_t  T2;
+    using mshadow::red::limits::MinValue;
+    using mshadow::red::limits::MaxValue;
+    // Float value that corresponds to one quantization step of the output (int32)
+    // and of the bias (int8).
+    // NOTE(review): presumably MaxAbs returns max(|a|, |b|) - it is defined outside
+    // of this file, confirm.
+    float float_for_one_out_quant  =
+        MaxAbs(*min_out, *max_out) / static_cast<double>(MaxValue<T1>());
+    float float_for_one_bias_quant =
+        MaxAbs(*min_bias, *max_bias) / static_cast<double>(MaxValue<T2>());
+    if (float_for_one_out_quant != 0) {
+      // The float result is converted to int32 by the assignment.
+      out[i] = bias[i] * float_for_one_bias_quant /
+          float_for_one_out_quant;
+    } else {
+      // Avoid the division by zero; the message is logged for every element i.
+      LOG(INFO) << "float_for_one_out_quant is 0,"
+                << " need to check the why MaxAbs(*min_out, *max_out) of out_data is 0!";
+      out[i] = 0;
+    }
+  }
+};
+
+
+/*!
+ * \brief Forward pass of the quantized fully connected operator on cpu.
+ *
+ * When MSHADOW_USE_MKL == 1 the int32 output is computed with cblas_gemm_s8u8s32 from
+ * the data (shifted to uint8) and the weight. Without MKL the function only raises
+ * a fatal error.
+ *
+ * \param attrs node attributes, attrs.parsed holds the FullyConnectedParam
+ * \param ctx operator context that provides the cpu stream and the temp space
+ * \param in_data data, weight and (unless no_bias) bias, followed by a min and a max
+ *        array for each of them; num_inputs * 3 arrays in total
+ * \param req write request types
+ *        NOTE(review): req is not consulted by this implementation - confirm that
+ *        only plain writes are expected.
+ * \param out_data the int32 output followed by its min and max range arrays
+ */
+template<typename SrcType>
+void QuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
+                                    const OpContext &ctx,
+                                    const std::vector<NDArray> &in_data,
+                                    const std::vector<OpReqType> &req,
+                                    const std::vector<NDArray> &out_data) {
+#if MSHADOW_USE_MKL == 1
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  using namespace mshadow;
+  using namespace mxnet_op;
+  size_t num_inputs = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_data.size(),  num_inputs * 3);
+  CHECK_EQ(out_data.size(), 3U);
+  const NDArray& data = in_data[0];
+  const NDArray& weight = in_data[1];
+  const NDArray& out = out_data[0];
+  TShape dshape = data.shape();
+  TShape wshape = weight.shape();
+  // NOTE(review): oshape is not used below.
+  TShape oshape = out.shape();
+  auto output_temp = out.data().dptr<int32_t>();
+  auto weight_temp = weight.data().dptr<SrcType>();
+  auto data_temp = data.data().dptr<SrcType>();
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  // Scaling factors and offsets passed to cblas_gemm_s8u8s32 below.
+  const float alpha = 1.0f;
+  const float beta  = 1.0f;
+  const CBLAS_OFFSET offsetc = CblasFixOffset;
+  const MKL_INT8 oa = 0;
+  const MKL_INT8 ob = 0;
+  MKL_INT32 oc = 0;
+  // m: first dimension of data, n: first dimension of weight,
+  // k: product of the remaining dimensions of data.
+  const int m = dshape[0], n = wshape[0], k = dshape.ProdShape(1, dshape.ndim());
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  //  cblas_gemm_s8u8s32 required first matrix must be uint8
+  //  shift data from int8(from -128 to 127) to uint8 (from 0 to 255)
+  int shift = 128;
+  Tensor<cpu, 1, uint8_t> shiftdata =
+    ctx.requested[quantized_fc::kTempSpace].get_space_typed<cpu, 1, uint8_t>(
+      Shape1(m * k), s);
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < m * k; ++i) {
+    shiftdata.dptr_[i] = data_temp[i] + shift;
+  }
+
+  // Fill the min/max range of the output (out_data[1], out_data[2]) from the
+  // min/max arrays of data and weight (in_data[num_inputs .. num_inputs+3]).
+  Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+      out_data[1].data().dptr<float>(), out_data[2].data().dptr<float>(),
+      in_data[num_inputs].data().dptr<float>(), in_data[num_inputs+1].data().dptr<float>(),
+      in_data[num_inputs+2].data().dptr<float>(), in_data[num_inputs+3].data().dptr<float>());
+  if (!param.no_bias) {
+    // Initialize the first n output elements with the rescaled bias. in_data[7] and
+    // in_data[8] are passed as the min and max of the bias; the output range
+    // written by the kernel launch above is read here.
+    const NDArray& bias = in_data[2];
+    Kernel<QuantizedSumInitKernelWithBias, cpu>::Launch(s, n, out.data().dptr<int32_t>(),
+        bias.data().dptr<int8_t>(), out_data[1].data().dptr<float>(),
+        out_data[2].data().dptr<float>(), in_data[7].data().dptr<float>(),
+        in_data[8].data().dptr<float>());
+  } else {
+    // Without bias the whole m x n output starts at zero.
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < m * n; ++i) {
+      output_temp[i] = 0;
+    }
+  }
+  // Compensate the shift that was added to the data: for output column i subtract
+  // shift * (sum of row i of the weight), so that (data + shift) * weight minus this
+  // term equals data * weight.
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < k; ++j) {
+      output_temp[i] -= shift * weight_temp[i * k + j];
+    }
+  }
+  // The first n elements now hold the initial values of one output row;
+  // replicate them to the remaining m - 1 rows.
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = n; i < m * n; ++i) {
+    output_temp[i] = output_temp[i % n];
+  }
+  // Multiply the shifted data (m x k) with the transposed weight (n x k).
+  // NOTE(review): beta = 1 is assumed to make the gemm accumulate into the
+  // values prepared in out above - confirm with the MKL documentation.
+  cblas_gemm_s8u8s32(CblasRowMajor,
+                     CblasNoTrans,
+                     CblasTrans,
+                     offsetc,
+                     m,
+                     n,
+                     k,
+                     alpha,
+                     shiftdata.dptr_,
+                     k,
+                     oa,
+                     weight.data().dptr<SrcType>(),
+                     k,
+                     ob,
+                     beta,
+                     out.data().dptr<int32_t>(),
+                     n,
+                     &oc);
+#else
+  LOG(FATAL) << "Quantized fully connected operator relies on cblas_gemm_s8u8s32"
+             << " which is only supported by MKL BLAS."
+             << " Please build MXNet with USE_BLAS=mkl to leverage this operator.";
+#endif
+}
+
 NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
 .describe(R"code(Fully Connected operator for input, weight and bias data type of int8,
 and accumulates in type int32 for the output. For each argument, two more arguments of type
@@ -112,7 +263,14 @@ and max thresholds representing the threholds for quantizing the float32 output
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizedFullyConnectedShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizedFullyConnectedType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedFullyConnectedStorageType)
 .set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
+.set_attr<FComputeEx>("FComputeEx<cpu>",
+    QuantizedFullyConnectedForward<int8_t>)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
 .add_argument("data", "NDArray-or-Symbol", "Input data.")
 .add_argument("weight", "NDArray-or-Symbol", "weight.")
 .add_argument("bias", "NDArray-or-Symbol", "bias.")
@@ -135,6 +293,5 @@ NNVM_REGISTER_OP(FullyConnected)
     }
     return node;
   });
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 518b69626246..3ff4b69302fb 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -26,6 +26,7 @@
 from mxnet.module import Module
 from mxnet.io import NDArrayIter
 import unittest
+import operator
 
 def is_test_for_gpu():
     return mx.current_context().device_type == 'gpu'
@@ -278,8 +279,15 @@ def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_p
 def test_quantized_fc():
     def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         if mx.current_context().device_type != 'gpu':
-            print('skipped testing quantized_fc on cpu since it is not supported yet')
-            return
+            hasMKL = False;
+            for key in os.environ.keys():
+                if operator.eq(key, "BUILD_TAG"):
+                    if os.environ['BUILD_TAG'].find("MKL") != -1:
+                        hasMKL = True
+                    break
+            if hasMKL == False:
+                print('skipped testing quantized_fc on cpu since s8u8s32 is only supported by MKL BLAS library')
+                return
         elif qdtype == 'uint8' and is_test_for_gpu():
             print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
             return
@@ -291,16 +299,16 @@ def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
         if qdtype == 'uint8':
             data_low = 0.0
-            data_high = 127.0
+            data_high = 63.0
         else:
-            data_low = -127.0
-            data_high = 127.0
+            data_low = -63.0
+            data_high = 63.0
         fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                      shape=data_shape).astype('int32')
-        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                      shape=arg_shapes[1]).astype('int32')
         if not no_bias:
-            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                          shape=arg_shapes[2]).astype('int32')
         output = fc_fp32_exe.forward()[0]
 
@@ -343,6 +351,10 @@ def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         check_quantized_fc((32, 111, 2, 2), 100, True, qdtype)
         check_quantized_fc((32, 512, 2, 2), 100, False, qdtype)
         check_quantized_fc((32, 111, 2, 2), 100, False, qdtype)
+        check_quantized_fc((256, 2048, 2, 2), 800, False, qdtype)
+        check_quantized_fc((256, 111, 2, 2), 800, False, qdtype)
+        check_quantized_fc((256, 2048, 2, 2), 800, True, qdtype)
+        check_quantized_fc((256, 111, 2, 2), 800, True, qdtype)
 
 @with_seed()
 def test_quantized_flatten():

From 02a0e7ef49bc1fa1e4f1b4d33ee6775d1080c771 Mon Sep 17 00:00:00 2001
From: Lanking <lanking520@live.com>
Date: Sat, 15 Dec 2018 17:35:55 -0800
Subject: [PATCH 83/93] add build fix for Scala/Java build (#13655)

---
 scala-package/.gitignore                     | 2 ++
 scala-package/init-native/osx-x86_64/pom.xml | 2 +-
 scala-package/native/osx-x86_64-cpu/pom.xml  | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/scala-package/.gitignore b/scala-package/.gitignore
index 8bc87f53e802..9a89bef324bc 100644
--- a/scala-package/.gitignore
+++ b/scala-package/.gitignore
@@ -4,6 +4,8 @@ core/src/main/scala/org/apache/mxnet/NDArrayBase.scala
 core/src/main/scala/org/apache/mxnet/javaapi/NDArrayBase.scala
 core/src/main/scala/org/apache/mxnet/SymbolAPIBase.scala
 core/src/main/scala/org/apache/mxnet/SymbolBase.scala
+core/src/main/scala/org/apache/mxnet/NDArrayRandomAPIBase.scala
+core/src/main/scala/org/apache/mxnet/SymbolRandomAPIBase.scala
 examples/scripts/infer/images/
 examples/scripts/infer/models/
 local-snapshot
\ No newline at end of file
diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml
index 12f4d800eba4..1c3966f477e6 100644
--- a/scala-package/init-native/osx-x86_64/pom.xml
+++ b/scala-package/init-native/osx-x86_64/pom.xml
@@ -121,7 +121,7 @@
             </goals>
             <configuration>
               <executable>install_name_tool</executable>
-              <commandlineArgs>-change lib/libmxnet.so @loader_path/libmxnet.so ${project.build.directory}/${artifactId}.jnilib</commandlineArgs>
+              <commandlineArgs>-change @rpath/libmxnet.so @loader_path/libmxnet.so ${project.build.directory}/${artifactId}.jnilib</commandlineArgs>
             </configuration>
           </execution>
           <execution>
diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml
index 425ca96815de..f6f90cdcdb58 100644
--- a/scala-package/native/osx-x86_64-cpu/pom.xml
+++ b/scala-package/native/osx-x86_64-cpu/pom.xml
@@ -121,7 +121,7 @@
             </goals>
             <configuration>
               <executable>install_name_tool</executable>
-              <commandlineArgs>-change lib/libmxnet.so @loader_path/libmxnet.so ${project.build.directory}/${artifactId}.jnilib</commandlineArgs>
+              <commandlineArgs>-change @rpath/libmxnet.so @loader_path/libmxnet.so ${project.build.directory}/${artifactId}.jnilib</commandlineArgs>
             </configuration>
           </execution>
           <execution>

From e2fe081ef4877f62032a311c2ecdc98da03aeb2f Mon Sep 17 00:00:00 2001
From: Pedro Larroy <pedro.larroy.lists@gmail.com>
Date: Mon, 17 Dec 2018 02:37:30 +0000
Subject: [PATCH 84/93] Fix Jetson compilation (#13532)

---
 ci/docker/Dockerfile.build.jetson | 1 +
 ci/jenkins/Jenkinsfile_edge       | 2 +-
 make/crosscompile.jetson.mk       | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index 30b9b7e37507..07097887f87d 100644
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -77,6 +77,7 @@ RUN JETPACK_DOWNLOAD_PREFIX=https://developer.download.nvidia.com/devzone/devcen
     dpkg -i --force-architecture  $ARM_NVINFER_INSTALLER_PACKAGE && \
     dpkg -i --force-architecture  $ARM_NVINFER_DEV_INSTALLER_PACKAGE && \
     apt update -y || true && apt install -y cuda-libraries-dev-9-0 libcudnn7-dev libnvinfer-dev
+RUN ln -s /usr/include/aarch64-linux-gnu/cudnn_v7.h /usr/include/aarch64-linux-gnu/cudnn.h
 ENV PATH $PATH:/usr/local/cuda/bin
 ENV NVCCFLAGS "-m64"
 ENV CUDA_ARCH "-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62"
diff --git a/ci/jenkins/Jenkinsfile_edge b/ci/jenkins/Jenkinsfile_edge
index 275a0c96de94..c101ba102386 100644
--- a/ci/jenkins/Jenkinsfile_edge
+++ b/ci/jenkins/Jenkinsfile_edge
@@ -34,7 +34,7 @@ utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_
 utils.main_wrapper(
 core_logic: {
   utils.parallel_stage('Build', [
-//    custom_steps.compile_armv8_jetson_gpu(),
+    custom_steps.compile_armv8_jetson_gpu(),
     custom_steps.compile_armv7_cpu(),
     custom_steps.compile_armv6_cpu(),
     custom_steps.compile_armv8_cpu(),
diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk
index a1468f4496d3..171f846d20dd 100644
--- a/make/crosscompile.jetson.mk
+++ b/make/crosscompile.jetson.mk
@@ -57,10 +57,10 @@ DEBUG = 0
 USE_SIGNAL_HANDLER = 1
 
 # the additional link flags you want to add
-ADD_LDFLAGS = -L${CROSS_ROOT}/lib
+ADD_LDFLAGS = -L${CROSS_ROOT}/lib -L/usr/lib/aarch64-linux-gnu/
 
 # the additional compile flags you want to add
-ADD_CFLAGS = -I${CROSS_ROOT}/include
+ADD_CFLAGS = -I${CROSS_ROOT}/include -I/usr/include/aarch64-linux-gnu/
 
 #---------------------------------------------
 # matrix computation libraries for CPU/GPU

From aecae3420c78ab34148361bb4870f2047ed5680d Mon Sep 17 00:00:00 2001
From: Xinyu Chen <xinyu1.chen@intel.com>
Date: Mon, 17 Dec 2018 14:36:45 +0800
Subject: [PATCH 85/93] remove omp which can cause ssd accuracy variance
 (#13622)

---
 src/operator/contrib/multibox_detection.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc
index c005dfa06590..b4f66d8fcf1d 100644
--- a/src/operator/contrib/multibox_detection.cc
+++ b/src/operator/contrib/multibox_detection.cc
@@ -174,7 +174,6 @@ inline void MultiBoxDetectionForward(const Tensor<cpu, 3, DType> &out,
     }
 
     // apply nms
-#pragma omp parallel for num_threads(omp_threads)
     for (int i = 0; i < nkeep; ++i) {
       int offset_i = i * 6;
       if (p_out[offset_i] < 0) continue;  // skip eliminated

From 43e81255a2ecc941ded2bf9aa51acc7f699fc68c Mon Sep 17 00:00:00 2001
From: Kellen Sunderland <kellen.sunderland@gmail.com>
Date: Mon, 17 Dec 2018 17:14:31 -0800
Subject: [PATCH 86/93] Revert "[MXNET-43] Fix Jetson compilation" (#13665)

* Revert "remove omp which can cause ssd accuracy variance (#13622)"

This reverts commit 655f1c6f7a0706dd622f73db9af2e6df895ca213.

* Revert "Fix Jetson compilation (#13532)"

This reverts commit 48e25c4cae355753dd96ea7afe004bf78e0719e4.
---
 ci/docker/Dockerfile.build.jetson | 1 -
 ci/jenkins/Jenkinsfile_edge       | 2 +-
 make/crosscompile.jetson.mk       | 4 ++--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index 07097887f87d..30b9b7e37507 100644
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -77,7 +77,6 @@ RUN JETPACK_DOWNLOAD_PREFIX=https://developer.download.nvidia.com/devzone/devcen
     dpkg -i --force-architecture  $ARM_NVINFER_INSTALLER_PACKAGE && \
     dpkg -i --force-architecture  $ARM_NVINFER_DEV_INSTALLER_PACKAGE && \
     apt update -y || true && apt install -y cuda-libraries-dev-9-0 libcudnn7-dev libnvinfer-dev
-RUN ln -s /usr/include/aarch64-linux-gnu/cudnn_v7.h /usr/include/aarch64-linux-gnu/cudnn.h
 ENV PATH $PATH:/usr/local/cuda/bin
 ENV NVCCFLAGS "-m64"
 ENV CUDA_ARCH "-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62"
diff --git a/ci/jenkins/Jenkinsfile_edge b/ci/jenkins/Jenkinsfile_edge
index c101ba102386..275a0c96de94 100644
--- a/ci/jenkins/Jenkinsfile_edge
+++ b/ci/jenkins/Jenkinsfile_edge
@@ -34,7 +34,7 @@ utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_
 utils.main_wrapper(
 core_logic: {
   utils.parallel_stage('Build', [
-    custom_steps.compile_armv8_jetson_gpu(),
+//    custom_steps.compile_armv8_jetson_gpu(),
     custom_steps.compile_armv7_cpu(),
     custom_steps.compile_armv6_cpu(),
     custom_steps.compile_armv8_cpu(),
diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk
index 171f846d20dd..a1468f4496d3 100644
--- a/make/crosscompile.jetson.mk
+++ b/make/crosscompile.jetson.mk
@@ -57,10 +57,10 @@ DEBUG = 0
 USE_SIGNAL_HANDLER = 1
 
 # the additional link flags you want to add
-ADD_LDFLAGS = -L${CROSS_ROOT}/lib -L/usr/lib/aarch64-linux-gnu/
+ADD_LDFLAGS = -L${CROSS_ROOT}/lib
 
 # the additional compile flags you want to add
-ADD_CFLAGS = -I${CROSS_ROOT}/include -I/usr/include/aarch64-linux-gnu/
+ADD_CFLAGS = -I${CROSS_ROOT}/include
 
 #---------------------------------------------
 # matrix computation libraries for CPU/GPU

From cd1540c292857063b8f4fde35fd8e79e013b9a6e Mon Sep 17 00:00:00 2001
From: Pedro Larroy <pedro.larroy.lists@gmail.com>
Date: Tue, 18 Dec 2018 08:04:36 +0000
Subject: [PATCH 87/93] Fix Jetson compilation (#13666)

---
 ci/docker/runtime_functions.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index d073962acf40..7a1734c14d95 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -133,7 +133,7 @@ build_jetson() {
     set -ex
     pushd .
 
-    build_ccache_wrappers
+    #build_ccache_wrappers
 
     cp make/crosscompile.jetson.mk ./config.mk
     make -j$(nproc)

From e15c34be37835ab8bb043235691e896a6cf2e64a Mon Sep 17 00:00:00 2001
From: Aaron Markham <markhama@amazon.com>
Date: Tue, 18 Dec 2018 09:02:02 -0800
Subject: [PATCH 88/93] turn on Sphinx warnings as errors (#13544)

* turn on warnings as errors

* move warnings as error logic to build_all_version

* fix typo in comment

* add warning as error option for docs pipeline

* bump ci to test again; use this chance to add notes on this feature

* fix bugs in image.py docs
---
 ci/docker/runtime_functions.sh              | 6 ++----
 docs/README.md                              | 6 +++++-
 docs/build_version_doc/build_all_version.sh | 7 ++++++-
 python/mxnet/image/image.py                 | 8 ++++----
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 7a1734c14d95..377d18016ab6 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -1207,7 +1207,7 @@ nightly_straight_dope_python3_multi_gpu_tests() {
 nightly_tutorial_test_ubuntu_python3_gpu() {
     set -ex
     cd /work/mxnet/docs
-    export BUILD_VER=tutorial 
+    export BUILD_VER=tutorial
     export MXNET_DOCS_BUILD_MXNET=0
     make html
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
@@ -1237,7 +1237,7 @@ deploy_docs() {
     set -ex
     pushd .
 
-    make docs
+    make docs SPHINXOPTS=-W
 
     popd
 }
@@ -1294,5 +1294,3 @@ EOF
     declare -F | cut -d' ' -f3
     echo
 fi
-
-
diff --git a/docs/README.md b/docs/README.md
index c21836edd821..80463cc68d54 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -17,9 +17,13 @@ git clone --recursive https://github.com/apache/incubator-mxnet.git mxnet
 cd mxnet/docs/build_version_doc
 ./setup_docs_ubuntu.sh
 cd ../../
-make docs USE_OPENMP=1
+make docs USE_OPENMP=1 SPHINXOPTS=-W
 ```
 
+OpenMP speeds things up and will work on Ubuntu if you used the `setup_docs_ubuntu.sh` script.
+The `-W` Sphinx option enforces "warnings as errors". This will help you debug your builds and get them through CI.
+**CI will not let a PR through if it breaks the website.** Refer to the [MXNet Developer wiki's documentation guide](https://cwiki.apache.org/confluence/display/MXNET/Documentation+Guide) for troubleshooting tips.
+
 For more information on each API's documentation dependencies, how to serve the docs, or how to build the full website with each legacy MXNet version, refer to the following links:
 
 * [Dependencies](https://github.com/apache/incubator-mxnet/tree/master/docs/build_version_doc#dependencies) - required before you build the docs
diff --git a/docs/build_version_doc/build_all_version.sh b/docs/build_version_doc/build_all_version.sh
index 5f857996f19d..6b8c3cbd864e 100755
--- a/docs/build_version_doc/build_all_version.sh
+++ b/docs/build_version_doc/build_all_version.sh
@@ -43,6 +43,9 @@
 set -e
 set -x
 
+# Set OPTS to any Sphinx build options, like -W for "warnings as errors"
+OPTS=
+
 # $1 is the list of branches/tags to build
 if [ -z "$1" ]
   then
@@ -117,6 +120,8 @@ function checkout () {
   git checkout "$repo_folder" || git branch $repo_folder "upstream/$repo_folder" && git checkout "$repo_folder" || exit 1
   if [ $tag == 'master' ]; then
     git pull
+    # master gets warnings as errors for Sphinx builds
+    OPTS="-W"
   fi
   git submodule update --init --recursive
   cd ..
@@ -160,7 +165,7 @@ for key in ${!build_arr[@]}; do
 
     echo "Building $tag..."
     cd $tag/docs
-    make html USE_OPENMP=1 BUILD_VER=$tag || exit 1
+    make html USE_OPENMP=1 BUILD_VER=$tag SPHINXOPTS=$OPTS || exit 1
     # Navigate back to build_version_doc folder
     cd ../../../
     # Use the display tag name for the folder name
diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py
index b452aecdb04b..1dd665607597 100644
--- a/python/mxnet/image/image.py
+++ b/python/mxnet/image/image.py
@@ -46,7 +46,7 @@ def imread(filename, *args, **kwargs):
     """Read and decode an image to an NDArray.
 
     .. note:: `imread` uses OpenCV (not the CV2 Python library).
-    MXNet must have been built with USE_OPENCV=1 for `imdecode` to work.
+       MXNet must have been built with USE_OPENCV=1 for `imdecode` to work.
 
     Parameters
     ----------
@@ -87,7 +87,7 @@ def imresize(src, w, h, *args, **kwargs):
     r"""Resize image with OpenCV.
 
     .. note:: `imresize` uses OpenCV (not the CV2 Python library). MXNet must have been built
-    with USE_OPENCV=1 for `imresize` to work.
+       with USE_OPENCV=1 for `imresize` to work.
 
     Parameters
     ----------
@@ -144,7 +144,7 @@ def imdecode(buf, *args, **kwargs):
     """Decode an image to an NDArray.
 
     .. note:: `imdecode` uses OpenCV (not the CV2 Python library).
-    MXNet must have been built with USE_OPENCV=1 for `imdecode` to work.
+       MXNet must have been built with USE_OPENCV=1 for `imdecode` to work.
 
     Parameters
     ----------
@@ -345,7 +345,7 @@ def resize_short(src, size, interp=2):
     """Resizes shorter edge to size.
 
     .. note:: `resize_short` uses OpenCV (not the CV2 Python library).
-    MXNet must have been built with OpenCV for `resize_short` to work.
+       MXNet must have been built with OpenCV for `resize_short` to work.
 
     Resizes the original image by setting the shorter edge to size
     and setting the longer edge accordingly.

From 12d34a77eae5bf287b5e1e32a1ad079488268580 Mon Sep 17 00:00:00 2001
From: Pedro Larroy <pedro.larroy.lists@gmail.com>
Date: Tue, 18 Dec 2018 17:04:26 +0000
Subject: [PATCH 89/93] Update CODEOWNERS, add Pedro Larroy. (#13579)

---
 CODEOWNERS | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 5a88e89dfb02..ce648ef2e087 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -13,14 +13,14 @@
 
 # Language bindings
 /R-package/                @thirdwing
-/scala-package/            @yzhliu @nswamy
+/scala-package/            @yzhliu @nswamy @pllarroy
 /perl-package/             @sergeykolychev
-/python/                   @szha
+/python/                   @szha @pllarroy
 /contrib/clojure-package/  @gigasquid
 
 # C++ base
 /src/kvstore/     @rahul003 @anirudh2290
-/include/         @anirudh2290
+/include/         @anirudh2290 @pllarroy
 /src/c_api/       @anirudh2290
 /src/common/      @anirudh2290
 /src/engine/      @anirudh2290
@@ -33,13 +33,17 @@
 /src/profiler/    @anirudh2290
 /src/storage/     @anirudh2290
 /tests/cpp/       @anirudh2290
-/cpp-package/ @nswamy
+/cpp-package/     @nswamy @pllarroy
+/src/             @pllarroy
+/plugin/          @pllarroy
 
 # CMake
-CMakeLists.txt    @szha @rahul003
-/cmake/           @szha @rahul003
+CMakeLists.txt    @szha @rahul003 @pllarroy
+/cmake/           @szha @rahul003 @pllarroy
 
 # MXNet CI
+dev_menu.py         @pllarroy
+/ci/                @pllarroy
 /tests/ci_build/    @marcoabreu
 Jenkinsfile         @marcoabreu
 .travis.yml         @marcoabreu
@@ -50,16 +54,16 @@ Makefile          @szha
 prepare_mkl.sh    @szha
 
 # Docs
-/docs/            @szha
+/docs/            @szha @pllarroy
 
 # Submodules
 .gitmodules       @szha
 
 # Examples
-/example/         @szha
+/example/         @szha @pllarroy
 
 # Tools
-/tools/           @szha
+/tools/           @szha @pllarroy
 
 # Github templates
 /.github/         @szha

From 097bc587e3ac893770adc59d1c2112a4dea6b690 Mon Sep 17 00:00:00 2001
From: Pedro Larroy <pedro.larroy.lists@gmail.com>
Date: Tue, 18 Dec 2018 19:44:21 +0000
Subject: [PATCH 90/93] Revert "Revert "[MXNET-43] Fix Jetson compilation"
 (#13665)" (#13672)

This reverts commit 3433776dac7be75928082bbc1d552fca248fb8e8.
---
 ci/docker/Dockerfile.build.jetson | 1 +
 ci/jenkins/Jenkinsfile_edge       | 2 +-
 make/crosscompile.jetson.mk       | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index 30b9b7e37507..07097887f87d 100644
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -77,6 +77,7 @@ RUN JETPACK_DOWNLOAD_PREFIX=https://developer.download.nvidia.com/devzone/devcen
     dpkg -i --force-architecture  $ARM_NVINFER_INSTALLER_PACKAGE && \
     dpkg -i --force-architecture  $ARM_NVINFER_DEV_INSTALLER_PACKAGE && \
     apt update -y || true && apt install -y cuda-libraries-dev-9-0 libcudnn7-dev libnvinfer-dev
+RUN ln -s /usr/include/aarch64-linux-gnu/cudnn_v7.h /usr/include/aarch64-linux-gnu/cudnn.h
 ENV PATH $PATH:/usr/local/cuda/bin
 ENV NVCCFLAGS "-m64"
 ENV CUDA_ARCH "-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62"
diff --git a/ci/jenkins/Jenkinsfile_edge b/ci/jenkins/Jenkinsfile_edge
index 275a0c96de94..c101ba102386 100644
--- a/ci/jenkins/Jenkinsfile_edge
+++ b/ci/jenkins/Jenkinsfile_edge
@@ -34,7 +34,7 @@ utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_
 utils.main_wrapper(
 core_logic: {
   utils.parallel_stage('Build', [
-//    custom_steps.compile_armv8_jetson_gpu(),
+    custom_steps.compile_armv8_jetson_gpu(),
     custom_steps.compile_armv7_cpu(),
     custom_steps.compile_armv6_cpu(),
     custom_steps.compile_armv8_cpu(),
diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk
index a1468f4496d3..171f846d20dd 100644
--- a/make/crosscompile.jetson.mk
+++ b/make/crosscompile.jetson.mk
@@ -57,10 +57,10 @@ DEBUG = 0
 USE_SIGNAL_HANDLER = 1
 
 # the additional link flags you want to add
-ADD_LDFLAGS = -L${CROSS_ROOT}/lib
+ADD_LDFLAGS = -L${CROSS_ROOT}/lib -L/usr/lib/aarch64-linux-gnu/
 
 # the additional compile flags you want to add
-ADD_CFLAGS = -I${CROSS_ROOT}/include
+ADD_CFLAGS = -I${CROSS_ROOT}/include -I/usr/include/aarch64-linux-gnu/
 
 #---------------------------------------------
 # matrix computation libraries for CPU/GPU

From e5b236c3dff99a24e8e0cfa5de40a1ada0e56bd9 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-40-106.us-east-2.compute.internal>
Date: Tue, 18 Dec 2018 20:02:09 +0000
Subject: [PATCH 91/93] revert rpath

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index f82bbf3684a3..08539df7ea12 100644
--- a/Makefile
+++ b/Makefile
@@ -136,7 +136,7 @@ ifeq ($(USE_MKLDNN), 1)
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
 	CFLAGS += -I$(MKLDNNROOT)/include
-	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,$(ROOTDIR)/lib
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
 endif
 
 # setup opencv

From 438e75e97267b1d5306b29c561d2e68501b1a5f7 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-40-106.us-east-2.compute.internal>
Date: Tue, 18 Dec 2018 22:19:36 +0000
Subject: [PATCH 92/93] set mkldnn flag=0 for non-mkldnn test and remove
 redundant mkldnn flag=1

---
 ci/docker/runtime_functions.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 377d18016ab6..de3ee5ff83b1 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -707,7 +707,6 @@ build_ubuntu_gpu_cmake_mkldnn() {
         -DUSE_CUDA=1                            \
         -DUSE_CUDNN=1                           \
         -DUSE_MKLML_MKL=1                       \
-        -DUSE_MKLDNN=1                          \
         -DCMAKE_BUILD_TYPE=Release              \
         -DCUDA_ARCH_NAME=Manual                 \
         -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \
@@ -732,6 +731,7 @@ build_ubuntu_gpu_cmake() {
         -DUSE_CUDA=1                            \
         -DUSE_CUDNN=1                           \
         -DUSE_MKLML_MKL=0                       \
+        -DUSE_MKLDNN=0                          \
         -DUSE_DIST_KVSTORE=1                    \
         -DCMAKE_BUILD_TYPE=Release              \
         -DCUDA_ARCH_NAME=Manual                 \

From edfce6047383ef230b104ece802b976b35d00c25 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-40-106.us-east-2.compute.internal>
Date: Tue, 18 Dec 2018 22:53:08 +0000
Subject: [PATCH 93/93] comment out export of LD_LIBRARY_PATH

---
 ci/docker/runtime_functions.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index de3ee5ff83b1..53cd9ba2d4c6 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -926,7 +926,7 @@ unittest_ubuntu_cpu_julia06() {
 
     # FIXME
     export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so'
-    export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
+    # export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
 
     # use the prebuilt binary from $MXNET_HOME/lib
     julia -e 'Pkg.build("MXNet")'
@@ -1256,7 +1256,7 @@ deploy_jl_docs() {
 
     # FIXME
     export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so'
-    export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
+    # export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
 
     # use the prebuilt binary from $MXNET_HOME/lib
     julia -e 'Pkg.build("MXNet")'