From 4b2c90909048e9c354403eeaf2408793c52eb507 Mon Sep 17 00:00:00 2001 From: Anton Chernov Date: Fri, 7 Dec 2018 13:53:55 +0100 Subject: [PATCH 1/2] Improved CMakeLists.txt --- CMakeLists.txt | 421 +++++++++++-------- ci/docker/runtime_functions.sh | 13 +- cmake/{ChooseBlas.cmake => ChooseBLAS.cmake} | 0 cmake/cmake_options.yml | 2 +- 4 files changed, 264 insertions(+), 172 deletions(-) rename cmake/{ChooseBlas.cmake => ChooseBLAS.cmake} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e582bae5e95..ecb3a995b9e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,7 @@ cmake_minimum_required(VERSION 3.0.2) +message(STATUS "CMAKE_VERSION=${CMAKE_VERSION}") + # workaround to store CMAKE_CROSSCOMPILING because is getting reset by the project command if(CMAKE_CROSSCOMPILING) set(__CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}) @@ -16,67 +18,46 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake) endif() +message(STATUS "CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}") + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Utils.cmake) #Some things have order. 
This must be put in front alone -mxnet_option(USE_CUDA "Build with CUDA support" ON) -mxnet_option(USE_OLDCMAKECUDA "Build with old cmake cuda" OFF) -mxnet_option(USE_NCCL "Use NVidia NCCL with CUDA" OFF) -mxnet_option(USE_OPENCV "Build with OpenCV support" ON) -mxnet_option(USE_OPENMP "Build with Openmp support" ON) -mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path -mxnet_option(USE_SSE "Build with x86 SSE instruction support" ON IF NOT ARM) -mxnet_option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON -mxnet_option(USE_LAPACK "Build with lapack support" ON) -mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) -mxnet_option(USE_MKLML_MKL "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE)) -mxnet_option(USE_MKLDNN "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) -mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON IF NOT MSVC) -mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) -mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) -mxnet_option(USE_PROFILER "Build with Profiler support" ON) -mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) -mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) -mxnet_option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF) -mxnet_option(USE_CPP_PACKAGE "Build C++ Package" OFF) +mxnet_option(USE_CUDA "Build with CUDA support" ON) +mxnet_option(USE_OLDCMAKECUDA "Build with old cmake cuda" OFF) +mxnet_option(USE_NCCL "Use NVidia NCCL with CUDA" OFF) +mxnet_option(USE_OPENCV "Build with OpenCV support" ON) +mxnet_option(USE_OPENMP "Build with Openmp support" ON) +mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path +mxnet_option(USE_SSE "Build with x86 SSE instruction 
support" ON IF NOT ARM) +mxnet_option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON +mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON IF NOT MSVC) +mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) +mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) +mxnet_option(USE_PROFILER "Build with Profiler support" ON) +mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) +mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) +mxnet_option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF) +mxnet_option(USE_CPP_PACKAGE "Build C++ Package" OFF) mxnet_option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON) -mxnet_option(USE_GPROF "Compile with gprof (profiling) flag" OFF) +mxnet_option(USE_GPROF "Compile with gprof (profiling) flag" OFF) mxnet_option(USE_CXX14_IF_AVAILABLE "Build with C++14 if the compiler supports it" OFF) -mxnet_option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path -mxnet_option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) -mxnet_option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) -mxnet_option(INSTALL_EXAMPLES "Install the example source files." OFF) -mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF) -mxnet_option(USE_TENSORRT "Enable infeference optimization with TensorRT." OFF) -mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) -mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) +mxnet_option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path +mxnet_option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) +mxnet_option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) +mxnet_option(INSTALL_EXAMPLES "Install the example source files." OFF) +mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." 
OFF) +mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF) +mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) +mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}") message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}") message(STATUS "CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}") -message(STATUS "CMAKE_SYSTEM_NAME ${CMAKE_SYSTEM_NAME}") -if(USE_CUDA AND NOT USE_OLDCMAKECUDA) - message(STATUS "CMake version '${CMAKE_VERSION}' using generator '${CMAKE_GENERATOR}'") - if( - ( - (${CMAKE_GENERATOR} MATCHES "Visual Studio.*") - OR (${CMAKE_GENERATOR} MATCHES "Xcode.*") - OR (${CMAKE_GENERATOR} STREQUAL "Unix Makefiles") - ) AND ( - (${CMAKE_VERSION} VERSION_GREATER "3.9.0") OR (${CMAKE_VERSION} VERSION_EQUAL "3.9.0") - ) - ) - set(FIRST_CUDA TRUE) - project(mxnet C CXX CUDA) - else() - set(FIRST_CUDA FALSE) - set(USE_OLDCMAKECUDA TRUE) - project(mxnet C CXX) - endif() -else() - project(mxnet C CXX) -endif() +if(NOT mxnet_LINKER_LIBS) + set(mxnet_LINKER_LIBS "") +endif(NOT mxnet_LINKER_LIBS) if(MSVC) @@ -86,14 +67,128 @@ else() endif() set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules;${CMAKE_MODULE_PATH}") +message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}") -SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH") +set(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH") if("$ENV{VERBOSE}" STREQUAL "1") message(STATUS " Verbose Makefile ACTIVATED") set(CMAKE_VERBOSE_MAKEFILE ON) endif() +# ---[ BLAS + +# Choose BLAS (Basic Linear Algebra Subprograms) computation libraries + +# MXNet supports multiple mathematical backends for computations on the CPU: +# +# * Atlas +# * OpenBLAS +# * MKL (MKL, MKLML) +# * MKLDNN +# * Apple Accelerate +# +# The default order of choice for the libraries if found follows the path from the most +# (recommended) to less performant backends. 
The order is as follows: +# +# For desktop platforms (x86_64): +# +# 1. MKLDNN (submodule) | USE_MKLDNN +# 2. MKL | USE_MKL_IF_AVAILABLE +# 3. MKLML (downloaded) | USE_MKLML +# 4. Apple Accelerate | USE_APPLE_ACCELERATE_IF_AVAILABLE | Mac only +# 5. OpenBLAS | BLAS | Options: Atlas, Open, MKL, Apple +# +# Note: If USE_MKL_IF_AVAILABLE is set to False then MKLML and MKLDNN will be disabled as well for +# configuration backwards compatibility. +# +# For embedded platforms (all other and if cross compiled): +# +# 1. OpenBLAS | BLAS | Options: Atlas, Open +# +# You can set the BLAS library explicitly by setting the BLAS variable to: +# +# * Atlas +# * Open +# * MKL +# * Apple +# +# See cmake/ChooseBLAS.cmake file for the options. +# +# Intel's MKL (Math Kernel Library) is one of the most powerful math libraries +# https://software.intel.com/en-us/mkl +# +# It has following flavours: +# +# * MKL is a complete full math library, containing basic and LAPACK functions. It is free under +# community support licensing (https://software.intel.com/en-us/articles/free-mkl), +# but needs to be downloaded and installed manually. +# +# * MKLML is a subset of MKL. It contains a smaller number of functions to reduce the +# size of the download and reduce the number of dynamic libraries the user needs. This +# is the most effective option since it can be downloaded and installed automatically +# by the cmake script (see cmake/DownloadMKLML.cmake). +# +# * MKLDNN is a separate open-source library, it can be used separately from MKL or MKLML. It is +# shipped as a subrepo with MXNet source code (see 3rdparty/mkldnn). +# See: https://github.com/intel/mkl-dnn +# +# Since the full MKL library is almost always faster than any other BLAS library it's turned on by +# default, however it needs to be downloaded and installed manually before doing cmake +# configuration. 
+# Register and download here https://software.seek.intel.com/performance-libraries +# +# Note: MKL is supported only for desktop builds and the framework itself supports the following +# hardware: +# +# * Intel® Xeon Phi™ processor +# * Intel® Xeon® processor +# * Intel® Core™ processor family +# * Intel Atom® processor +# +# If you have a different processor you can still try to use MKL, but performance results are +# unpredictable. +mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) + +# If the full MKL library could not be found the thinner subset MKLML will be downloaded +# unless switched off explicitly. +# Note: The same limitation on hardware as for MKL applies for MKLML as well. +mxnet_option(USE_MKLML "Use MKLML subset of MKL instead of full MKL, will be downloaded" ON IF + ${USE_MKL_IF_AVAILABLE} AND (NOT APPLE)) + +# If either MKL or MKLML is present MKLDNN can be utilised from the 3rdparty/mkldnn subrepo. +# See more information here: https://github.com/intel/mkl-dnn +# Note: The same limitation on hardware as for MKL and MKLML applies for MKLDNN as well. +mxnet_option(USE_MKLDNN "Use MKLDNN (separate addition to MKL, MKL/MKLML not required)" ON IF + ${USE_MKL_IF_AVAILABLE} AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) + +# Apple's mathematical framework, probably the best choice on a Mac if MKL/MKLML/MKLDNN +# are not available. +# https://developer.apple.com/documentation/accelerate +mxnet_option(USE_APPLE_ACCELERATE_IF_AVAILABLE "Use Apple Accelerate framework if found, \ + works if MKL not found or disabled" ON IF ${APPLE}) + +# Another important option of the math libraries is the presence of an additional set of +# mathematical functions gathered and named as the LAPACK (Linear Algebra Package). 
Some +# libraries don't include it, thus the cmake script will check the presence of an +# indicating function "cheev_" within the available chosen libraries and switch the +# functionality off if not found. +mxnet_option(USE_LAPACK "Build with LAPACK support" ON) + + +if(USE_CUDA AND NOT USE_OLDCMAKECUDA) + include(CheckLanguage) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + set(FIRST_CUDA TRUE) + else() + message(STATUS "No first class CUDA language support") + + set(FIRST_CUDA FALSE) + set(USE_OLDCMAKECUDA TRUE) + endif() +endif() if(MSVC) add_definitions(-DWIN32_LEAN_AND_MEAN) @@ -115,12 +210,12 @@ else(MSVC) if(USE_CXX14_IF_AVAILABLE) check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) endif() - check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) - check_cxx_compiler_flag("-std=c++0x" SUPPORT_CXX0X) + check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) + check_cxx_compiler_flag("-std=c++0x" SUPPORT_CXX0X) # For cross compilation, we can't rely on the compiler which accepts the flag, but mshadow will # add platform specific includes not available in other arches if(USE_SSE) - check_cxx_compiler_flag("-msse2" SUPPORT_MSSE2) + check_cxx_compiler_flag("-msse2" SUPPORT_MSSE2) else() set(SUPPORT_MSSE2 FALSE) endif() @@ -133,13 +228,13 @@ else(MSVC) set(SUPPORT_F16C FALSE) endif() if(SUPPORT_F16C) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") else() add_definitions(-DMSHADOW_USE_F16C=0) endif() set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_C_FLAGS "-Wall -Wno-unknown-pragmas -Wno-sign-compare") - if ("${CMAKE_CXX_COMPILER_ID}" MATCHES ".*Clang$") + if("${CMAKE_CXX_COMPILER_ID}" MATCHES ".*Clang$") set(CMAKE_C_FLAGS "-Wno-braced-scalar-init") endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -168,10 +263,6 @@ else(MSVC) endif() endif(MSVC) -if(NOT mxnet_LINKER_LIBS) - set(mxnet_LINKER_LIBS "") -endif(NOT mxnet_LINKER_LIBS) - if(USE_GPROF) message(STATUS "Using GPROF") 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-omit-frame-pointer -g -pg") @@ -208,25 +299,25 @@ if(USE_TENSORRT) find_package(Protobuf REQUIRED) find_library(ONNX_LIBRARY NAMES libonnx.so REQUIRED - PATHS ${ONNX_PATH} - DOC "Path to onnx library.") + PATHS ${ONNX_PATH} + DOC "Path to onnx library.") find_library(ONNX_PROTO_LIBRARY NAMES libonnx_proto.so REQUIRED - PATHS ${ONNX_PATH} - DOC "Path to onnx_proto library.") + PATHS ${ONNX_PATH} + DOC "Path to onnx_proto library.") find_library(ONNX_TRT_RUNTIME_LIBRARY NAMES libnvonnxparser_runtime.so REQUIRED - PATHS ${ONNX_TRT_PATH} - DOC "Path to onnx_proto library.") + PATHS ${ONNX_TRT_PATH} + DOC "Path to onnx_proto library.") find_library(ONNX_TRT_PARSER_LIBRARY NAMES libnvonnxparser.so REQUIRED - PATHS ${ONNX_TRT_PATH} - DOC "Path to onnx_proto library.") + PATHS ${ONNX_TRT_PATH} + DOC "Path to onnx_proto library.") list(APPEND mxnet_LINKER_LIBS libnvinfer.so ${ONNX_TRT_PARSER_LIBRARY} ${ONNX_TRT_RUNTIME_LIBRARY} - ${ONNX_PROTO_LIBRARY} ${ONNX_LIBRARY} ${PROTOBUF_LIBRARY}) + ${ONNX_PROTO_LIBRARY} ${ONNX_LIBRARY} ${PROTOBUF_LIBRARY}) endif() if(ENABLE_TESTCOVERAGE) message(STATUS "Compiling with test coverage support enabled. This will result in additional files being written to your source directory!") - find_program( GCOV_PATH gcov ) + find_program(GCOV_PATH gcov) if(NOT GCOV_PATH) message(FATAL_ERROR "gcov not found! 
Aborting...") endif() # NOT GCOV_PATH @@ -295,7 +386,7 @@ else() add_definitions(-DMXNET_USE_NCCL=0) endif() -include(cmake/ChooseBlas.cmake) +include(cmake/ChooseBLAS.cmake) if(USE_CUDA AND FIRST_CUDA) include(3rdparty/mshadow/cmake/Utils.cmake) include(cmake/FirstClassLangCuda.cmake) @@ -323,7 +414,7 @@ endif() list(APPEND mxnet_LINKER_LIBS ${mshadow_LINKER_LIBS}) foreach(var ${C_CXX_INCLUDE_DIRECTORIES}) - include_directories(${var}) + include_directories(${var}) endforeach() include_directories("include") @@ -443,7 +534,7 @@ endif() if(USE_LAPACK) message("USE_LAPACK is ON") add_definitions(-DMXNET_USE_LAPACK=1) - if (NOT MSVC) + if(NOT MSVC) list(APPEND mxnet_LINKER_LIBS lapack) endif() endif() @@ -477,7 +568,7 @@ if(USE_CUDNN AND USE_CUDA) add_definitions(-DUSE_CUDNN) include_directories(SYSTEM ${CUDNN_INCLUDE}) list(APPEND mxnet_LINKER_LIBS ${CUDNN_LIBRARY}) - add_definitions(-DMSHADOW_USE_CUDNN=1) + add_definitions(-DMSHADOW_USE_CUDNN=1) endif() endif() @@ -488,45 +579,45 @@ endif() if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mshadow/cmake) add_subdirectory("3rdparty/mshadow") endif() -FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") -FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") +file(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") +file(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") # add nnvm to source -FILE(GLOB_RECURSE NNVMSOURCE - 3rdparty/tvm/nnvm/src/c_api/*.cc - 3rdparty/tvm/nnvm/src/core/*.cc - 3rdparty/tvm/nnvm/src/pass/*.cc - 3rdparty/tvm/nnvm/src/c_api/*.h - 3rdparty/tvm/nnvm/src/core/*.h - 3rdparty/tvm/nnvm/src/pass/*.h - 3rdparty/tvm/nnvm/include/*.h) +file(GLOB_RECURSE NNVMSOURCE + 3rdparty/tvm/nnvm/src/c_api/*.cc + 3rdparty/tvm/nnvm/src/core/*.cc + 3rdparty/tvm/nnvm/src/pass/*.cc + 3rdparty/tvm/nnvm/src/c_api/*.h + 3rdparty/tvm/nnvm/src/core/*.h + 3rdparty/tvm/nnvm/src/pass/*.h + 3rdparty/tvm/nnvm/include/*.h) list(APPEND SOURCE ${NNVMSOURCE}) # add mshadow file -FILE(GLOB_RECURSE MSHADOWSOURCE 
"3rdparty/mshadow/mshadow/*.h") -FILE(GLOB_RECURSE MSHADOW_CUDASOURCE "3rdparty/mshadow/mshadow/*.cuh") +file(GLOB_RECURSE MSHADOWSOURCE "3rdparty/mshadow/mshadow/*.h") +file(GLOB_RECURSE MSHADOW_CUDASOURCE "3rdparty/mshadow/mshadow/*.cuh") list(APPEND SOURCE ${MSHADOWSOURCE}) list(APPEND CUDA ${MSHADOW_CUDASOURCE}) # add source group -FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "3rdparty/tvm/nnvm/*.cc" "plugin/*.cc") -FILE(GLOB_RECURSE GROUP_Include "src/*.h" "3rdparty/tvm/nnvm/*.h" "3rdparty/mshadow/mshadow/*.h" "plugin/*.h") -FILE(GLOB_RECURSE GROUP_CUDA "src/*.cu" "src/*.cuh" "3rdparty/mshadow/mshadow/*.cuh" "plugin/*.cu" - "plugin/*.cuh" "3rdparty/nvidia_cub/cub/*.cuh") +file(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "3rdparty/tvm/nnvm/*.cc" "plugin/*.cc") +file(GLOB_RECURSE GROUP_Include "src/*.h" "3rdparty/tvm/nnvm/*.h" "3rdparty/mshadow/mshadow/*.h" "plugin/*.h") +file(GLOB_RECURSE GROUP_CUDA "src/*.cu" "src/*.cuh" "3rdparty/mshadow/mshadow/*.cuh" "plugin/*.cu" + "plugin/*.cuh" "3rdparty/nvidia_cub/cub/*.cuh") assign_source_group("Source" ${GROUP_SOURCE}) assign_source_group("Include" ${GROUP_Include}) assign_source_group("CUDA" ${GROUP_CUDA}) if(USE_PLUGINS_WARPCTC) - set(WARPCTC_INCLUDE "" CACHE PATH "WARPCTC include") - set(WARPCTC_LIB_DEBUG "" CACHE FILEPATH "WARPCTC lib") - set(WARPCTC_LIB_RELEASE "" CACHE FILEPATH "WARPCTC lib") - include_directories(SYSTEM ${WARPCTC_INCLUDE}) - list(APPEND mxnet_LINKER_LIBS ${WARPCTC_LIB}) - FILE(GLOB_RECURSE PLUGINS_SOURCE "plugin/warpctc/*.cc" "plugin/warpctc/*.h") - FILE(GLOB_RECURSE PLUGINS_CUSRC "plugin/warpctc/*.cu") - list(APPEND SOURCE ${PLUGINS_SOURCE}) - list(APPEND CUDA ${PLUGINS_CUSRC}) + set(WARPCTC_INCLUDE "" CACHE PATH "WARPCTC include") + set(WARPCTC_LIB_DEBUG "" CACHE FILEPATH "WARPCTC lib") + set(WARPCTC_LIB_RELEASE "" CACHE FILEPATH "WARPCTC lib") + include_directories(SYSTEM ${WARPCTC_INCLUDE}) + list(APPEND mxnet_LINKER_LIBS ${WARPCTC_LIB}) + file(GLOB_RECURSE PLUGINS_SOURCE "plugin/warpctc/*.cc" 
"plugin/warpctc/*.h") + file(GLOB_RECURSE PLUGINS_CUSRC "plugin/warpctc/*.cu") + list(APPEND SOURCE ${PLUGINS_SOURCE}) + list(APPEND CUDA ${PLUGINS_CUSRC}) endif() if(USE_OPERATOR_TUNING AND USE_OPENMP) @@ -555,31 +646,31 @@ if(USE_PLUGIN_CAFFE) if(NOT DEFINED CAFFE_PATH) message(FATAL_ERROR "Please set CAFFE_PATH to point to the caffe source installation") endif() - FILE(GLOB_RECURSE PLUGINS_SOURCE "plugin/caffe/*.cc" "plugin/caffe/*.h") - FILE(GLOB_RECURSE PLUGINS_CUSRC "plugin/caffe/*.cu") + file(GLOB_RECURSE PLUGINS_SOURCE "plugin/caffe/*.cc" "plugin/caffe/*.h") + file(GLOB_RECURSE PLUGINS_CUSRC "plugin/caffe/*.cu") list(APPEND SOURCE ${PLUGINS_SOURCE}) list(APPEND CUDA ${PLUGINS_CUSRC}) include_directories(${CMAKE_BINARY_DIR}/include) add_definitions(-DMXNET_USE_CAFFE=1) list(APPEND mxnet_LINKER_LIBS - protobuf boost_system boost_thread boost_filesystem - gflags glog caffe - ${Caffe_LINKER_LIBS} -) + protobuf boost_system boost_thread boost_filesystem + gflags glog caffe + ${Caffe_LINKER_LIBS} + ) endif() -if (NOT (EXTRA_OPERATORS STREQUAL "")) - mxnet_source_group("Extra" GLOB_RECURSE "${EXTRA_OPERATORS}/*.cc") - mxnet_source_group("Extra\\Cuda" GLOB_RECURSE "${EXTRA_OPERATORS}/*.cu") - FILE(GLOB_RECURSE EXTRA_SRC "${EXTRA_OPERATORS}/*.cc") - FILE(GLOB_RECURSE EXTRA_CUSRC "${EXTRA_OPERATORS}/*.cu") - list(APPEND SOURCE ${EXTRA_SRC} ${EXTRA_CUSRC}) +if(NOT (EXTRA_OPERATORS STREQUAL "")) + mxnet_source_group("Extra" GLOB_RECURSE "${EXTRA_OPERATORS}/*.cc") + mxnet_source_group("Extra\\Cuda" GLOB_RECURSE "${EXTRA_OPERATORS}/*.cu") + file(GLOB_RECURSE EXTRA_SRC "${EXTRA_OPERATORS}/*.cc") + file(GLOB_RECURSE EXTRA_CUSRC "${EXTRA_OPERATORS}/*.cu") + list(APPEND SOURCE ${EXTRA_SRC} ${EXTRA_CUSRC}) endif() if(MSVC) foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL 
CMAKE_CXX_FLAGS_RELWITHDEBINFO) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") endif(${flag_var} MATCHES "/MD") @@ -593,8 +684,8 @@ if(USE_CUDA) set(CMAKE_CUDA_FLAGS "${NVCC_FLAGS_ARCH}") list(APPEND mxnet_LINKER_LIBS cublas cufft cusolver curand) if(ENABLE_CUDA_RTC) - list(APPEND mxnet_LINKER_LIBS nvrtc cuda) - add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) + list(APPEND mxnet_LINKER_LIBS nvrtc cuda) + add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) endif() list(APPEND SOURCE ${CUDA}) add_definitions(-DMXNET_USE_CUDA=1) @@ -604,42 +695,42 @@ if(USE_CUDA) # define preprocessor macro so that we will not include the generated forcelink header mshadow_cuda_compile(cuda_objs ${CUDA}) if(MSVC) - if(ENABLE_CUDA_RTC) - FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY}) - set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib") - list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY}) - add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) - endif() - FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator - FIND_LIBRARY(CUDA_cusolver_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS "${CUDA_cusolver_LIBRARY}/../cusolver.lib") # For cusolver - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/win32) - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) + if(ENABLE_CUDA_RTC) + find_library(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") + list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY}) + set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib") + list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY}) + add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) + endif() + 
find_library(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") + list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator + find_library(CUDA_cusolver_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") + list(APPEND mxnet_LINKER_LIBS "${CUDA_cusolver_LIBRARY}/../cusolver.lib") # For cusolver + link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/win32) + link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) else(MSVC) - list(APPEND mxnet_LINKER_LIBS cufft cusolver) - if(ENABLE_CUDA_RTC) - list(APPEND mxnet_LINKER_LIBS nvrtc cuda) - add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) - endif() - link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") + list(APPEND mxnet_LINKER_LIBS cufft cusolver) + if(ENABLE_CUDA_RTC) + list(APPEND mxnet_LINKER_LIBS nvrtc cuda) + add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) + endif() + link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") endif() list(APPEND SOURCE ${cuda_objs} ${CUDA}) add_definitions(-DMXNET_USE_CUDA=1) if(CUDA_LIBRARY_PATH) - if(IS_CONTAINER_BUILD) + if(IS_CONTAINER_BUILD) # In case of building on a production-like build container which may not have Cuda installed if(NOT CMAKE_SYSTEM_HAS_CUDA) - # Assuming building in a container that doesn't have CUDA installed (ie CPU-only build machine) - # so use the stub cuda driver shared library - if(EXISTS ${CUDA_LIBRARY_PATH}/stubs/libcuda.so) + # Assuming building in a container that doesn't have CUDA installed (ie CPU-only build machine) + # so use the stub cuda driver shared library + if(EXISTS ${CUDA_LIBRARY_PATH}/stubs/libcuda.so) link_directories(${CUDA_LIBRARY_PATH}/stubs) - endif() - endif() + endif() endif() + endif() endif() - endif() + endif() endif() # unsupported: if caffe is a subdirectory of mxnet, load its CMakeLists.txt as well @@ -725,13 +816,13 @@ if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2) target_link_libraries(im2rec ${BEGIN_WHOLE_ARCHIVE} mxnet_static 
${END_WHOLE_ARCHIVE}) endif() target_link_libraries(im2rec - ${mxnet_LINKER_LIBS} - ${OpenCV_LIBS} - dmlc - ${pslite_LINKER_LIBS} - ) + ${mxnet_LINKER_LIBS} + ${OpenCV_LIBS} + dmlc + ${pslite_LINKER_LIBS} + ) else() - message(WARNING "OpenCV_VERSION_MAJOR: ${OpenCV_VERSION_MAJOR}, version 3 with imgcodecs \ + message(WARNING "OpenCV_VERSION_MAJOR: ${OpenCV_VERSION_MAJOR}, version 3 with imgcodecs \ is required for im2rec, im2rec will not be available") endif() @@ -745,10 +836,10 @@ add_subdirectory(tests) include(GNUInstallDirs) install(TARGETS ${MXNET_INSTALL_TARGETS} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} -) + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) # NOTE: Public headers will be installed into ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}, see # https://cmake.org/cmake/help/v3.0/variable/CMAKE_INSTALL_PREFIX.html @@ -756,27 +847,27 @@ install(TARGETS ${MXNET_INSTALL_TARGETS} install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install(DIRECTORY 3rdparty/tvm/nnvm/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -if (INSTALL_EXAMPLES) - install(DIRECTORY example DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}) +if(INSTALL_EXAMPLES) + install(DIRECTORY example DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}) endif() -if (USE_SIGNAL_HANDLER) - add_definitions(-DMXNET_USE_SIGNAL_HANDLER=1) +if(USE_SIGNAL_HANDLER) + add_definitions(-DMXNET_USE_SIGNAL_HANDLER=1) endif() # AUTO_INSTALL_DIR -> Optional: specify post-build install direcory if(AUTO_INSTALL_DIR) # ---[ Install Includes add_custom_command(TARGET mxnet POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory - ${CMAKE_CURRENT_SOURCE_DIR}/include ${AUTO_INSTALL_DIR}/include - ) + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_CURRENT_SOURCE_DIR}/include 
${AUTO_INSTALL_DIR}/include + ) # ---[ Install Examples add_custom_command(TARGET mxnet POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory - ${CMAKE_CURRENT_SOURCE_DIR}/example ${AUTO_INSTALL_DIR}/example - ) + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_CURRENT_SOURCE_DIR}/example ${AUTO_INSTALL_DIR}/example + ) endif() if(INSTALL_PYTHON_VERSIONS) @@ -784,9 +875,9 @@ if(INSTALL_PYTHON_VERSIONS) foreach(version ${INSTALL_PYTHON_VERSIONS}) set(outdir ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/python${version}/site-packages/mxnet) add_custom_command(TARGET mxnet POST_BUILD - COMMAND mkdir -p ${outdir} - COMMAND cp -ru ${CMAKE_CURRENT_SOURCE_DIR}/python/mxnet/* ${outdir} - ) + COMMAND mkdir -p ${outdir} + COMMAND cp -ru ${CMAKE_CURRENT_SOURCE_DIR}/python/mxnet/* ${outdir} + ) endforeach() endif() diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index a89c51de0d8e..d6bac55256ca 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -717,7 +717,8 @@ build_ubuntu_gpu_cmake_mkldnn() { -DENABLE_TESTCOVERAGE=ON \ -DUSE_CUDA=1 \ -DUSE_CUDNN=1 \ - -DUSE_MKLML_MKL=1 \ + -DUSE_MKLML=1 \ + -DUSE_MKLDNN=1 \ -DCMAKE_BUILD_TYPE=Release \ -DCUDA_ARCH_NAME=Manual \ -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ @@ -740,12 +741,12 @@ build_ubuntu_gpu_cmake() { -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DUSE_SIGNAL_HANDLER=ON \ -DENABLE_TESTCOVERAGE=ON \ - -DUSE_CUDA=ON \ - -DUSE_CUDNN=ON \ + -DUSE_CUDA=1 \ + -DUSE_CUDNN=1 \ -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_MKLML_MKL=OFF \ - -DUSE_MKLDNN=OFF \ - -DUSE_DIST_KVSTORE=ON \ + -DUSE_MKLML=0 \ + -DUSE_MKLDNN=0 \ + -DUSE_DIST_KVSTORE=1 \ -DCMAKE_BUILD_TYPE=Release \ -DCUDA_ARCH_NAME=Manual \ -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBLAS.cmake similarity index 100% rename from cmake/ChooseBlas.cmake rename to cmake/ChooseBLAS.cmake diff --git a/cmake/cmake_options.yml b/cmake/cmake_options.yml index a4323feb92d4..365c851a19f3 100644 --- 
a/cmake/cmake_options.yml +++ b/cmake/cmake_options.yml @@ -26,7 +26,7 @@ USE_SSE: "ON" # Build with x86 SSE instruction support IF NOT ARM USE_F16C: "ON" # Build with x86 F16C instruction support) # autodetects support if "ON" USE_LAPACK: "ON" # Build with lapack support USE_MKL_IF_AVAILABLE: "ON" # Use MKL if found -USE_MKLML_MKL: "ON" # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) +USE_MKLML: "ON" # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) USE_MKLDNN: "ON" # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) USE_OPERATOR_TUNING: "ON" # Enable auto-tuning of operators IF NOT MSVC USE_GPERFTOOLS: "ON" # Build with GPerfTools support (if found) From d8cabdc5495bbd7823f31e5726fbfd6ff67e99c6 Mon Sep 17 00:00:00 2001 From: Anton Chernov Date: Wed, 30 Jan 2019 10:43:03 +0100 Subject: [PATCH 2/2] Added USE_MKLML_MKL usage warning --- CMakeLists.txt | 4 ++++ cmake/cmake_options.yml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ecb3a995b9e4..86128f49aebd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,6 +156,10 @@ mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) mxnet_option(USE_MKLML "Use MKLML subset of MKL instead of full MKL, will be downloaded" ON IF ${USE_MKL_IF_AVAILABLE} AND (NOT APPLE)) +if(DEFINED USE_MKLML_MKL) + message(WARNING "The option USE_MKLML_MKL is deprecated and will not be considered, use USE_MKLML instead") +endif() + # If either MKL of MKLML is present MKLDNN can be utilised from the 3rdparty/mkldnn subrepo. # See more information here: https://github.com/intel/mkl-dnn # Note: The same limitation on hardware as for MKL and MKLDNN applies for MKLDNN as well. 
diff --git a/cmake/cmake_options.yml b/cmake/cmake_options.yml index 365c851a19f3..5eda3c728a11 100644 --- a/cmake/cmake_options.yml +++ b/cmake/cmake_options.yml @@ -26,7 +26,7 @@ USE_SSE: "ON" # Build with x86 SSE instruction support IF NOT ARM USE_F16C: "ON" # Build with x86 F16C instruction support) # autodetects support if "ON" USE_LAPACK: "ON" # Build with lapack support USE_MKL_IF_AVAILABLE: "ON" # Use MKL if found -USE_MKLML: "ON" # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) +USE_MKLML: "ON" # Use MKLML variant of MKL IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) USE_MKLDNN: "ON" # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) USE_OPERATOR_TUNING: "ON" # Enable auto-tuning of operators IF NOT MSVC USE_GPERFTOOLS: "ON" # Build with GPerfTools support (if found)