diff --git a/.licenserc.yaml b/.licenserc.yaml index f7e262af1235..dc6e8faa8ec8 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -35,7 +35,7 @@ header: - 'src/operator/contrib/multi_proposal.cu' - 'src/operator/contrib/psroi_pooling.cc' - 'src/operator/contrib/psroi_pooling.cu' - - 'src/operator/nn/mkldnn/mkldnn_base-inl.h' + - 'src/operator/nn/dnnl/dnnl_base-inl.h' # files licensed under boost license - 'cmake/Modules/FindJeMalloc.cmake' # files licensed under bsd 3-clause @@ -64,7 +64,7 @@ header: - 'include/mshadow' # symlink to 3rdparty/mshadow/mshadow - 'include/onednn' # symlinks to 3rdparty/onednn # test/build data - - 'tests/python/mkl/data/test_mkldnn_test_mkldnn_model_model1.json' + - 'tests/python/dnnl/data/test_dnnl_test_dnnl_model_model1.json' comment: on-failure diff --git a/CMakeLists.txt b/CMakeLists.txt index 882e8b09d404..6966920a1bfe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -273,16 +273,16 @@ if(USE_ONEDNN) endif() function(load_onednn) - set(MKLDNN_BUILD_TESTS OFF CACHE INTERNAL "" FORCE) - set(MKLDNN_BUILD_EXAMPLES OFF CACHE INTERNAL "" FORCE) - set(MKLDNN_ARCH_OPT_FLAGS "" CACHE INTERNAL "" FORCE) - set(MKLDNN_ENABLE_JIT_PROFILING OFF CACHE INTERNAL "" FORCE) - set(MKLDNN_LIBRARY_TYPE STATIC CACHE INTERNAL "" FORCE) + set(DNNL_BUILD_TESTS OFF CACHE INTERNAL "" FORCE) + set(DNNL_BUILD_EXAMPLES OFF CACHE INTERNAL "" FORCE) + set(DNNL_ARCH_OPT_FLAGS "" CACHE INTERNAL "" FORCE) + set(DNNL_ENABLE_JIT_PROFILING OFF CACHE INTERNAL "" FORCE) + set(DNNL_LIBRARY_TYPE STATIC CACHE INTERNAL "" FORCE) set(DNNL_ENABLE_CONCURRENT_EXEC ON CACHE INTERNAL "" FORCE) set(DNNL_ENABLE_PRIMITIVE_CACHE ON CACHE INTERNAL "" FORCE) if(NOT USE_OPENMP) - set(MKLDNN_CPU_RUNTIME SEQ CACHE INTERNAL "" FORCE) + set(DNNL_CPU_RUNTIME SEQ CACHE INTERNAL "" FORCE) endif() set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/onednn") diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index c02414cad3f1..47b491d6b74e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -104,7 +104,7 @@ healthy project. The PPMC actively seeks to appoint new committers from the list * [Aaron Markham](https://github.com/aaronmarkham) * [Alex Zai](https://github.com/azai91) * [Anirudh Acharya](https://github.com/anirudhacharya) -* [Anna Karbownik]((https://github.com/akarbown) +* [Anna Karbownik](https://github.com/akarbown) * [Aston Zhang](https://github.com/astonzhang) * [Chaitanya Bapat](https://github.com/ChaiBapchya) * [Ciyong Chen](https://github.com/ciyongch) diff --git a/MKLDNN_README.md b/DNNL_README.md similarity index 83% rename from MKLDNN_README.md rename to DNNL_README.md index 795e5023f67e..d9d9d35b54be 100644 --- a/MKLDNN_README.md +++ b/DNNL_README.md @@ -18,4 +18,4 @@ ~ --> -File is moved to [docs/tutorials/mkldnn/MKLDNN_README.md](docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md). +File is moved to [docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md](docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md). 
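The `DNNL_*` cache variables set in `load_onednn()` above only take effect when MXNet is configured with oneDNN support. A minimal sketch of such a configure step, assuming an out-of-source `build` directory; only the `USE_ONEDNN` flag comes from the CMakeLists.txt hunk above, everything else here is illustrative:
```
# Sketch only: configure and build MXNet with the statically linked oneDNN backend.
# The build directory layout and the parallel build step are assumptions, not part of this patch.
mkdir -p build && cd build
cmake -DUSE_ONEDNN=ON ..
cmake --build . --parallel
```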
diff --git a/LICENSE b/LICENSE index 80abbe115d90..d06d50ff5cae 100644 --- a/LICENSE +++ b/LICENSE @@ -252,7 +252,7 @@ src/operator/contrib/multi_proposal.cu src/operator/contrib/psroi_pooling.cc src/operator/contrib/psroi_pooling.cu - src/operator/nn/mkldnn/mkldnn_base-inl.h + src/operator/nn/dnnl/dnnl_base-inl.h ======================================================================================= MIT license diff --git a/README.md b/README.md index baf1e5e439e7..638987f106d5 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ What's New ### Ecosystem News -* [ONEDNN for Faster CPU Performance](docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md) +* [oneDNN for Faster CPU Performance](docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md) * [MXNet Memory Monger, Training Deeper Nets with Sublinear Memory Cost](https://github.com/dmlc/mxnet-memonger) * [Tutorial for NVidia GTC 2016](https://github.com/dmlc/mxnet-gtc-tutorial) * [MXNet.js: Javascript Package for Deep Learning in Browser (without server)](https://github.com/dmlc/mxnet.js/) diff --git a/cd/python/docker/test_python_image.sh b/cd/python/docker/test_python_image.sh index be4f9dc78d1e..88f21db37cbb 100755 --- a/cd/python/docker/test_python_image.sh +++ b/cd/python/docker/test_python_image.sh @@ -33,7 +33,7 @@ fi # Execute tests if [[ $mxnet_variant != native ]]; then - python3 tests/python/mkl/test_mkldnn.py + python3 tests/python/dnnl/test_dnnl.py fi # TODO: Add more tests (18549) diff --git a/cd/python/pypi/pypi_package.sh b/cd/python/pypi/pypi_package.sh index 076f85a2b1bf..26626ef422e2 100755 --- a/cd/python/pypi/pypi_package.sh +++ b/cd/python/pypi/pypi_package.sh @@ -22,11 +22,10 @@ set -ex export mxnet_variant=${1:?"Please specify the mxnet variant"} # Due to this PR: https://github.com/apache/incubator-mxnet/pull/14899 -# The setup.py expects that mkldnn_version.h be present in +# The setup.py expects that dnnl_version.h be present in # mxnet-build/3rdparty/onednn/build/install/include # The artifact repository stores this file in the dependencies # and CD unpacks it to a directory called cd_misc -# Nov. 2019 Update: With v1.1, MKL-DNN is renaming to DNNL. Hence changing the prefix of file name. if [ -f "cd_misc/dnnl_version.h" ]; then mkdir -p 3rdparty/onednn/include/oneapi/dnnl cp cd_misc/dnnl_version.h 3rdparty/onednn/include/oneapi/dnnl/. 
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index be0f1cab796e..3345ee7efbdc 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -801,7 +801,7 @@ cd_unittest_ubuntu() { fi if [[ ${mxnet_variant} = *mkl ]]; then - OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -n 4 --durations=50 --verbose tests/python/mkl + OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -n 4 --durations=50 --verbose tests/python/dnnl fi } @@ -841,7 +841,7 @@ unittest_ubuntu_python3_cpu_onednn() { MXNET_ENGINE_TYPE=NaiveEngine \ OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest - pytest --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/mkl + pytest --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/dnnl } unittest_array_api_standardization() { diff --git a/cpp-package/example/inference/README.md b/cpp-package/example/inference/README.md index ddc8a19e36c7..fc81dea45b0b 100644 --- a/cpp-package/example/inference/README.md +++ b/cpp-package/example/inference/README.md @@ -30,7 +30,7 @@ This directory contains following examples. In order to run the examples, ensure ## [imagenet_inference.cpp]() -This example demonstrates image classification workflow with pre-trained models using MXNet C++ API. Now this script also supports inference with quantized CNN models generated by IntelĀ® MKL-DNN (see this [quantization flow](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md)). By using C++ API, the latency of most models will be reduced to some extent compared with current Python implementation. +This example demonstrates image classification workflow with pre-trained models using MXNet C++ API. Now this script also supports inference with quantized CNN models generated by oneDNN (see this [quantization flow](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md)). By using C++ API, the latency of most models will be reduced to some extent compared with current Python implementation. Most of CNN models have been tested on Linux systems. And 50000 images are used to collect accuracy numbers. Please refer to this [README](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md) for more details about accuracy. diff --git a/docs/python_docs/python/tutorials/index.rst b/docs/python_docs/python/tutorials/index.rst index 2e0de421e50b..e9a61be097fb 100644 --- a/docs/python_docs/python/tutorials/index.rst +++ b/docs/python_docs/python/tutorials/index.rst @@ -84,10 +84,10 @@ Performance How to use int8 in your model to boost training speed. .. card:: - :title: MKL-DNN + :title: oneDNN :link: performance/backend/mkldnn/index.html - How to get the most from your CPU by using Intel's MKL-DNN. + How to get the most from your CPU by using oneDNN. .. 
card:: :title: TVM diff --git a/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md b/docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md similarity index 96% rename from docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md rename to docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md index 8ff92fe1a9a5..e68dc53a780b 100644 --- a/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md +++ b/docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md @@ -208,11 +208,11 @@ o = exe.outputs[0] t = o.asnumpy() ``` -More detailed debugging and profiling information can be logged by setting the environment variable 'MKLDNN_VERBOSE': +More detailed debugging and profiling information can be logged by setting the environment variable 'DNNL_VERBOSE': ``` -export MKLDNN_VERBOSE=1 +export DNNL_VERBOSE=1 ``` -For example, by running above code snippet, the following debugging logs providing more insights on ONEDNN primitives `convolution` and `reorder`. That includes: Memory layout, infer shape and the time cost of primitive execution. +For example, running the above code snippet produces the following debugging logs, which give more insight into the oneDNN primitives `convolution` and `reorder`: the memory layout, the inferred shapes, and the execution time of each primitive. ``` dnnl_verbose,info,DNNL v1.1.2 (commit cb2cc7ac17ff4e2ef50805c7048d33256d82be4d) dnnl_verbose,info,Detected ISA is Intel AVX-512 with Intel DL Boost @@ -281,7 +281,7 @@ MKL_VERBOSE SGEMM(T,N,12,10,8,0x7f7f927b1378,0x1bc2140,8,0x1ba8040,8,0x7f7f927b1 Graph optimization with subgraph is available and enabled by default in master branch. For MXNet release v1.5, you can manually enable it by: ``` -export MXNET_SUBGRAPH_BACKEND=MKLDNN +export MXNET_SUBGRAPH_BACKEND=ONEDNN ``` This limitations of this experimental feature are: @@ -293,7 +293,7 @@ This limitations of this experimental feature are:

Quantization and Inference with INT8

-Benefiting from Intel ONEDNN, MXNet built with Intel ONEDNN brings outstanding performance improvement on quantization and inference with INT8 Intel CPU Platform on Intel Xeon Scalable Platform. +Benefiting from oneDNN, MXNet built with oneDNN brings outstanding performance improvement on quantization and inference with INT8 Intel CPU Platform on Intel Xeon Scalable Platform. - [CNN Quantization Examples](https://github.com/apache/incubator-mxnet/tree/master/example/quantization). @@ -303,6 +303,6 @@ Benefiting from Intel ONEDNN, MXNet built with Intel ONEDNN brings outstanding p - For questions or support specific to MKL, visit the [Intel MKL](https://software.intel.com/en-us/mkl) website. -- For questions or support specific to ONEDNN, visit the [Intel ONEDNN](https://github.com/oneapi-src/oneDNN) website. +- For questions or support specific to oneDNN, visit the [oneDNN](https://github.com/oneapi-src/oneDNN) website. -- If you find bugs, please open an issue on GitHub for [MXNet with MKL](https://github.com/apache/incubator-mxnet/labels/MKL) or [MXNet with ONEDNN](https://github.com/apache/incubator-mxnet/labels/MKLDNN). +- If you find bugs, please open an issue on GitHub for [MXNet with MKL](https://github.com/apache/incubator-mxnet/labels/MKL) or [MXNet with oneDNN](https://github.com/apache/incubator-mxnet/labels/MKLDNN). diff --git a/docs/python_docs/python/tutorials/performance/backend/mkldnn/index.rst b/docs/python_docs/python/tutorials/performance/backend/dnnl/index.rst similarity index 78% rename from docs/python_docs/python/tutorials/performance/backend/mkldnn/index.rst rename to docs/python_docs/python/tutorials/performance/backend/dnnl/index.rst index ec8585599603..116458c2d20d 100644 --- a/docs/python_docs/python/tutorials/performance/backend/mkldnn/index.rst +++ b/docs/python_docs/python/tutorials/performance/backend/dnnl/index.rst @@ -15,22 +15,22 @@ specific language governing permissions and limitations under the License. -Intel MKL-DNN +oneDNN ============= .. container:: cards .. card:: - :title: MKL-DNN Installation and Verification - :link: mkldnn_readme + :title: oneDNN Installation and Verification + :link: dnnl_readme - A guide on using MKL-DNN with MXNet. + A guide on using oneDNN with MXNet. .. card:: - :title: MKL-DNN Quantization - :link: mkldnn_quantization + :title: oneDNN Quantization + :link: dnnl_quantization - How to perform quantization with MKLDNN + How to perform quantization with oneDNN .. toctree:: :hidden: diff --git a/docs/python_docs/python/tutorials/performance/backend/index.rst b/docs/python_docs/python/tutorials/performance/backend/index.rst index 942f3994a5b2..d9b2947eb00b 100644 --- a/docs/python_docs/python/tutorials/performance/backend/index.rst +++ b/docs/python_docs/python/tutorials/performance/backend/index.rst @@ -22,10 +22,10 @@ The following tutorials will help you learn how to use backend tools to boost pe .. container:: cards .. card:: - :title: MKL-DNN - :link: mkldnn/index.html + :title: oneDNN + :link: dnnl/index.html - How to get the most from your CPU by using Intel's MKL-DNN. + How to get the most from your CPU by using oneDNN. .. 
card:: :title: TVM @@ -50,7 +50,7 @@ The following tutorials will help you learn how to use backend tools to boost pe :hidden: :maxdepth: 1 - mkldnn/index + dnnl/index tvm profiler amp diff --git a/docs/python_docs/python/tutorials/performance/backend/profiler.md b/docs/python_docs/python/tutorials/performance/backend/profiler.md index f935e46f2258..e56019405287 100644 --- a/docs/python_docs/python/tutorials/performance/backend/profiler.md +++ b/docs/python_docs/python/tutorials/performance/backend/profiler.md @@ -211,11 +211,11 @@ Let's zoom in to check the time taken by operators The above picture visualizes the sequence in which the operators were executed and the time taken by each operator. ### Profiling ONEDNN Operators -Reagrding ONEDNN operators, the library has already provided the internal profiling tool. Firstly, you need set `MKLDNN_VERBOSE=1` to enable internal profiler. +Regarding oneDNN operators, the library already provides an internal profiling tool. First, you need to set `DNNL_VERBOSE=1` to enable the internal profiler. -`$ MKLDNN_VERBOSE=1 python my_script.py > mkldnn_verbose.log` +`$ DNNL_VERBOSE=1 python my_script.py > dnnl_verbose.log` -Now, the detailed profiling insights of each ONEDNN prmitive are saved into `mkldnn_verbose.log` (like below). +Now, the detailed profiling insights of each oneDNN primitive are saved into `dnnl_verbose.log` (like below). ``` dnnl_verbose,info,DNNL v1.1.2 (commit cb2cc7ac17ff4e2ef50805c7048d33256d82be4d) @@ -225,13 +225,13 @@ dnnl_verbose,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::b For example, if you want to calculate the total executing time of `convolution` primitive, you can just run: -`$ cat mkldnn_verbose.log | grep "exec,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'` +`$ cat dnnl_verbose.log | grep "exec,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'` -Moreover, you can set `MKLDNN_VERBOSE=2` to collect both creating and executing time of each primitive. +Moreover, you can set `DNNL_VERBOSE=2` to collect both the creation and execution time of each primitive. -`$ cat mkldnn_verbose.log | grep "create,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'` +`$ cat dnnl_verbose.log | grep "create,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'` -`$ cat mkldnn_verbose.log | grep "exec,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'` +`$ cat dnnl_verbose.log | grep "exec,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'` ### Profiling Custom Operators diff --git a/docs/python_docs/python/tutorials/performance/index.rst b/docs/python_docs/python/tutorials/performance/index.rst index 43c548eb9ffb..f4491dba5af8 100644 --- a/docs/python_docs/python/tutorials/performance/index.rst +++ b/docs/python_docs/python/tutorials/performance/index.rst @@ -76,10 +76,10 @@ Accelerated Backend .. TBD Content .. card:: - :title: MKL-DNN - :link: backend/mkldnn/mkldnn_readme + :title: oneDNN + :link: backend/dnnl/dnnl_readme - How to get the most from your CPU by using Intel's MKL-DNN. + How to get the most from your CPU by using oneDNN. .. card:: :title: TVM diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md index a4b4915c84c6..eed6cf3d9fc0 100644 --- a/docs/static_site/src/pages/api/faq/env_var.md +++ b/docs/static_site/src/pages/api/faq/env_var.md @@ -375,9 +375,9 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
- This variable controls how many CuDNN dropout state resources to create for each GPU context for use in operator. * MXNET_SUBGRAPH_BACKEND - - Values: String ```(default="MKLDNN")``` if ONEDNN is avaliable, otherwise ```(default="")``` + - Values: String ```(default="ONEDNN")``` if oneDNN is available, otherwise ```(default="")``` - This variable controls the subgraph partitioning in MXNet. - - This variable is used to perform ONEDNN FP32 operator fusion and quantization. Please refer to the [ONEDNN operator list](https://github.com/apache/incubator-mxnet/blob/v1.5.x/docs/tutorials/mkldnn/operator_list.md) for how this variable is used and the list of fusion passes. + - This variable is used to perform oneDNN FP32 operator fusion and quantization. Please refer to the [oneDNN operator list](https://github.com/apache/incubator-mxnet/blob/v1.5.x/docs/tutorials/mkldnn/operator_list.md) for how this variable is used and the list of fusion passes. - Set ```MXNET_SUBGRAPH_BACKEND=NONE``` to disable subgraph backend. * MXNET_SAFE_ACCUMULATION diff --git a/docs/static_site/src/pages/api/faq/perf.md b/docs/static_site/src/pages/api/faq/perf.md index 083ef6974f10..0759afcc0163 100644 --- a/docs/static_site/src/pages/api/faq/perf.md +++ b/docs/static_site/src/pages/api/faq/perf.md @@ -58,7 +58,7 @@ We also find that setting the following environment variables can help: | :-------- | :---------- | | `OMP_NUM_THREADS` | Suggested value: `vCPUs / 2` in which `vCPUs` is the number of virtual CPUs. For more information, please see the guide for [setting the number of threads using an OpenMP environment variable](https://software.intel.com/en-us/mkl-windows-developer-guide-setting-the-number-of-threads-using-an-openmp-environment-variable) | | `KMP_AFFINITY` | Suggested value: `granularity=fine,compact,1,0`. For more information, please see the guide for [Thread Affinity Interface (Linux* and Windows*)](https://software.intel.com/en-us/node/522691). | -| `MXNET_SUBGRAPH_BACKEND` | Set to ONEDNN to enable the [subgraph feature](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN) for better performance. For more information please see [Build/Install MXNet with ONEDNN](https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/mkldnn/mkldnn_readme.html)| +| `MXNET_SUBGRAPH_BACKEND` | Set to ONEDNN to enable the [subgraph feature](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN) for better performance. For more information please see [Build/Install MXNet with oneDNN](https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/dnnl/dnnl_readme.html)| Note that _MXNet_ treats all CPUs on a single machine as a single device. So whether you specify `cpu(0)` or `cpu()`, _MXNet_ will use all CPU cores on the machine. 
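Taken together, the settings described in env_var.md and perf.md above are typically combined as in the sketch below; `my_script.py` is the placeholder script name already used in the profiler documentation, and the thread settings follow the suggested values:
```
# Sketch: enable the oneDNN subgraph backend and oneDNN verbose logging for a CPU run.
export MXNET_SUBGRAPH_BACKEND=ONEDNN              # oneDNN FP32 fusion / quantization passes
export OMP_NUM_THREADS=$(( $(nproc) / 2 ))        # suggested value: vCPUs / 2
export KMP_AFFINITY=granularity=fine,compact,1,0
DNNL_VERBOSE=1 python my_script.py > dnnl_verbose.log
```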
diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 12b083c67576..e374523490f0 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -541,7 +541,7 @@ inline std::ostream& operator<<(std::ostream &out, const Context &ctx) { #if MXNET_USE_ONEDNN == 1 || MXNET_USE_INTGEMM == 1 -constexpr size_t kMKLDNNAlign = 64; +constexpr size_t kDNNLAlign = 64; #endif } // namespace mxnet diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 25c5850fd8c9..5e6af4d7f768 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -37,7 +37,7 @@ #include #include #if MXNET_USE_ONEDNN == 1 -#include +#include #endif #include "./base.h" #include "./engine.h" @@ -73,7 +73,7 @@ enum NDArrayFormatErr { kRSPIdxErr, // indices error for row sparse }; -class MKLDNNMemory; +class DNNLMemory; /*! * \brief ndarray interface @@ -217,7 +217,7 @@ class NDArray { /* * This indicates whether an array is a view of another array (created by * reshape or slice). If an array is a view and the data is stored in - * MKLDNN format, we need to convert the data to the default format when + * DNNL format, we need to convert the data to the default format when * data in the view is accessed. */ inline bool IsView() const { @@ -729,20 +729,20 @@ class NDArray { #if MXNET_USE_ONEDNN == 1 /* - * Create NDArray from mkldnn memory. - * mkldnn_mem The mkldnn memory to be managed. + * Create NDArray from dnnl memory. + * dnnl_mem The dnnl memory to be managed. */ - explicit NDArray(const std::shared_ptr& mkldnn_mem); + explicit NDArray(const std::shared_ptr& dnnl_mem); /* - * Create NDArray from mkldnn memory descriptor. - * mem_pd The mkldnn memory descriptor to be created. + * Create NDArray from dnnl memory descriptor. + * mem_pd The dnnl memory descriptor to be created. */ - explicit NDArray(const mkldnn::memory::desc& md); + explicit NDArray(const dnnl::memory::desc& md); /* - * Test if the data is stored in one of special MKLDNN format. + * Test if the data is stored in one of special DNNL format. */ - bool IsMKLDNNData() const { - return ptr_->IsMKLDNN(); + bool IsDNNLData() const { + return ptr_->IsDNNL(); } /* * Test if the data is stored in one of default MXNet formats. @@ -751,37 +751,37 @@ class NDArray { return ptr_->IsDefault(); } /* - * All functions below return a raw pointer to mkldnn memory. Actually there - * is a shared pointer that hold the memory either in NDArray or in MKLDNN + * All functions below return a raw pointer to dnnl memory. Actually there + * is a shared pointer that hold the memory either in NDArray or in DNNL * stream. As long as we call these functions inside an operator, the return * memory is always valid. */ /* - * This function returns mkldnn::memory with the default primitive_desc. + * This function returns dnnl::memory with the default primitive_desc. */ - const mkldnn::memory* GetMKLDNNData() const; + const dnnl::memory* GetDNNLData() const; /* - * This function returns mkldnn::memory with the given primitive_desc + * This function returns dnnl::memory with the given primitive_desc * as long as the array size meets the required size in the given primitive_desc. */ - const mkldnn::memory* GetMKLDNNData(const mkldnn::memory::desc& md) const; + const dnnl::memory* GetDNNLData(const dnnl::memory::desc& md) const; /* - * This function returns mkldnn::memory with the given primitive_desc. - * The returned mkldnn::memory will have the same physical layout as + * This function returns dnnl::memory with the given primitive_desc. 
+ * The returned dnnl::memory will have the same physical layout as * the given primitive_desc. */ - const mkldnn::memory* GetMKLDNNDataReorder(const mkldnn::memory::desc& md) const; + const dnnl::memory* GetDNNLDataReorder(const dnnl::memory::desc& md) const; /* - * This function copies data from mkldnn memory. + * This function copies data from dnnl memory. */ - void CopyFrom(const mkldnn::memory& mem); + void CopyFrom(const dnnl::memory& mem); /* - * This function allocates memory for array and creates mkldnn memory + * This function allocates memory for array and creates dnnl memory * with the specified format. */ - mkldnn::memory* CreateMKLDNNData(const mkldnn::memory::desc& md); + dnnl::memory* CreateDNNLData(const dnnl::memory::desc& md); /* * These are the async version of the methods above. @@ -789,7 +789,7 @@ class NDArray { * the array are complete. */ void Reorder2DefaultAsync() const; - void MKLDNNDataReorderAsync(const mkldnn::memory::desc& md) const; + void DNNLDataReorderAsync(const dnnl::memory::desc& md) const; /* * This creates a new NDArray with the reordered data. @@ -803,7 +803,7 @@ class NDArray { */ NDArray Reorder2DefaultFloatFormat() const; - void InvalidateMKLDNNData(); + void InvalidateDNNLData(); /* * This function is used inside operators to reshape an array. @@ -815,12 +815,12 @@ class NDArray { * which can be expensive. * It's used by FullyConnected right now. */ - NDArray MKLDNNDataReshape(const mxnet::TShape& shape) const; + NDArray DNNLDataReshape(const mxnet::TShape& shape) const; /*! - * \ Fix mkldnn memory descriptor mismatch from NDArray. + * \ Fix dnnl memory descriptor mismatch from NDArray. */ - void UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc); + void UpdateDNNLMemDesc(const dnnl::memory::desc& desc); #endif /*! @@ -857,9 +857,9 @@ class NDArray { std::vector aux_handles; #if MXNET_USE_ONEDNN == 1 - /*! This is created when data is stored in MKLDNN format. + /*! This is created when data is stored in DNNL format. */ - std::shared_ptr mkl_mem_; + std::shared_ptr dnnl_mem_; #endif /*! \brief variable from engine */ Engine::VarHandle var; @@ -1035,7 +1035,7 @@ class NDArray { if (delay_alloc) { Storage::Get()->Alloc(&shandle); #if MXNET_USE_ONEDNN == 1 - mkl_mem_ = nullptr; + dnnl_mem_ = nullptr; #endif delay_alloc = false; } @@ -1051,7 +1051,7 @@ class NDArray { shandle.size = dbytes; Storage::Get()->Alloc(&shandle); #if MXNET_USE_ONEDNN == 1 - mkl_mem_ = nullptr; + dnnl_mem_ = nullptr; #endif delay_alloc = false; } else if (shandle.size < dbytes) { @@ -1061,7 +1061,7 @@ class NDArray { shandle.size = dbytes; Storage::Get()->Alloc(&shandle); #if MXNET_USE_ONEDNN == 1 - mkl_mem_ = nullptr; + dnnl_mem_ = nullptr; #endif } } @@ -1099,14 +1099,14 @@ class NDArray { #if MXNET_USE_ONEDNN == 1 // Have MKL memory reference to the data in the default storage - // or create memory for MKLDNN. + // or create memory for DNNL. void SetMKLMem(const mxnet::TShape& shape, int dtype); - // If the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and + // If the data is stored in DNNL layout, we reorder data in dnnl_mem_ and // save the result in shandle. void Reorder2Default(); // Reroder data to a specified layout. 
- void MKLDNNDataReorder(const mkldnn::memory::desc& md); - bool IsMKLDNN() const; + void DNNLDataReorder(const dnnl::memory::desc& md); + bool IsDNNL() const; bool IsDefault() const; #endif diff --git a/include/onednn/mkldnn.h b/include/onednn/mkldnn.h deleted file mode 120000 index ef19407410d8..000000000000 --- a/include/onednn/mkldnn.h +++ /dev/null @@ -1 +0,0 @@ -../../3rdparty/onednn/include/mkldnn.h \ No newline at end of file diff --git a/include/onednn/mkldnn.hpp b/include/onednn/mkldnn.hpp deleted file mode 120000 index e7f56e95d055..000000000000 --- a/include/onednn/mkldnn.hpp +++ /dev/null @@ -1 +0,0 @@ -../../3rdparty/onednn/include/mkldnn.hpp \ No newline at end of file diff --git a/include/onednn/mkldnn_config.h b/include/onednn/mkldnn_config.h deleted file mode 120000 index 714a58682a91..000000000000 --- a/include/onednn/mkldnn_config.h +++ /dev/null @@ -1 +0,0 @@ -../../3rdparty/onednn/include/mkldnn_config.h \ No newline at end of file diff --git a/include/onednn/mkldnn_debug.h b/include/onednn/mkldnn_debug.h deleted file mode 120000 index ca0e6b99801e..000000000000 --- a/include/onednn/mkldnn_debug.h +++ /dev/null @@ -1 +0,0 @@ -../../3rdparty/onednn/include/mkldnn_debug.h \ No newline at end of file diff --git a/include/onednn/mkldnn_dnnl_mangling.h b/include/onednn/mkldnn_dnnl_mangling.h deleted file mode 120000 index 67bf8d0893a7..000000000000 --- a/include/onednn/mkldnn_dnnl_mangling.h +++ /dev/null @@ -1 +0,0 @@ -../../3rdparty/onednn/include/mkldnn_dnnl_mangling.h \ No newline at end of file diff --git a/include/onednn/mkldnn_types.h b/include/onednn/mkldnn_types.h deleted file mode 120000 index 334078bfafa1..000000000000 --- a/include/onednn/mkldnn_types.h +++ /dev/null @@ -1 +0,0 @@ -../../3rdparty/onednn/include/mkldnn_types.h \ No newline at end of file diff --git a/include/onednn/mkldnn_version.h b/include/onednn/mkldnn_version.h deleted file mode 120000 index ed357587f6df..000000000000 --- a/include/onednn/mkldnn_version.h +++ /dev/null @@ -1 +0,0 @@ -../../3rdparty/onednn/include/mkldnn_version.h \ No newline at end of file diff --git a/python/mxnet/amp/lists/symbol_bf16.py b/python/mxnet/amp/lists/symbol_bf16.py index 29904291dc8b..86f5b0aabe72 100644 --- a/python/mxnet/amp/lists/symbol_bf16.py +++ b/python/mxnet/amp/lists/symbol_bf16.py @@ -360,8 +360,8 @@ 'uniform', 'unravel_index', 'zeros_like', - '_sg_mkldnn_conv', - '_sg_mkldnn_fully_connected', + '_sg_onednn_conv', + '_sg_onednn_fully_connected', 'broadcast_mul', 'Convolution_v1', 'IdentityAttachKLSparseReg', diff --git a/python/mxnet/amp/lists/symbol_fp16.py b/python/mxnet/amp/lists/symbol_fp16.py index 009586ed28f8..b561b335d9a7 100644 --- a/python/mxnet/amp/lists/symbol_fp16.py +++ b/python/mxnet/amp/lists/symbol_fp16.py @@ -611,10 +611,10 @@ if Features().is_enabled('ONEDNN'): FP32_FUNCS.extend([ - '_sg_mkldnn_conv', - '_sg_mkldnn_fully_connected', - '_sg_mkldnn_selfatt_qk', - '_sg_mkldnn_selfatt_valatt', + '_sg_onednn_conv', + '_sg_onednn_fully_connected', + '_sg_onednn_selfatt_qk', + '_sg_onednn_selfatt_valatt', ]) # Functions that have to be cast to FP32 only for diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py index b7ff517e5158..4444c4b0fc97 100644 --- a/python/mxnet/contrib/quantization.py +++ b/python/mxnet/contrib/quantization.py @@ -529,14 +529,13 @@ def quantize_model(sym, arg_params, aux_params, data_names=('data',), return qsym, qarg_params, aux_params - -def quantize_model_mkldnn(sym, arg_params, aux_params, data_names=('data',), +def 
quantize_model_onednn(sym, arg_params, aux_params, data_names=('data',), ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', calib_data=None, num_calib_batches=None, quantized_dtype='int8', quantize_mode='smart', quantize_granularity='tensor-wise', logger=None): """User-level API for generating a fusion + quantized model from a FP32 model - w/ or w/o calibration with Intel MKL-DNN. + w/ or w/o calibration with oneDNN. The backend quantized operators are only enabled for Linux systems. Please do not run inference using the quantized models on Windows for now. @@ -555,9 +554,9 @@ def quantize_model_mkldnn(sym, arg_params, aux_params, data_names=('data',), raise ValueError('currently only supports single ctx, while received %s' % str(ctx)) if ctx.device_type != 'cpu': raise ValueError( - 'quantize_model_mkldnn only support Intel cpu platform with MKL-DNN Backend') + 'quantize_model_onednn only support Intel cpu platform with oneDNN Backend') - sym = sym.optimize_for(backend='MKLDNN_QUANTIZE') + sym = sym.optimize_for(backend='ONEDNN_QUANTIZE') qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, data_names=data_names, ctx=ctx, @@ -568,7 +567,7 @@ def quantize_model_mkldnn(sym, arg_params, aux_params, data_names=('data',), quantized_dtype=quantized_dtype, quantize_mode=quantize_mode, quantize_granularity=quantize_granularity, logger=logger) - qsym = qsym.optimize_for(backend='MKLDNN_QUANTIZE') + qsym = qsym.optimize_for(backend='ONEDNN_QUANTIZE') return qsym, qarg_params, aux_params @@ -824,7 +823,7 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full', quantize if ctx != mx.cpu(): raise ValueError('Quantization currently supports only CPU context') - backend = 'MKLDNN_QUANTIZE' + backend = 'ONEDNN_QUANTIZE' network.hybridize(static_alloc=False, static_shape=False) data_types = None diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index f73cc18ce5bc..edd2e5514878 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -163,7 +163,7 @@ void CustomFComputeDispatcher(const std::string op_name, std::vector in_verIDs, out_verIDs; std::vector in_dev_type, out_dev_type; std::vector in_dev_id, out_dev_id; - std::vector conv_mkl; // converted NDArrays from MKLDNN format + std::vector conv_mkl; // converted NDArrays from DNNL format // Extra data for sparse inputs and outputs. 
std::vector in_stypes(inputs.size(), 0), out_stypes(outputs.size(), 0); @@ -176,9 +176,9 @@ void CustomFComputeDispatcher(const std::string op_name, for (size_t i = 0; i < inputs.size(); i++) { NDArray const* in_nd = &(inputs[i]); #if MXNET_USE_ONEDNN == 1 - // reorder data if in MKLDNN format - if (in_nd->IsMKLDNNData()) { - // convert from MKLDNN + // reorder data if in DNNL format + if (in_nd->IsDNNLData()) { + // convert from DNNL conv_mkl.push_back(in_nd->Reorder2Default()); in_nd = &(conv_mkl.back()); } @@ -1642,8 +1642,8 @@ void registerPasses(void* lib, const NDArray& in_arg = *(in_args_ptr[i]); #if MXNET_USE_ONEDNN == 1 - // reorder data if in MKLDNN format - if (in_arg.IsMKLDNNData()) { + // reorder data if in DNNL format + if (in_arg.IsDNNLData()) { in_arg.Reorder2DefaultAsync(); in_arg.WaitToRead(); } @@ -1668,8 +1668,8 @@ void registerPasses(void* lib, const auto& in_aux = *(in_aux_ptr[i]); #if MXNET_USE_ONEDNN == 1 - // reorder data if in MKLDNN format - if (in_aux.IsMKLDNNData()) { + // reorder data if in DNNL format + if (in_aux.IsDNNLData()) { in_aux.Reorder2DefaultAsync(); in_aux.WaitToRead(); } @@ -2557,7 +2557,7 @@ int MXNDArrayGetData(NDArrayHandle handle, void** out_pdata) { API_BEGIN(); NDArray* arr = static_cast(handle); #if MXNET_USE_ONEDNN == 1 - if (arr->IsMKLDNNData()) { + if (arr->IsDNNLData()) { arr->Reorder2DefaultAsync(); arr->WaitToRead(); } diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index ec2aa7cb6975..21a97130c183 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -94,7 +94,7 @@ inline bool SetupDefaultBlobsOut(const std::vector& src, const auto& nd = src[i]; #if MXNET_USE_ONEDNN == 1 - if (req->at(i) == kWriteInplace && nd.IsMKLDNNData()) + if (req->at(i) == kWriteInplace && nd.IsDNNLData()) // If it's write inplace and the output array doesn't use the default // layout, we'll generate a temporary output array below, which means // the input array and the output array are no longer the same array. @@ -108,7 +108,7 @@ inline bool SetupDefaultBlobsOut(const std::vector& src, if (bufs != nullptr) { temp = bufs->at(i); } else if (kAddTo == req->at(i)) { - temp = nd.IsMKLDNNData() ? nd.Reorder2Default() : nd; + temp = nd.IsDNNLData() ? nd.Reorder2Default() : nd; } else { temp = NDArray(nd.shape(), nd.ctx(), true, nd.dtype()); } diff --git a/src/common/utils.h b/src/common/utils.h index 710cc61413ad..c62fafab10a3 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -49,7 +49,7 @@ #include "../operator/mxnet_op.h" #if MXNET_USE_ONEDNN == 1 -#include "../operator/nn/mkldnn/mkldnn_base-inl.h" +#include "../operator/nn/dnnl/dnnl_base-inl.h" #endif #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) @@ -513,11 +513,11 @@ inline void LogStorageFallback(const nnvm::NodeAttrs& attrs, os << "\nStorage type fallback detected:\n" << op_str << warning; LogOnce(os.str()); #if MXNET_USE_ONEDNN == 1 - if (!MKLDNNEnvSet()) + if (!DNNLEnvSet()) common::LogOnce( "MXNET_ONEDNN_ENABLED flag is off. " "You can re-enable by setting MXNET_ONEDNN_ENABLED=1"); - if (GetMKLDNNCacheSize() != -1) + if (GetDNNLCacheSize() != -1) common::LogOnce( "MXNET_ONEDNN_CACHE_NUM is set." 
"Should only be set if " diff --git a/src/imperative/attach_op_execs_pass.cc b/src/imperative/attach_op_execs_pass.cc index a9402b75fdb5..4a8c51d107c7 100644 --- a/src/imperative/attach_op_execs_pass.cc +++ b/src/imperative/attach_op_execs_pass.cc @@ -36,10 +36,10 @@ namespace mxnet { namespace exec { #if MXNET_USE_ONEDNN == 1 -#define CREATE_DEFAULT_INPUTS_MKLDNN(in_array, in_array_fallback, attrs) \ +#define CREATE_DEFAULT_INPUTS_DNNL(in_array, in_array_fallback, attrs) \ CREATE_DEFAULT_INPUTS(true, attrs, CreateDefaultInputs(in_array, in_array_fallback)) #else -#define CREATE_DEFAULT_INPUTS_MKLDNN(in_array, in_array_fallback, attrs) // empty macro +#define CREATE_DEFAULT_INPUTS_DNNL(in_array, in_array_fallback, attrs) // empty macro #endif // abstract OpExecutor which provides storage fallback procedure on @@ -168,7 +168,7 @@ class StatefulComputeExExecutor : public OpExecutor { op_ctx.run_ctx = rctx; INVALIDATE_OUTPUTS(out_array, req); std::vector* pInArray = &in_array; - CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs_); + CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs_); fcompute_(state_, op_ctx, *pInArray, req, out_array); } @@ -240,7 +240,7 @@ class FComputeExExecutor : public OpExecutor { op_ctx.run_ctx = rctx; INVALIDATE_OUTPUTS(out_array, req); std::vector* pInArray = &in_array; - CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs_); + CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs_); fcompute_(attrs_, op_ctx, *pInArray, req, out_array); } diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 5944b0a2ff22..7d506fa4f97b 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -16,20 +16,23 @@ * specific language governing permissions and limitations * under the License. */ +// The first two includes below need to be in unalphabetical for the miscellaneous CI to pass. #include #include #include -#include + #include -#include #include #include -#include "./exec_pass.h" +#include +#include + #include "../c_api/c_api_common.h" -#include "../common/utils.h" #include "../common/exec_utils.h" -#include "../operator/nn/mkldnn/mkldnn_base-inl.h" +#include "../common/utils.h" +#include "../operator/nn/dnnl/dnnl_base-inl.h" #include "../operator/operator_common.h" +#include "./exec_pass.h" #ifndef MXNET_IMPERATIVE_IMPERATIVE_UTILS_H_ #define MXNET_IMPERATIVE_IMPERATIVE_UTILS_H_ @@ -51,7 +54,7 @@ void InvalidateOutputs(const std::vector* pArrs, const std::vector auto arrs = *pArrs; for (size_t i = 0; i < arrs.size(); i++) { if (reqs[i] == kWriteTo || reqs[i] == kNullOp) - pntr(arrs[i])->InvalidateMKLDNNData(); + pntr(arrs[i])->InvalidateDNNLData(); } } @@ -60,7 +63,7 @@ static inline void CreateDefaultInputs(const std::vector& arrs, std::vector* out_arrs) { out_arrs->clear(); for (size_t i = 0; i < arrs.size(); ++i) { - if (arrs[i].IsMKLDNNData()) + if (arrs[i].IsDNNLData()) out_arrs->push_back(arrs[i].Reorder2Default()); else out_arrs->push_back(arrs[i]); @@ -77,7 +80,7 @@ static inline void CreateDefaultInputs(std::vector* pArrs) { #define INVALIDATE_OUTPUTS(outputs, req) InvalidateOutputs(&outputs, req) // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in // its FCcomputeEx, but AsyncPush the copy operation to engine. 
-// So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B +// So for the case that A is holding dnnl memory, and then copy A to B, and then copy B // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, // copying A to B may not happen, and will corrupt A's memory. #define INVALIDATE_OUTPUTS_COND(cond, outputs, req) \ @@ -85,12 +88,12 @@ static inline void CreateDefaultInputs(std::vector* pArrs) { INVALIDATE_OUTPUTS(outputs, req); \ } -// add for mkldnn OP + no mkldnn OP -#define CREATE_DEFAULT_INPUTS(cond, attrs, func_call) \ - if (cond) { \ - const auto is_mkldnn = Op::GetAttr("TIsMKLDNN"); \ - if (!is_mkldnn.get(attrs.op, false)) \ - func_call; \ +// add for dnnl OP + no dnnl OP +#define CREATE_DEFAULT_INPUTS(cond, attrs, func_call) \ + if (cond) { \ + const auto is_dnnl = Op::GetAttr("TIsDNNL"); \ + if (!is_dnnl.get(attrs.op, false)) \ + func_call; \ } #else @@ -573,7 +576,7 @@ inline bool SetupDefaultBlobsOut(const std::vector& src, const auto& nd = *src[i]; #if MXNET_USE_ONEDNN == 1 - if (req->at(i) == kWriteInplace && nd.IsMKLDNNData()) + if (req->at(i) == kWriteInplace && nd.IsDNNLData()) // If it's write inplace and the output array doesn't use the default // layout, we'll generate a temporary output array below, which means // the input array and the output array are no longer the same array. @@ -586,7 +589,7 @@ inline bool SetupDefaultBlobsOut(const std::vector& src, if (bufs != nullptr) { temp = bufs->at(i); } else if (kAddTo == req->at(i)) { - temp = nd.IsMKLDNNData() ? nd.Reorder2Default() : nd; + temp = nd.IsDNNLData() ? nd.Reorder2Default() : nd; } else { temp = NDArray(nd.shape(), nd.ctx(), true, nd.dtype()); } diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 3e64a8db88ef..d927ff8fc938 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -25,20 +25,18 @@ #include #include #include +#include #include #include #include #include -#include - -#include "./ndarray_function.h" - #include "../common/utils.h" -#include "../operator/nn/mkldnn/mkldnn_base-inl.h" +#include "../operator/nn/dnnl/dnnl_base-inl.h" #include "../operator/tensor/init_op.h" #include "../operator/tensor/matrix_op-inl.h" #include "../profiler/storage_profiler.h" +#include "./ndarray_function.h" #if MXNET_USE_OPENCV #include @@ -119,7 +117,7 @@ struct ChunkMem { Storage::Handle h; std::vector aux_h; #if MXNET_USE_ONEDNN == 1 - std::shared_ptr mem; + std::shared_ptr mem; #endif }; @@ -129,8 +127,8 @@ NDArray::Chunk::~Chunk() { mem.h = this->shandle; mem.aux_h = this->aux_handles; #if MXNET_USE_ONEDNN == 1 - // We want to delete mkldnn memory after deleting the variable. - mem.mem = this->mkl_mem_; + // We want to delete dnnl memory after deleting the variable. 
+ mem.mem = this->dnnl_mem_; #endif if (auto engine = engine_ref_.lock()) { engine->DeleteVariable( @@ -168,7 +166,7 @@ void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape& shape, int dtype) { shandle.size = dbytes; Storage::Get()->Alloc(&shandle); #if MXNET_USE_ONEDNN == 1 - mkl_mem_ = nullptr; + dnnl_mem_ = nullptr; #endif } // init shape @@ -198,34 +196,34 @@ nnvm::Symbol NDArray::get_autograd_symbol() const { #if MXNET_USE_ONEDNN == 1 -NDArray::NDArray(const mkldnn::memory::desc& md) +NDArray::NDArray(const dnnl::memory::desc& md) : storage_type_(kDefaultStorage), autograd_entry_(nullptr) { shape_ = mxnet::TShape(md.data.dims, md.data.dims + md.data.ndims); dtype_ = get_mxnet_type(md.data.data_type); ptr_ = std::make_shared(shape_, Context::CPU(), true, dtype_); ptr_->CheckAndAlloc(md.get_size()); - ptr_->mkl_mem_ = std::make_shared(md, ptr_->shandle.dptr); + ptr_->dnnl_mem_ = std::make_shared(md, ptr_->shandle.dptr); } -NDArray::NDArray(const std::shared_ptr& mkldnn_mem) +NDArray::NDArray(const std::shared_ptr& dnnl_mem) : storage_type_(kDefaultStorage), autograd_entry_(nullptr) { - auto mem_desc = mkldnn_mem->get_desc(); + auto mem_desc = dnnl_mem->get_desc(); shape_ = mxnet::TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims); dtype_ = get_mxnet_type(mem_desc.data.data_type); ptr_ = std::make_shared(shape_, Context::CPU(), true, dtype_); - ptr_->shandle.dptr = mkldnn_mem->get_data_handle(); + ptr_->shandle.dptr = dnnl_mem->get_data_handle(); ptr_->shandle.size = mem_desc.get_size(); ptr_->delay_alloc = false; - ptr_->mkl_mem_ = std::make_shared(mkldnn_mem); + ptr_->dnnl_mem_ = std::make_shared(dnnl_mem); ptr_->static_data = true; } -NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape& shape) const { +NDArray NDArray::DNNLDataReshape(const mxnet::TShape& shape) const { CHECK(!is_none()) << "NDArray is not initialized"; CHECK_GE(shape_.Size(), shape.Size()) << "NDArray.Reshape: target shape size is larger current shape"; CHECK_EQ(storage_type(), kDefaultStorage); - if (!IsMKLDNNData()) { + if (!IsDNNLData()) { NDArray ret = this->Detach(); ret.shape_ = shape; return ret; @@ -233,22 +231,22 @@ NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape& shape) const { NDArray ret(shape, ctx(), true, dtype()); // We shouldn't submit the reorder primitive here because submit will // be called in operators. - mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat(); - CHECK(ptr_->IsMKLDNN()); - mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format); - mkldnn::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc); - MKLDNNStream* stream = MKLDNNStream::Get(); - std::shared_ptr curr_mem = ptr_->mkl_mem_->GetMem(); + dnnl_format_tag_t format = ptr_->dnnl_mem_->GetDefaultFormat(); + CHECK(ptr_->IsDNNL()); + dnnl::memory::desc def_desc = ptr_->dnnl_mem_->GetDesc(format); + dnnl::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc); + DNNLStream* stream = DNNLStream::Get(); + std::shared_ptr curr_mem = ptr_->dnnl_mem_->GetMem(); stream->RegisterMem(curr_mem); - std::unordered_map args( - {{MKLDNN_ARG_FROM, *curr_mem}, {MKLDNN_ARG_TO, *def_mem}}); - stream->RegisterPrimArgs(mkldnn::reorder(*curr_mem, *def_mem), args); + std::unordered_map args( + {{DNNL_ARG_FROM, *curr_mem}, {DNNL_ARG_TO, *def_mem}}); + stream->RegisterPrimArgs(dnnl::reorder(*curr_mem, *def_mem), args); // def_mem points to a memory region in the temp space. It's only valid // inside an operator. 
As such, the returned NDArray can only be valid // inside an operator and the shared point doesn't need to do anything // when it's destroyed. - auto tmp = std::shared_ptr(def_mem, [](mkldnn::memory* mem) {}); - ret.ptr_->mkl_mem_.reset(new MKLDNNMemory(tmp)); + auto tmp = std::shared_ptr(def_mem, [](dnnl::memory* mem) {}); + ret.ptr_->dnnl_mem_.reset(new DNNLMemory(tmp)); ret.ptr_->shandle.dptr = def_mem->get_data_handle(); ret.ptr_->shandle.size = def_mem->get_desc().get_size(); ret.ptr_->delay_alloc = false; @@ -500,185 +498,185 @@ void NDArray::set_fresh_out_grad(bool state) const { #if MXNET_USE_ONEDNN == 1 -bool NDArray::Chunk::IsMKLDNN() const { +bool NDArray::Chunk::IsDNNL() const { if (storage_type != kDefaultStorage) return false; - if (mkl_mem_ == nullptr) + if (dnnl_mem_ == nullptr) return false; - return mkl_mem_->IsMKLDNN(); + return dnnl_mem_->IsDNNL(); } bool NDArray::Chunk::IsDefault() const { if (storage_type != kDefaultStorage) return false; - // If we don't have mkldnn memory yet, we just assume it's not the default + // If we don't have dnnl memory yet, we just assume it's not the default // format. - if (mkl_mem_ == nullptr) + if (dnnl_mem_ == nullptr) return true; - return !mkl_mem_->IsMKLDNN(); + return !dnnl_mem_->IsDNNL(); } void NDArray::Chunk::Reorder2Default() { - if (mkl_mem_ == nullptr) + if (dnnl_mem_ == nullptr) return; if (IsDefault()) return; - mkldnn_format_tag_t format = mkl_mem_->GetDefaultFormat(); - mkldnn::memory::desc def_desc = mkl_mem_->GetDesc(format); - mkldnn_mem_ptr def_mem(new mkldnn::memory(def_desc, CpuEngine::Get()->get_engine())); - mkl_mem_->ReorderTo(def_mem.get()); + dnnl_format_tag_t format = dnnl_mem_->GetDefaultFormat(); + dnnl::memory::desc def_desc = dnnl_mem_->GetDesc(format); + dnnl_mem_ptr def_mem(new dnnl::memory(def_desc, CpuEngine::Get()->get_engine())); + dnnl_mem_->ReorderTo(def_mem.get()); CHECK(shandle.size >= def_desc.get_size()); CheckAndAlloc(def_desc.get_size()); // TODO(zhengda) We need to avoid memory copy here. memcpy(shandle.dptr, def_mem->get_data_handle(), def_desc.get_size()); - mkl_mem_ = nullptr; + dnnl_mem_ = nullptr; } -void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc& md) { +void NDArray::Chunk::DNNLDataReorder(const dnnl::memory::desc& md) { // If the memory already uses the specified layout, don't do anything. - if (mkl_mem_ != nullptr && mkl_mem_->SameFormat(md)) + if (dnnl_mem_ != nullptr && dnnl_mem_->SameFormat(md)) return; // If the memory is default, don't do anything. - if (!mxnet::IsMKLDNN(md) && IsDefault()) + if (!mxnet::IsDNNL(md) && IsDefault()) return; - if (!mxnet::IsMKLDNN(md)) { + if (!mxnet::IsDNNL(md)) { // If the specified layout is default, we should use Reorder2Default. 
Reorder2Default(); return; } auto engine = CpuEngine::Get()->get_engine(); - mkldnn::stream s(engine); + dnnl::stream s(engine); - std::shared_ptr new_mem(new mkldnn::memory(md, engine)); - std::shared_ptr old_mem; + std::shared_ptr new_mem(new dnnl::memory(md, engine)); + std::shared_ptr old_mem; if (IsDefault()) { - mkldnn_format_tag_t def_format = GetDefaultFormat(md); - mkldnn::memory::desc def_desc = GetDesc(md, def_format); - old_mem.reset(new mkldnn::memory(def_desc, engine, shandle.dptr)); + dnnl_format_tag_t def_format = GetDefaultFormat(md); + dnnl::memory::desc def_desc = GetDesc(md, def_format); + old_mem.reset(new dnnl::memory(def_desc, engine, shandle.dptr)); } else { - old_mem = this->mkl_mem_->GetMem(); + old_mem = this->dnnl_mem_->GetMem(); } CHECK(old_mem->get_desc().data.ndims == md.data.ndims); - // This may be called in MKLDNN operators. We can't use MKLDNNStream here. - mkldnn::reorder(*old_mem, *new_mem).execute(s, *old_mem, *new_mem); + // This may be called in DNNL operators. We can't use DNNLStream here. + dnnl::reorder(*old_mem, *new_mem).execute(s, *old_mem, *new_mem); CHECK(shandle.size >= md.get_size()); CheckAndAlloc(md.get_size()); // TODO(zhengda) We need to avoid memory copy here. memcpy(shandle.dptr, new_mem->get_data_handle(), md.get_size()); - mkl_mem_.reset(new MKLDNNMemory(md, shandle.dptr)); + dnnl_mem_.reset(new DNNLMemory(md, shandle.dptr)); } void NDArray::Chunk::SetMKLMem(const mxnet::TShape& shape, int dtype) { // The shape of the array and the one of the MKL memory may mismatch. // For example, if the array stores parameters, the MKL memory may store data // in 5 dimensions while the NDArray stores data in 4 dimensions. - if (mkl_mem_ && mkl_mem_->GetDataHandle() == shandle.dptr && mkl_mem_->SameFormat(shape, dtype)) { + if (dnnl_mem_ && dnnl_mem_->GetDataHandle() == shandle.dptr && + dnnl_mem_->SameFormat(shape, dtype)) { return; } - mkldnn::memory::dims dims; - // These are shapes supprted by MKLDNN. + dnnl::memory::dims dims; + // These are shapes supprted by DNNL. 
if (shape.ndim() >= 1 && shape.ndim() <= 6) { dims.resize(shape.ndim()); for (size_t i = 0; i < dims.size(); i++) dims[i] = shape[i]; } else { - LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions"; + LOG(FATAL) << "DNNL doesn't support " << shape.ndim() << " dimensions"; } - mkldnn::memory::format_tag layout = mkldnn::memory::format_tag::undef; + dnnl::memory::format_tag layout = dnnl::memory::format_tag::undef; switch (dims.size()) { case 1: - layout = mkldnn::memory::format_tag::a; + layout = dnnl::memory::format_tag::a; break; case 2: - layout = mkldnn::memory::format_tag::ab; + layout = dnnl::memory::format_tag::ab; break; case 3: - layout = mkldnn::memory::format_tag::abc; + layout = dnnl::memory::format_tag::abc; break; case 4: - layout = mkldnn::memory::format_tag::abcd; + layout = dnnl::memory::format_tag::abcd; break; case 5: - layout = mkldnn::memory::format_tag::abcde; + layout = dnnl::memory::format_tag::abcde; break; case 6: - layout = mkldnn::memory::format_tag::abcdef; + layout = dnnl::memory::format_tag::abcdef; break; default: - LOG(FATAL) << "Not implemented dimension (" << dims.size() << ") for MKLDNN"; + LOG(FATAL) << "Not implemented dimension (" << dims.size() << ") for DNNL"; } - mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout}; + dnnl::memory::desc data_md{dims, get_dnnl_type(dtype), layout}; if (shandle.dptr == nullptr) { CHECK(delay_alloc); CheckAndAlloc(); } CHECK(shandle.size >= data_md.get_size()); - mkl_mem_.reset(new MKLDNNMemory(data_md, shandle.dptr)); + dnnl_mem_.reset(new DNNLMemory(data_md, shandle.dptr)); } -const mkldnn::memory* NDArray::GetMKLDNNData(const mkldnn::memory::desc& desc) const { +const dnnl::memory* NDArray::GetDNNLData(const dnnl::memory::desc& desc) const { if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { - LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; + LOG(FATAL) << "The size of NDArray doesn't match the requested DNNL memory desc"; return nullptr; } - const mkldnn::memory* mem = GetMKLDNNData(); - mkldnn::memory::desc desc1 = mem->get_desc(); + const dnnl::memory* mem = GetDNNLData(); + dnnl::memory::desc desc1 = mem->get_desc(); // The MKL memory has the same format and shape as required, // or both use the default format, we can return the MKL memory. - if (desc1 == desc || ((!mxnet::IsMKLDNN(desc1)) && (!mxnet::IsMKLDNN(desc)))) { - return GetMKLDNNExact(mem, desc); + if (desc1 == desc || ((!mxnet::IsDNNL(desc1)) && (!mxnet::IsDNNL(desc)))) { + return GetDNNLExact(mem, desc); } else { return nullptr; } } -const mkldnn::memory* NDArray::GetMKLDNNDataReorder(const mkldnn::memory::desc& new_desc) const { +const dnnl::memory* NDArray::GetDNNLDataReorder(const dnnl::memory::desc& new_desc) const { CHECK(storage_type() == kDefaultStorage); - const mkldnn::memory* mem = GetMKLDNNData(); + const dnnl::memory* mem = GetDNNLData(); // If the memory descriptor matches, it's easy. - MKLDNNStream* stream = MKLDNNStream::Get(); + DNNLStream* stream = DNNLStream::Get(); if (mem->get_desc() == new_desc) { - return GetMKLDNNExact(mem, new_desc); + return GetDNNLExact(mem, new_desc); } - mkldnn::memory::desc old_desc = mem->get_desc(); + dnnl::memory::desc old_desc = mem->get_desc(); // Now we need to determine if we should reorder the memory. // If both use the default formats, we think we don't need to reorder. 
- if ((!mxnet::IsMKLDNN(old_desc)) && (!mxnet::IsMKLDNN(new_desc))) { - mkldnn_mem_ptr ret( - new mkldnn::memory(new_desc, CpuEngine::Get()->get_engine(), mem->get_data_handle())); + if ((!mxnet::IsDNNL(old_desc)) && (!mxnet::IsDNNL(new_desc))) { + dnnl_mem_ptr ret( + new dnnl::memory(new_desc, CpuEngine::Get()->get_engine(), mem->get_data_handle())); stream->RegisterMem(ret); return ret.get(); } else if (same_shape(old_desc, new_desc)) { // If they have the same shape, we can reorder data directly. - mkldnn::memory* ret = TmpMemMgr::Get()->Alloc(new_desc); - std::unordered_map args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}}); - stream->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args); + dnnl::memory* ret = TmpMemMgr::Get()->Alloc(new_desc); + std::unordered_map args({{DNNL_ARG_FROM, *mem}, {DNNL_ARG_TO, *ret}}); + stream->RegisterPrimArgs(dnnl::reorder(*mem, *ret), args); return ret; } else { // If they have different shapes, we need to reshape the array first. // Since this method will only be used inside an operator, we can call - // MKLDNNDataReshape to reshape an array. + // DNNLDataReshape to reshape an array. mxnet::TShape required_shape(new_desc.data.ndims, -1); for (int i = 0; i < new_desc.data.ndims; i++) required_shape[i] = new_desc.data.dims[i]; - NDArray reshaped = MKLDNNDataReshape(required_shape); - const mkldnn::memory* ret = reshaped.GetMKLDNNData(); + NDArray reshaped = DNNLDataReshape(required_shape); + const dnnl::memory* ret = reshaped.GetDNNLData(); if (ret->get_desc() == new_desc) { - return GetMKLDNNExact(ret, new_desc); + return GetDNNLExact(ret, new_desc); } else { - mkldnn::memory* ret2 = TmpMemMgr::Get()->Alloc(new_desc); - std::unordered_map args( - {{MKLDNN_ARG_FROM, *ret}, {MKLDNN_ARG_TO, *ret2}}); - stream->RegisterPrimArgs(mkldnn::reorder(*ret, *ret2), args); + dnnl::memory* ret2 = TmpMemMgr::Get()->Alloc(new_desc); + std::unordered_map args({{DNNL_ARG_FROM, *ret}, {DNNL_ARG_TO, *ret2}}); + stream->RegisterPrimArgs(dnnl::reorder(*ret, *ret2), args); return ret2; } } @@ -687,22 +685,22 @@ const mkldnn::memory* NDArray::GetMKLDNNDataReorder(const mkldnn::memory::desc& NDArray NDArray::Reorder2Default() const { CHECK(storage_type() == kDefaultStorage); - if (ptr_->mkl_mem_ == nullptr) + if (ptr_->dnnl_mem_ == nullptr) return *this; - if (!ptr_->mkl_mem_->IsMKLDNN()) + if (!ptr_->dnnl_mem_->IsDNNL()) return *this; - // create new ndarray from mkldnn layout - mkldnn::memory::desc from_desc = ptr_->mkl_mem_->GetDesc(); + // create new ndarray from dnnl layout + dnnl::memory::desc from_desc = ptr_->dnnl_mem_->GetDesc(); mxnet::TShape tshape(from_desc.data.ndims, -1); for (int i = 0; i < from_desc.data.ndims; i++) tshape[i] = from_desc.data.dims[i]; NDArray ret(tshape, ctx(), false, dtype()); - mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat(); - mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format); + dnnl_format_tag_t format = ptr_->dnnl_mem_->GetDefaultFormat(); + dnnl::memory::desc def_desc = ptr_->dnnl_mem_->GetDesc(format); CHECK(ret.ptr_->shandle.size >= def_desc.get_size()); - mkldnn::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ret.ptr_->shandle.dptr); - ptr_->mkl_mem_->ReorderTo(&def_mem); + dnnl::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ret.ptr_->shandle.dptr); + ptr_->dnnl_mem_->ReorderTo(&def_mem); // reshape as needed ret.shape_ = shape_; ret.byte_offset_ = byte_offset_; @@ -711,17 +709,17 @@ NDArray NDArray::Reorder2Default() const { } void NDArray::SelfReorder2Default() { - if 
(!IsMKLDNNData()) + if (!IsDNNLData()) return; CHECK(storage_type() == kDefaultStorage); - const auto mkl_mem = ptr_->mkl_mem_; - if (mkl_mem == nullptr || !mkl_mem->IsMKLDNN()) + const auto dnnl_mem = ptr_->dnnl_mem_; + if (dnnl_mem == nullptr || !dnnl_mem->IsDNNL()) return; - // create new ndarray from mkldnn layout - mkldnn::memory::desc from_desc = mkl_mem->GetDesc(); + // create new ndarray from dnnl layout + dnnl::memory::desc from_desc = dnnl_mem->GetDesc(); mxnet::TShape tshape(from_desc.data.ndims, -1); for (int i = 0; i < from_desc.data.ndims; i++) tshape[i] = from_desc.data.dims[i]; @@ -730,11 +728,11 @@ void NDArray::SelfReorder2Default() { const auto saved_byte_offset = byte_offset_; this->ReInit(kDefaultStorage, tshape, ctx(), dtype(), false); - mkldnn_format_tag_t format = mkl_mem->GetDefaultFormat(); - mkldnn::memory::desc def_desc = mkl_mem->GetDesc(format); + dnnl_format_tag_t format = dnnl_mem->GetDefaultFormat(); + dnnl::memory::desc def_desc = dnnl_mem->GetDesc(format); CHECK(ptr_->shandle.size >= def_desc.get_size()); - mkldnn::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ptr_->shandle.dptr); - mkl_mem->ReorderTo(&def_mem); + dnnl::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ptr_->shandle.dptr); + dnnl_mem->ReorderTo(&def_mem); // reshape as needed shape_ = saved_shape; byte_offset_ = saved_byte_offset; @@ -765,14 +763,14 @@ NDArray NDArray::Reorder2DefaultFloatFormat() const { return Reorder2Default(); } NDArray ret(shape(), ctx(), false, mshadow::DataType::kFlag); - auto src_mem = GetMKLDNNData(); - auto dst_mem = ret.GetMKLDNNData(); + auto src_mem = GetDNNLData(); + auto dst_mem = ret.GetDNNLData(); ReorderTo(src_mem, dst_mem); return ret; } -void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc& desc) const { +void NDArray::DNNLDataReorderAsync(const dnnl::memory::desc& desc) const { std::vector const_vars; std::vector mutable_vars(1, this->var()); NDArray tmp = *this; @@ -782,7 +780,7 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc& desc) const { // MXNet will try to reuse NDArray from memory planning, so we need to ensure // the NDArray is still holding the original trunk data. if (tmp.version() == version) { - tmp.ptr_->MKLDNNDataReorder(desc); + tmp.ptr_->DNNLDataReorder(desc); } on_complete(); }, @@ -794,120 +792,119 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc& desc) const { "Reorder"); } -const mkldnn::memory* NDArray::GetMKLDNNData() const { +const dnnl::memory* NDArray::GetDNNLData() const { CHECK(storage_type() == kDefaultStorage); const auto is_view = IsView(); - if (IsMKLDNNData()) { - // If this array uses MKLDNN layout, we have to make sure it's not a view. + if (IsDNNLData()) { + // If this array uses DNNL layout, we have to make sure it's not a view. // Otherwise, we'll have to change the layout inside the array. CHECK(!is_view); - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem()); - // If this array uses MKLDNN format, we should return now. Otherwise, - // SetMKLMem may mess up mkl_mem_. - return ptr_->mkl_mem_->GetRaw(); + DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); + // If this array uses DNNL format, we should return now. Otherwise, + // SetMKLMem may mess up dnnl_mem_. 
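(The early return for DNNL-layout arrays follows just below.) The non-DNNL branches of GetDNNLData() rely on oneDNN's zero-copy handle constructor, wrapping the chunk's existing buffer (possibly at an offset for views) instead of allocating. A minimal sketch of that pattern, with an assumed caller-owned `std::vector` standing in for `shandle.dptr`:

```cpp
// Sketch of the zero-copy pattern GetDNNLData() relies on: a dnnl::memory can
// wrap a caller-owned buffer instead of allocating its own. oneDNN API only.
#include <dnnl.hpp>
#include <vector>

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  std::vector<float> buf(8 * 16, 0.f);  // caller-owned storage (stand-in for shandle.dptr)
  dnnl::memory::desc md({8, 16}, dnnl::memory::data_type::f32,
                        dnnl::memory::format_tag::ab);
  // Pass the existing pointer as the handle; no copy and no ownership transfer,
  // which is why the NDArray code can hand out views at byte_offset_.
  dnnl::memory mem(md, eng, buf.data());
  return mem.get_data_handle() == buf.data() ? 0 : 1;
}
```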
+ return ptr_->dnnl_mem_->GetRaw(); } CheckAndAlloc(); if (is_view) { - // If this is a view, we can't create a MKLDNN memory for the chunk + // If this is a view, we can't create a DNNL memory for the chunk // because we don't have the complete data type and shape information for // the chunk. void* off_addr = static_cast(ptr_->shandle.dptr) + byte_offset_; - // Create the primitive desc for the new mkldnn memory. - mkldnn::memory::dims dims(shape().ndim()); + // Create the primitive desc for the new dnnl memory. + dnnl::memory::dims dims(shape().ndim()); for (size_t i = 0; i < dims.size(); i++) dims[i] = shape()[i]; - const auto cpp_format = - static_cast(GetDefaultFormat(shape().ndim())); - mkldnn::memory::desc data_md(dims, get_mkldnn_type(dtype_), cpp_format); - std::shared_ptr ret( - new mkldnn::memory(data_md, CpuEngine::Get()->get_engine(), off_addr)); - MKLDNNStream::Get()->RegisterMem(ret); + const auto cpp_format = static_cast(GetDefaultFormat(shape().ndim())); + dnnl::memory::desc data_md(dims, get_dnnl_type(dtype_), cpp_format); + std::shared_ptr ret( + new dnnl::memory(data_md, CpuEngine::Get()->get_engine(), off_addr)); + DNNLStream::Get()->RegisterMem(ret); return ret.get(); } - // If this isn't a view, we can create a MKLDNN memory and store it in the chunk + // If this isn't a view, we can create a DNNL memory and store it in the chunk ptr_->SetMKLMem(shape_, dtype_); - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem()); - return ptr_->mkl_mem_->GetRaw(); + DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); + return ptr_->dnnl_mem_->GetRaw(); } -void NDArray::InvalidateMKLDNNData() { - // Removing mkl_mem_ means the NDArray will store data in the default format. - if (ptr_->mkl_mem_ && ptr_->mkl_mem_->IsMKLDNN()) - ptr_->mkl_mem_ = nullptr; +void NDArray::InvalidateDNNLData() { + // Removing dnnl_mem_ means the NDArray will store data in the default format. + if (ptr_->dnnl_mem_ && ptr_->dnnl_mem_->IsDNNL()) + ptr_->dnnl_mem_ = nullptr; } -void NDArray::CopyFrom(const mkldnn::memory& mem) { +void NDArray::CopyFrom(const dnnl::memory& mem) { CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized"; - if (ptr_->mkl_mem_ && ptr_->mkl_mem_->GetRaw() == &mem) + if (ptr_->dnnl_mem_ && ptr_->dnnl_mem_->GetRaw() == &mem) return; CHECK(mem.get_desc().get_size() == shape().Size() * GetTypeSize(dtype_)) - << "The size of NDArray doesn't match the requested MKLDNN memory desc"; - // If this array uses MKLDNN layout, we have to make sure it's not a view. + << "The size of NDArray doesn't match the requested DNNL memory desc"; + // If this array uses DNNL layout, we have to make sure it's not a view. // Otherwise, we'll have to change the layout inside the array. - if (IsMKLDNNData() && IsView()) + if (IsDNNLData() && IsView()) ptr_->Reorder2Default(); - const mkldnn::memory* this_mem = GetMKLDNNData(); - MKLDNNMemoryCopy(mem, this_mem); + const dnnl::memory* this_mem = GetDNNLData(); + DNNLMemoryCopy(mem, this_mem); } -mkldnn::memory* NDArray::CreateMKLDNNData(const mkldnn::memory::desc& desc) { +dnnl::memory* NDArray::CreateDNNLData(const dnnl::memory::desc& desc) { if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { - LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc. " - << "MKLDNN memory requests for " << desc.get_size() << " bytes, but got " + LOG(FATAL) << "The size of NDArray doesn't match the requested DNNL memory desc. 
" + << "DNNL memory requests for " << desc.get_size() << " bytes, but got " << shape().Size() * GetTypeSize(dtype_) << " bytes from NDArray"; return nullptr; } bool isDefaultFormat = IsDefaultFormat(desc); if (isDefaultFormat && !IsView()) { ptr_->SetMKLMem(shape_, dtype_); - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem()); - return GetMKLDNNExact(ptr_->mkl_mem_->GetRaw(), desc); + DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); + return GetDNNLExact(ptr_->dnnl_mem_->GetRaw(), desc); } else if (isDefaultFormat) { ptr_->CheckAndAlloc(); CHECK(ptr_->shandle.dptr); // When this is a view and a user wants the default layout, we can simply - // create a new mkldnn memory that points to the right memory. - std::shared_ptr mem( - new mkldnn::memory(desc, - CpuEngine::Get()->get_engine(), - static_cast(ptr_->shandle.dptr) + byte_offset_)); - MKLDNNStream::Get()->RegisterMem(mem); + // create a new dnnl memory that points to the right memory. + std::shared_ptr mem( + new dnnl::memory(desc, + CpuEngine::Get()->get_engine(), + static_cast(ptr_->shandle.dptr) + byte_offset_)); + DNNLStream::Get()->RegisterMem(mem); return mem.get(); } else if (IsView()) { // If this is a view and a user wants to write data to it with special - // a MKLDNN format, we should reorder the data in the array and return NULL. + // a DNNL format, we should reorder the data in the array and return NULL. // In this way, the user will create a new NDArray for the special format // and copy data back. ptr_->Reorder2Default(); return nullptr; } - if (ptr_->mkl_mem_) - CHECK(ptr_->mkl_mem_->GetDataHandle() == ptr_->shandle.dptr); - if (ptr_->mkl_mem_ && ptr_->mkl_mem_->GetDesc() == desc) { - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem()); - return GetMKLDNNExact(ptr_->mkl_mem_->GetRaw(), desc); + if (ptr_->dnnl_mem_) + CHECK(ptr_->dnnl_mem_->GetDataHandle() == ptr_->shandle.dptr); + if (ptr_->dnnl_mem_ && ptr_->dnnl_mem_->GetDesc() == desc) { + DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); + return GetDNNLExact(ptr_->dnnl_mem_->GetRaw(), desc); } CHECK(ptr_->shandle.size >= desc.get_size()); ptr_->CheckAndAlloc(desc.get_size()); - ptr_->mkl_mem_.reset(new MKLDNNMemory(desc, ptr_->shandle.dptr)); - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem()); - return ptr_->mkl_mem_->GetRaw(); + ptr_->dnnl_mem_.reset(new DNNLMemory(desc, ptr_->shandle.dptr)); + DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); + return ptr_->dnnl_mem_->GetRaw(); } -void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc) { +void NDArray::UpdateDNNLMemDesc(const dnnl::memory::desc& desc) { auto new_desc = desc; - auto this_dtype = get_mkldnn_type(dtype()); - new_desc.data.data_type = static_cast(this_dtype); - ptr_->mkl_mem_.reset(new MKLDNNMemory(new_desc, ptr_->shandle.dptr)); - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem()); + auto this_dtype = get_dnnl_type(dtype()); + new_desc.data.data_type = static_cast(this_dtype); + ptr_->dnnl_mem_.reset(new DNNLMemory(new_desc, ptr_->shandle.dptr)); + DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem()); } #endif @@ -919,8 +916,8 @@ void NDArray::SetTBlob() const { auto stype = storage_type(); if (stype == kDefaultStorage) { #if MXNET_USE_ONEDNN == 1 - CHECK(!IsMKLDNNData()) << "We can't generate TBlob for MKLDNN data. " - << "Please use Reorder2Default() to generate a new NDArray first"; + CHECK(!IsDNNLData()) << "We can't generate TBlob for DNNL data. 
" + << "Please use Reorder2Default() to generate a new NDArray first"; #endif dptr += byte_offset_; } else if (stype == kCSRStorage || stype == kRowSparseStorage) { @@ -1316,38 +1313,38 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext template inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) { #if MXNET_USE_ONEDNN == 1 - // If neither is MKLDNN, we can copy data normally. - if (!from.IsMKLDNNData() && !to.IsMKLDNNData()) { + // If neither is DNNL, we can copy data normally. + if (!from.IsDNNLData() && !to.IsDNNLData()) { #endif using namespace mshadow; CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; TBlob tmp = to.data(); ndarray::Copy(from.data(), &tmp, from.ctx(), to.ctx(), ctx); #if MXNET_USE_ONEDNN == 1 - } else if (SupportMKLDNN(from.dtype(), from.shape()) && SupportMKLDNN(to.dtype(), to.shape()) && + } else if (SupportDNNL(from.dtype(), from.shape()) && SupportDNNL(to.dtype(), to.shape()) && from.ctx().dev_mask() == cpu::kDevMask && to.ctx().dev_mask() == cpu::kDevMask) { // If we copy data directly, we need to make sure both NDArrays are supported - // by MKLDNN. - auto from_mem = from.GetMKLDNNData(); - auto to_mem = to.GetMKLDNNData(); + // by DNNL. + auto from_mem = from.GetDNNLData(); + auto to_mem = to.GetDNNLData(); if (from_mem->get_desc() == to_mem->get_desc()) { size_t size = std::min(from_mem->get_desc().get_size(), to_mem->get_desc().get_size()); memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); } else { const_cast(to).CopyFrom(*from_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } } else { - // In this case, one of the NDArray isn't supported by MKLDNN, we need - // to convert the MKLDNN array to the default format first and copy data + // In this case, one of the NDArray isn't supported by DNNL, we need + // to convert the DNNL array to the default format first and copy data // with Copy(). NDArray tmp_from = from; - if (tmp_from.IsMKLDNNData()) { + if (tmp_from.IsDNNLData()) { // TODO(zhengda) tmp_from should be cached. 
tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype()); - auto tmp_mem = from.GetMKLDNNData(); + auto tmp_mem = from.GetDNNLData(); tmp_from.CopyFrom(*tmp_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } CHECK(tmp_from.IsDefaultData()); CHECK(to.IsDefaultData()); @@ -1896,7 +1893,7 @@ void NDArray::Save(dmlc::Stream* strm) const { this->WaitToRead(); nd_cpu = *this; #if MXNET_USE_ONEDNN == 1 - if (nd_cpu.IsMKLDNNData()) + if (nd_cpu.IsDNNLData()) nd_cpu = nd_cpu.Reorder2Default(); #endif save_data = nd_cpu.data(); @@ -2346,7 +2343,7 @@ void NDArray::SyncCopyToCPU(void* data, size_t size) const { RunContext rctx{this->ctx(), nullptr, nullptr, false}; NDArray src = *this; #if MXNET_USE_ONEDNN == 1 - if (src.IsMKLDNNData()) + if (src.IsDNNLData()) src = this->Reorder2Default(); #endif ndarray::Copy(src.data(), &dst, Context::CPU(), Context::CPU(), rctx); diff --git a/src/operator/contrib/batch_norm_relu.cc b/src/operator/contrib/batch_norm_relu.cc index 93ccbd9db38b..d223c65cf4ec 100644 --- a/src/operator/contrib/batch_norm_relu.cc +++ b/src/operator/contrib/batch_norm_relu.cc @@ -28,7 +28,7 @@ #include "../elemwise_op_common.h" #include "../operator_common.h" #if MXNET_USE_ONEDNN == 1 -#include "../nn/mkldnn/mkldnn_batch_norm-inl.h" +#include "../nn/dnnl/dnnl_batch_norm-inl.h" #endif namespace mxnet { @@ -130,7 +130,7 @@ static bool BatchNormWithReLUType(const nnvm::NodeAttrs& attrs, } #if MXNET_USE_ONEDNN == 1 -static inline bool SupportMKLDNNBNReLU(const NDArray& input, const BatchNormParam& param) { +static inline bool SupportDNNLBNReLU(const NDArray& input, const BatchNormParam& param) { if (mxnet::op::batchnorm::disable_mkl) return false; const mxnet::TShape shape = input.shape(); @@ -139,7 +139,7 @@ static inline bool SupportMKLDNNBNReLU(const NDArray& input, const BatchNormPara return false; const int dtype = input.dtype(); return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) && - SupportStorageMKLDNN(input.storage_type()); + SupportStorageDNNL(input.storage_type()); } void BatchNormWithReLUComputeExCPU(const nnvm::NodeAttrs& attrs, @@ -150,15 +150,15 @@ void BatchNormWithReLUComputeExCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 5U); const BatchNormParam& param = nnvm::get(attrs.parsed); bool fuse_relu = true; - if (SupportMKLDNNBNReLU(inputs[0], param)) { + if (SupportDNNLBNReLU(inputs[0], param)) { CHECK_GT(outputs.size(), 3U); - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNN_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, { - MKLDNNBatchNormForward(attrs, ctx, inputs, req, outputs, fuse_relu); + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNL_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, { + DNNLBatchNormForward(attrs, ctx, inputs, req, outputs, fuse_relu); }); return; } - LOG(FATAL) << "BatchNormWithReLU operator only supports MKL-DNN Backend."; + LOG(FATAL) << "BatchNormWithReLU operator only supports DNNL Backend."; } void BatchNormWithReLUGradComputeExCPU(const nnvm::NodeAttrs& attrs, @@ -168,13 +168,13 @@ void BatchNormWithReLUGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const BatchNormParam& param = nnvm::get(attrs.parsed); bool fuse_relu = true; - if (SupportMKLDNNBNReLU(inputs[0], param)) { + if (SupportDNNLBNReLU(inputs[0], param)) { CHECK_EQ(inputs.size(), 9U); - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNBatchNormBackward(attrs, ctx, inputs, req, outputs, fuse_relu); + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, 
outputs); + DNNLBatchNormBackward(attrs, ctx, inputs, req, outputs, fuse_relu); return; } - LOG(FATAL) << "BatchNormWithReLU operator only supports MKL-DNN Backend."; + LOG(FATAL) << "BatchNormWithReLU operator only supports DNNL Backend."; } #endif @@ -188,9 +188,9 @@ static inline bool BatchNormWithReLUStorageType(const nnvm::NodeAttrs& attrs, bool dispatched = false; #if MXNET_USE_ONEDNN == 1 if (!dispatched) { - dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + dispatched = DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } - if (!MKLDNNEnvSet()) { + if (!DNNLEnvSet()) { *dispatch_mode = DispatchMode::kFComputeFallback; } #else @@ -288,7 +288,7 @@ An extented operator of Batch normalization which can fuse ReLU activation. #endif .set_attr("FGradient", BatchNormWithReLUGrad) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; @@ -322,7 +322,7 @@ NNVM_REGISTER_OP(_backward_contrib_BatchNormWithReLU) [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", BatchNormWithReLUGradComputeExCPU) #endif .set_attr_parser(ParamParser); diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc index 799311b07679..dd331ade231c 100644 --- a/src/operator/leaky_relu.cc +++ b/src/operator/leaky_relu.cc @@ -25,8 +25,8 @@ #include "./leaky_relu-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "./nn/mkldnn/mkldnn_base-inl.h" -#include "./nn/mkldnn/mkldnn_ops-inl.h" +#include "./nn/dnnl/dnnl_base-inl.h" +#include "./nn/dnnl/dnnl_ops-inl.h" #endif // MXNET_USE_ONEDNN == 1 #include @@ -95,10 +95,10 @@ static void LeakyReLUComputeExCPU(const nnvm::NodeAttrs& attrs, const LeakyReLUParam& param = nnvm::get(attrs.parsed); size_t expected = param.act_type == leakyrelu::kPReLU ? 
2 : 1; CHECK_EQ(inputs.size(), expected); - if (SupportMKLDNNLeakyRelu(param, inputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNLeakyReluForward, attrs, ctx, inputs[0], req[0], outputs[0]); - MKLDNN_OPCHECK_RUN(LeakyReLUCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLLeakyRelu(param, inputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLLeakyReluForward, attrs, ctx, inputs[0], req[0], outputs[0]); + DNNL_OPCHECK_RUN(LeakyReLUCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(LeakyReLUCompute, attrs, ctx, inputs, req, outputs); @@ -112,11 +112,11 @@ void LeakyReLUGradComputeExCPU(const nnvm::NodeAttrs& attrs, if (inputs[0].shape().Size() == 0U) return; const LeakyReLUParam& param = nnvm::get(attrs.parsed); - if (SupportMKLDNNLeakyRelu(param, inputs[0])) { + if (SupportDNNLLeakyRelu(param, inputs[0])) { std::vector in_data{inputs[0], inputs[1]}; - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNLeakyReluBackward, attrs, ctx, in_data, req, outputs); - MKLDNN_OPCHECK_RUN(LeakyReLUGradCompute, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLRun(DNNLLeakyReluBackward, attrs, ctx, in_data, req, outputs); + DNNL_OPCHECK_RUN(LeakyReLUGradCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(LeakyReLUGradCompute, attrs, ctx, inputs, req, outputs); @@ -130,8 +130,8 @@ inline static bool LeakyReLUStorageType(const nnvm::NodeAttrs& attrs, const LeakyReLUParam& param = nnvm::get(attrs.parsed); size_t expected = param.act_type == leakyrelu::kPReLU ? 2 : 1; CHECK_EQ(in_attrs->size(), expected); - return MKLDNNStorageType( - attrs, dev_mask, SupportMKLDNNLeakyRelu(param), dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType( + attrs, dev_mask, SupportDNNLLeakyRelu(param), dispatch_mode, in_attrs, out_attrs); } inline static bool BackwardLeakyReLUStorageType(const nnvm::NodeAttrs& attrs, @@ -140,8 +140,8 @@ inline static bool BackwardLeakyReLUStorageType(const nnvm::NodeAttrs& attrs, std::vector* in_attrs, std::vector* out_attrs) { const LeakyReLUParam& param = nnvm::get(attrs.parsed); - return MKLDNNStorageType( - attrs, dev_mask, SupportMKLDNNLeakyRelu(param), dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType( + attrs, dev_mask, SupportDNNLLeakyRelu(param), dispatch_mode, in_attrs, out_attrs); } #endif // MXNET_USE_ONEDNN == 1 @@ -197,7 +197,7 @@ The following modified ReLU Activation functions are supported: .set_attr("FInferType", LeakyReLUType) .set_attr("FCompute", LeakyReLUCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", LeakyReLUComputeExCPU) #endif .set_attr("FGradient", ElemwiseGradUseInOut{"_backward_LeakyReLU"}) @@ -248,7 +248,7 @@ NNVM_REGISTER_OP(_backward_LeakyReLU) }) .set_attr_parser(ParamParser) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", LeakyReLUGradComputeExCPU) #endif .set_attr("FCompute", LeakyReLUGradCompute); diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 4efe4cd41262..a228bf8a7610 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -26,8 +26,8 @@ #include "../mshadow_op.h" #include "../tensor/elemwise_unary_op.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_base-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" +#include "./dnnl/dnnl_base-inl.h" +#include 
"./dnnl/dnnl_ops-inl.h" #endif // MXNET_USE_ONEDNN == 1 #include "../operator_common.h" #include "../../common/utils.h" @@ -112,10 +112,10 @@ static void ActivationComputeExCPU(const nnvm::NodeAttrs& attrs, const ActivationParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); - if (SupportMKLDNNAct(param, inputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNActivationForward, attrs, ctx, inputs[0], req[0], outputs[0]); - MKLDNN_OPCHECK_RUN(ActivationCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLAct(param, inputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLActivationForward, attrs, ctx, inputs[0], req[0], outputs[0]); + DNNL_OPCHECK_RUN(ActivationCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(ActivationComputeImpl, attrs, ctx, inputs, req, outputs); @@ -128,10 +128,10 @@ void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const ActivationParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), activation::GradNumInputs(param.act_type)); - if (SupportMKLDNNAct(param, inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNActivationBackward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(ActivationGradCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLAct(param, inputs[0])) { + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLRun(DNNLActivationBackward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(ActivationGradCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(ActivationGradComputeImpl, attrs, ctx, inputs, req, outputs); @@ -145,8 +145,8 @@ inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1); CHECK_EQ(out_attrs->size(), 1); const ActivationParam& param = nnvm::get(attrs.parsed); - return MKLDNNStorageType( - attrs, dev_mask, SupportMKLDNNAct(param), dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType( + attrs, dev_mask, SupportDNNLAct(param), dispatch_mode, in_attrs, out_attrs); } inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, @@ -156,8 +156,8 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { const ActivationParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), activation::GradNumInputs(param.act_type)); - return MKLDNNStorageType( - attrs, dev_mask, SupportMKLDNNAct(param), dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType( + attrs, dev_mask, SupportDNNLAct(param), dispatch_mode, in_attrs, out_attrs); } #endif // MXNET_USE_ONEDNN == 1 @@ -186,7 +186,7 @@ The following activation functions are supported: }) .set_attr("FCompute", ActivationCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", ActivationComputeExCPU) #endif .set_attr("FGradient", ActivationGrad{"_backward_Activation"}) @@ -216,7 +216,7 @@ NNVM_REGISTER_OP(_backward_Activation) #endif .set_attr_parser(ParamParser) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", ActivationGradComputeExCPU) #endif .set_attr("FCompute", ActivationGradCompute); diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 5a18363abe39..d3502b985b6f 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -30,7 +30,7 @@ 
#include "batch_norm-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_batch_norm-inl.h" +#include "./dnnl/dnnl_batch_norm-inl.h" #endif namespace mxnet { @@ -446,7 +446,7 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs, } #if MXNET_USE_ONEDNN == 1 -static inline bool SupportMKLDNNBN(const NDArray& input, const BatchNormParam& param) { +static inline bool SupportDNNLBN(const NDArray& input, const BatchNormParam& param) { if (mxnet::op::batchnorm::disable_mkl) return false; const mxnet::TShape shape = input.shape(); @@ -455,7 +455,7 @@ static inline bool SupportMKLDNNBN(const NDArray& input, const BatchNormParam& p return false; const int dtype = input.dtype(); return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) && - SupportStorageMKLDNN(input.storage_type()); + SupportStorageDNNL(input.storage_type()); } void BatchNormComputeExCPU(const nnvm::NodeAttrs& attrs, @@ -466,12 +466,12 @@ void BatchNormComputeExCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 5U); const BatchNormParam& param = nnvm::get(attrs.parsed); bool fuse_relu = false; - if (SupportMKLDNNBN(inputs[0], param)) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNN_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, { - MKLDNNBatchNormForward(attrs, ctx, inputs, req, outputs, fuse_relu); + if (SupportDNNLBN(inputs[0], param)) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNL_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, { + DNNLBatchNormForward(attrs, ctx, inputs, req, outputs, fuse_relu); }); - MKLDNN_OPCHECK_RUN(BatchNormCompute, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(BatchNormCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(BatchNormCompute, attrs, ctx, inputs, req, outputs); @@ -484,10 +484,10 @@ void BatchNormGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const BatchNormParam& param = nnvm::get(attrs.parsed); bool fuse_relu = false; - if (SupportMKLDNNBN(inputs[0], param)) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNBatchNormBackward(attrs, ctx, inputs, req, outputs, fuse_relu); - MKLDNN_OPCHECK_RUN(BatchNormGradCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLBN(inputs[0], param)) { + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLBatchNormBackward(attrs, ctx, inputs, req, outputs, fuse_relu); + DNNL_OPCHECK_RUN(BatchNormGradCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(BatchNormGradCompute, attrs, ctx, inputs, req, outputs); @@ -504,9 +504,9 @@ static inline bool BatchNormStorageType(const nnvm::NodeAttrs& attrs, bool dispatched = false; #if MXNET_USE_ONEDNN == 1 if (!dispatched) { - dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + dispatched = DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } - if (!MKLDNNEnvSet()) { + if (!DNNLEnvSet()) { *dispatch_mode = DispatchMode::kFComputeFallback; } #else @@ -648,7 +648,7 @@ then set ``gamma`` to 1 and its gradient to 0. 
#endif .set_attr("FGradient", BatchNormGrad) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) #endif .set_attr("FResourceRequest", [](const NodeAttrs& n) { @@ -687,7 +687,7 @@ NNVM_REGISTER_OP(_backward_BatchNorm) }) .set_attr_parser(ParamParser) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", BatchNormGradComputeExCPU) #endif .set_attr("FCompute", BatchNormGradCompute); diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 5bfe8cfa7463..580183f7970a 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -23,10 +23,10 @@ * \author Bing Xu */ -#include "./concat-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" -#include "./mkldnn/mkldnn_base-inl.h" #include "../../common/utils.h" +#include "./concat-inl.h" +#include "./dnnl/dnnl_base-inl.h" +#include "./dnnl/dnnl_ops-inl.h" namespace mxnet { namespace op { @@ -212,7 +212,7 @@ inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs, dispatched = dispatch_fallback(out_attrs, dispatch_mode); } #if MXNET_USE_ONEDNN == 1 - if (!MKLDNNEnvSet()) + if (!DNNLEnvSet()) *dispatch_mode = DispatchMode::kFComputeFallback; #endif // MXNET_USE_ONEDNN == 1 return dispatched; @@ -234,13 +234,13 @@ inline static bool BackwardConcatStorageType(const nnvm::NodeAttrs& attrs, #endif // MXNET_USE_ONEDNN == 1 wanted_mode = DispatchMode::kFCompute; #if MXNET_USE_ONEDNN == 1 - if (!MKLDNNEnvSet()) + if (!DNNLEnvSet()) wanted_mode = DispatchMode::kFComputeFallback; #endif // MXNET_USE_ONEDNN == 1 return storage_type_assign(out_attrs, mxnet::kDefaultStorage, dispatch_mode, wanted_mode); } #if MXNET_USE_ONEDNN == 1 -bool SupportMKLDNNConcat(const std::vector& arrs) { +bool SupportDNNLConcat(const std::vector& arrs) { for (auto& arr : arrs) { if (arr.IsView()) return false; @@ -250,8 +250,8 @@ bool SupportMKLDNNConcat(const std::vector& arrs) { if (arr.shape().Size() == 0) return false; int ndim = arr.shape().ndim(); - const int mkldnn_ndims = arr.GetMKLDNNData()->get_desc().data.ndims; - if (!(ndim == 2 || ndim == 4) || ndim != mkldnn_ndims) + const int dnnl_ndims = arr.GetDNNLData()->get_desc().data.ndims; + if (!(ndim == 2 || ndim == 4) || ndim != dnnl_ndims) return false; } return true; @@ -271,10 +271,10 @@ static void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs, outputs[0].storage_type() == kCSRStorage) { ConcatCSRImpl(attrs, op_ctx, inputs, req, outputs); #if MXNET_USE_ONEDNN == 1 - } else if (SupportMKLDNNConcat(inputs)) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNConcatForward, attrs, op_ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(ConcatCompute, attrs, op_ctx, inputs, req, outputs); + } else if (SupportDNNLConcat(inputs)) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLConcatForward, attrs, op_ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(ConcatCompute, attrs, op_ctx, inputs, req, outputs); } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { FallBackCompute(ConcatCompute, attrs, op_ctx, inputs, req, outputs); #endif // MXNET_USE_ONEDNN == 1 @@ -289,10 +289,10 @@ static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (SupportMKLDNNConcat(inputs)) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNConcatBackward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(ConcatGradCompute, attrs, 
ctx, inputs, req, outputs); + if (SupportDNNLConcat(inputs)) { + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLRun(DNNLConcatBackward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(ConcatGradCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(ConcatGradCompute, attrs, ctx, inputs, req, outputs); @@ -390,7 +390,7 @@ Example:: return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("THasDeterministicOutput", true) - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) #endif // MXNET_USE_ONEDNN == 1 CONCAT_FORWARD_ATTRS.set_attr("FInferShape", ConcatShape) .add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") @@ -419,7 +419,7 @@ NNVM_REGISTER_OP(_backward_Concat) .set_attr("TIsBackward", true) .set_attr("FInferStorageType", BackwardConcatStorageType) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", ConcatGradComputeExCPU) #endif // MXNET_USE_ONEDNN == 1 .set_attr("FCompute", ConcatGradCompute); diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 7dcc2030ec12..0e054c0ff07f 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -27,8 +27,8 @@ #include "../elemwise_op_common.h" #include "../operator_common.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_base-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" +#include "./dnnl/dnnl_base-inl.h" +#include "./dnnl/dnnl_ops-inl.h" #endif // MXNET_USE_ONEDNN namespace mxnet { @@ -54,10 +54,10 @@ static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { const ConvolutionParam& params = nnvm::get(attrs.parsed); - if (SupportMKLDNNConv(params, inputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNConvolutionForward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(ConvolutionCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLConv(params, inputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLConvolutionForward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(ConvolutionCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(ConvolutionCompute, attrs, ctx, inputs, req, outputs); @@ -69,10 +69,10 @@ static void ConvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { const ConvolutionParam& params = nnvm::get(attrs.parsed); - if (SupportMKLDNNConv(params, inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNConvolutionBackward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(ConvolutionGradCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLConv(params, inputs[0])) { + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLRun(DNNLConvolutionBackward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(ConvolutionGradCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(ConvolutionGradCompute, attrs, ctx, inputs, req, outputs); @@ -319,7 +319,7 @@ inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), in_expected); CHECK_EQ(out_attrs->size(), 1); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs, @@ -333,7 +333,7 @@ 
inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), in_expected); CHECK_EQ(out_attrs->size(), out_expected); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } #endif @@ -507,7 +507,7 @@ There are other options to tune the performance. #endif .set_attr("FCompute", ConvolutionCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", ConvolutionComputeExCPU) #endif .set_attr("FGradient", ConvolutionGrad{"_backward_Convolution"}) @@ -540,7 +540,7 @@ NNVM_REGISTER_OP(_backward_Convolution) }) .set_attr_parser(ConvolutionParamParser) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", ConvolutionGradComputeExCPU) #endif .set_attr("FCompute", ConvolutionGradCompute); diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 2c7167271e76..86cde82765be 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -27,8 +27,8 @@ #include "../operator_common.h" #include "../../common/utils.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_base-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" +#include "./dnnl/dnnl_base-inl.h" +#include "./dnnl/dnnl_ops-inl.h" #endif // MXNET_USE_ONEDNN namespace mxnet { @@ -41,10 +41,10 @@ static void DeconvolutionComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { const DeconvolutionParam& params = nnvm::get(attrs.parsed); - if (SupportMKLDNNDeconv(params, inputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNDeconvolutionForward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(DeconvolutionCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLDeconv(params, inputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLDeconvolutionForward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(DeconvolutionCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(DeconvolutionCompute, attrs, ctx, inputs, req, outputs); @@ -56,10 +56,10 @@ static void DeconvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { const DeconvolutionParam& params = nnvm::get(attrs.parsed); - if (SupportMKLDNNDeconv(params, inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNDeconvolutionBackward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(DeconvolutionGradCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLDeconv(params, inputs[0])) { + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLRun(DNNLDeconvolutionBackward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(DeconvolutionGradCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(DeconvolutionGradCompute, attrs, ctx, inputs, req, outputs); @@ -75,7 +75,7 @@ inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), in_expected); CHECK_EQ(out_attrs->size(), 1); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs, @@ -89,7 +89,7 @@ inline static bool 
BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), in_expected); CHECK_EQ(out_attrs->size(), out_expected); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } #endif @@ -436,7 +436,7 @@ NNVM_REGISTER_OP(Deconvolution) .set_attr("FCompute", DeconvolutionCompute) .set_attr("FGradient", DeconvolutionGrad{"_backward_Deconvolution"}) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FInferStorageType", DeconvStorageType) .set_attr("FComputeEx", DeconvolutionComputeExCPU) #endif @@ -464,7 +464,7 @@ NNVM_REGISTER_OP(_backward_Deconvolution) }) .set_attr_parser(DeconvolutionParamParser) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FInferStorageType", BackwardDeconvStorageType) .set_attr("FComputeEx", DeconvolutionGradComputeExCPU) #endif diff --git a/src/operator/nn/dnnl/dnnl_act-inl.h b/src/operator/nn/dnnl/dnnl_act-inl.h new file mode 100644 index 000000000000..3c8c16b1558b --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_act-inl.h @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file dnnl_act-inl.h + * \brief DNNL Activation operator + * /author Zhiyuan Huang + */ + +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_ACT_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_ACT_INL_H_ + +#if MXNET_USE_ONEDNN == 1 +#include +#include + +#include "../../leaky_relu-inl.h" +#include "../activation-inl.h" + +namespace mxnet { +namespace op { + +struct DNNLActParam { + dnnl::algorithm alg; + float slope = 0.f; + + bool operator==(const DNNLActParam& other) const { + return this->alg == other.alg && this->slope == other.slope; + } +}; + +dnnl::algorithm GetDNNLActAlgo(const ActivationParam& param); +dnnl::algorithm GetDNNLActAlgo(const LeakyReLUParam& param); + +dnnl::eltwise_forward::primitive_desc GetActFwdDescImpl(const DNNLActParam& param, + bool is_train, + const dnnl::memory& input_mem); + +class DNNLActForward { + public: + const dnnl::eltwise_forward::primitive_desc fwd_pd; + + DNNLActForward(const DNNLActParam& param, + bool is_train, + const NDArray& data, + const dnnl::memory& mem) + : fwd_pd(GetActFwdDescImpl(param, is_train, mem)) { + fwd_ = std::make_shared(fwd_pd); + } + const inline dnnl::eltwise_forward& GetFwd() const; + + private: + std::shared_ptr fwd_; +}; + +typedef ParamOpSign DNNLActSignature; +DNNLActForward& GetActForward(const DNNLActParam& param, + const OpContext& ctx, + const NDArray& in_data, + const dnnl::memory& in_mem); + +dnnl::eltwise_backward::primitive_desc GetActBwdDescImpl(const DNNLActParam& param, + const dnnl::memory& input_mem, + const dnnl::memory& diff_dst_memory); + +class DNNLActBackward { + public: + const dnnl::eltwise_backward::primitive_desc bwd_pd; + + explicit DNNLActBackward(const DNNLActParam& param, + const NDArray& data, + const dnnl::memory& mem, + const dnnl::memory& diff_dst_memory) + : bwd_pd(GetActBwdDescImpl(param, mem, diff_dst_memory)) { + bwd_prim_ = std::make_shared(bwd_pd); + } + const inline dnnl::eltwise_backward& GetBwd() const; + + private: + std::shared_ptr bwd_prim_; +}; +} // namespace op +} // namespace mxnet + +namespace std { +template <> +struct hash { + size_t operator()(const mxnet::op::DNNLActParam& val) { + size_t ret = 0; + ret = dmlc::HashCombine(ret, static_cast(val.alg)); + ret = dmlc::HashCombine(ret, val.slope); + return ret; + } +}; +} // namespace std + +#endif // MXNET_USE_ONEDNN == 1 +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_ACT_INL_H_ diff --git a/src/operator/nn/dnnl/dnnl_act.cc b/src/operator/nn/dnnl/dnnl_act.cc new file mode 100644 index 000000000000..90a8fd0787f8 --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_act.cc @@ -0,0 +1,321 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file dnnl_act.cc + * \brief + * \author Da Zheng + */ + +#if MXNET_USE_ONEDNN == 1 + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../../operator_common.h" +#include "./dnnl_base-inl.h" +#include "dnnl_act-inl.h" + +namespace mxnet { +namespace op { + +bool SupportDNNLAct(const ActivationParam& param) { + return param.act_type == activation::kReLU || param.act_type == activation::kSigmoid || + param.act_type == activation::kLogSigmoid || param.act_type == activation::kMish || + param.act_type == activation::kSoftReLU || param.act_type == activation::kTanh; +} + +bool SupportDNNLAct(const ActivationParam& param, const NDArray& input) { + // DNNL Activation supports 1d, 2d, 3d, 4d and 5d data layout + if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) || + !(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16)) + return false; + return SupportDNNLAct(param); +} + +bool SupportDNNLLeakyRelu(const LeakyReLUParam& param) { + return param.act_type == leakyrelu::kLeakyReLU || param.act_type == leakyrelu::kELU || + param.act_type == leakyrelu::kGELU; +} + +bool SupportDNNLLeakyRelu(const LeakyReLUParam& param, const NDArray& input) { + // DNNL Activation supports 1d, 2d, 3d, 4d and 5d data layout + if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) || + !(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16)) + return false; + return SupportDNNLLeakyRelu(param); +} + +bool SupportQuantizedDNNLAct(const ActivationParam& param) { + // TODO(zhennan): Add more activation type when dnnl supports. + // Remove this when it's identity to SupportDNNLAct. + return param.act_type == activation::kReLU; +} + +dnnl::algorithm GetDNNLActAlgo(const ActivationParam& param) { + switch (param.act_type) { + case activation::kReLU: + return dnnl::algorithm::eltwise_relu; + case activation::kSigmoid: + return dnnl::algorithm::eltwise_logistic; + case activation::kLogSigmoid: + return dnnl::algorithm::eltwise_logsigmoid; + case activation::kMish: + return dnnl::algorithm::eltwise_mish; + case activation::kTanh: + return dnnl::algorithm::eltwise_tanh; + case activation::kSoftReLU: + return dnnl::algorithm::eltwise_soft_relu; + default: + LOG(FATAL) << "unknown activation type"; + return dnnl::algorithm::eltwise_relu; + } +} + +dnnl::algorithm GetDNNLActAlgo(const LeakyReLUParam& param) { + switch (param.act_type) { + case leakyrelu::kLeakyReLU: + return dnnl::algorithm::eltwise_relu; + case leakyrelu::kELU: + return dnnl::algorithm::eltwise_elu; + case leakyrelu::kGELU: + return dnnl::algorithm::eltwise_gelu_erf; + default: + LOG(FATAL) << "unknown activation type for LeakyReLU: " << param.act_type; + return dnnl::algorithm::eltwise_relu; + } +} + +dnnl::eltwise_forward::primitive_desc GetActFwdDescImpl(const DNNLActParam& param, + bool is_train, + const dnnl::memory& input_mem) { + dnnl::memory::desc data_md = input_mem.get_desc(); + auto cpu_engine = CpuEngine::Get()->get_engine(); + auto alg = param.alg; + + auto prop = is_train ? 
dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring; + auto desc = dnnl::eltwise_forward::desc(prop, alg, data_md, param.slope); + return dnnl::eltwise_forward::primitive_desc(desc, cpu_engine); +} + +const inline dnnl::eltwise_forward& DNNLActForward::GetFwd() const { + return *fwd_; +} + +DNNLActForward& GetActForward(const DNNLActParam& param, + const OpContext& ctx, + const NDArray& in_data, + const dnnl::memory& in_mem) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map fwds; +#else + static MX_THREAD_LOCAL std::unordered_map fwds; +#endif + DNNLActSignature key(param); + key.AddSign(ctx.is_train); + key.AddSign(static_cast(param.alg)); + key.AddSign(param.slope); + key.AddSign(in_data); + auto it = fwds.find(key); + if (it == fwds.end()) { + DNNLActForward fwd(param, ctx.is_train, in_data, in_mem); + it = AddToCache(&fwds, key, fwd); + } + return it->second; +} + +void DNNLActivationForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data) { + const ActivationParam& param = nnvm::get(attrs.parsed); + DNNLActParam param_; + param_.alg = GetDNNLActAlgo(param); + const NDArray& in_buffer = in_data; + DNNLStream* stream = DNNLStream::Get(); + auto input_mem = in_buffer.GetDNNLData(); + DNNLActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem); + auto out_mem_t = CreateDNNLMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer); + stream->RegisterPrimArgs(fwd.GetFwd(), + {{DNNL_ARG_SRC, *input_mem}, {DNNL_ARG_DST, *out_mem_t.second}}); + CommitOutput(out_data, out_mem_t); + stream->Submit(); +} + +void DNNLLeakyReluForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data) { + const LeakyReLUParam& param = nnvm::get(attrs.parsed); + DNNLActParam param_; + param_.alg = GetDNNLActAlgo(param); + param_.slope = param.slope; + + NDArray in_buffer = in_data; + DNNLStream* stream = DNNLStream::Get(); + + if (in_data.IsView() && in_data.IsDNNLData()) + in_buffer = in_data.Reorder2Default(); + + auto input_mem = in_buffer.GetDNNLData(); + DNNLActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem); + auto out_mem_t = CreateDNNLMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer); + stream->RegisterPrimArgs(fwd.GetFwd(), + {{DNNL_ARG_SRC, *input_mem}, {DNNL_ARG_DST, *out_mem_t.second}}); + CommitOutput(out_data, out_mem_t); + stream->Submit(); +} + +dnnl::eltwise_backward::primitive_desc GetActBwdDescImpl(const DNNLActParam& param, + const dnnl::memory& input_mem, + const dnnl::memory& diff_dst_memory) { + dnnl::memory::desc data_md = input_mem.get_desc(); + dnnl::memory::desc diff_md = diff_dst_memory.get_desc(); + auto cpu_engine = CpuEngine::Get()->get_engine(); + auto alg = param.alg; + + dnnl::eltwise_forward::desc fw_desc(dnnl::prop_kind::forward_training, alg, data_md, param.slope); + dnnl::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine); + dnnl::eltwise_backward::desc bw_desc(alg, diff_md, data_md, param.slope); + dnnl::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc); + return bw_pdesc; +} + +const inline dnnl::eltwise_backward& DNNLActBackward::GetBwd() const { + return *bwd_prim_; +} + +static inline DNNLActBackward& GetActBackward(const DNNLActParam& param, + const OpContext& ctx, + const NDArray& in_data, + const NDArray& out_grad, + const dnnl::memory& in_mem) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local 
std::unordered_map bwds; +#else + static MX_THREAD_LOCAL std::unordered_map bwds; +#endif + DNNLActSignature key(param); + key.AddSign(in_data); + key.AddSign(out_grad); + + auto it = bwds.find(key); + if (it == bwds.end()) { + DNNLActBackward bwd(param, in_data, in_mem, *out_grad.GetDNNLData()); + it = AddToCache(&bwds, key, bwd); + } + return it->second; +} + +// For backward relu activation, it's okay to pass "out_data" as "in_data" to this +// function, since the computation only involes non-zeros. +void DNNLActivationBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) { + return; + } + const ActivationParam& param = nnvm::get(attrs.parsed); + // XXX: for y = relu(x), y is passed as "in_data" to Backward() + const bool relu = param.act_type == activation::kReLU; + const NDArray& out_buffer = inputs[0]; + const NDArray& in_buffer = relu ? inputs[1] : inputs[2]; + const NDArray& in_grad = outputs[0]; + DNNLActParam param_; + param_.alg = GetDNNLActAlgo(param); + TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]); + auto diff_dst_memory = out_buffer.GetDNNLData(); + auto input_mem = in_buffer.GetDNNLData(); + // We need to make sure the two inputs to eltwise_backward has the same memory + // descriptor. Otherwise, the perf will suffer. + if (input_mem->get_desc() != diff_dst_memory->get_desc()) { + input_mem = in_buffer.GetDNNLDataReorder(diff_dst_memory->get_desc()); + } + + DNNLActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem); + DNNLStream* stream = DNNLStream::Get(); + dnnl_args_map_t args = {{DNNL_ARG_SRC, *input_mem}, {DNNL_ARG_DIFF_DST, *diff_dst_memory}}; + if (req[0] != kAddTo) { + // req[0] is kWriteTo or kWriteInplace + auto diff_src_memory = const_cast(in_grad).CreateDNNLData(bwd.bwd_pd.diff_src_desc()); + args.insert({DNNL_ARG_DIFF_SRC, *diff_src_memory}); + stream->RegisterPrimArgs(bwd.GetBwd(), args); + stream->Submit(); + } else { + auto diff_src_memory = CreateDNNLMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]); + args.insert({DNNL_ARG_DIFF_SRC, *diff_src_memory.second}); + stream->RegisterPrimArgs(bwd.GetBwd(), args); + CommitOutput(in_grad, diff_src_memory); + stream->Submit(); + } +} + +void DNNLLeakyReluBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) { + return; + } + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + const NDArray& out_buffer = inputs[0]; + const NDArray& in_buffer = inputs[1]; + const NDArray& output = outputs[0]; + + const LeakyReLUParam& param = nnvm::get(attrs.parsed); + DNNLActParam param_; + param_.alg = GetDNNLActAlgo(param); + param_.slope = param.slope; + + TmpMemMgr::Get()->Init(ctx.requested[leakyrelu::kRandom]); + auto diff_dst_memory = out_buffer.GetDNNLData(); + auto input_mem = in_buffer.GetDNNLData(); + // We need to make sure the two inputs to eltwise_backward has the same memory + // descriptor. Otherwise, the perf will suffer. 
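As the comment notes (and the check just below enforces), both inputs to eltwise_backward should share one memory descriptor, and the backward primitive is built from a forward "hint" primitive_desc. A self-contained sketch of that setup, written against the v2.x-style desc API this file targets (names are illustrative, not this PR's helpers):

```cpp
// Standalone sketch of the eltwise forward-hint + backward primitive_desc setup
// used above, against the oneDNN v2.x desc-based API.
#include <dnnl.hpp>

dnnl::eltwise_backward::primitive_desc make_relu_bwd_pd(const dnnl::engine& eng,
                                                        const dnnl::memory::desc& data_md,
                                                        const dnnl::memory::desc& diff_md,
                                                        float slope = 0.f) {
  // Backward primitives need the matching forward primitive_desc as a hint.
  dnnl::eltwise_forward::desc fwd_desc(dnnl::prop_kind::forward_training,
                                       dnnl::algorithm::eltwise_relu, data_md, slope);
  dnnl::eltwise_forward::primitive_desc fwd_pd(fwd_desc, eng);
  dnnl::eltwise_backward::desc bwd_desc(dnnl::algorithm::eltwise_relu,
                                        diff_md, data_md, slope);
  return dnnl::eltwise_backward::primitive_desc(bwd_desc, eng, fwd_pd);
}
```

Reordering the source memory to diff_dst's descriptor before execution keeps both inputs in one layout, which is exactly what the GetDNNLDataReorder call immediately below does.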
+ if (input_mem->get_desc() != diff_dst_memory->get_desc()) + input_mem = in_buffer.GetDNNLDataReorder(diff_dst_memory->get_desc()); + DNNLActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem); + DNNLStream* stream = DNNLStream::Get(); + dnnl_output_t diff_src_memory = CreateDNNLMem(output, bwd.bwd_pd.diff_src_desc(), req[0]); + dnnl_args_map_t args = { + {DNNL_ARG_SRC, *input_mem}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory.second}, + }; + stream->RegisterPrimArgs(bwd.GetBwd(), args); + CommitOutput(output, diff_src_memory); + stream->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/dnnl/dnnl_base-inl.h similarity index 52% rename from src/operator/nn/mkldnn/mkldnn_base-inl.h rename to src/operator/nn/dnnl/dnnl_base-inl.h index cf7c9b1f5d62..d0a48715693d 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/dnnl/dnnl_base-inl.h @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * \file mkldnn_base-inl.h + * \file dnnl_base-inl.h * \brief * \author young.jin.kim@intel.com * ashok.emani@intel.com @@ -24,8 +24,8 @@ * *******************************************************************************/ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_BASE_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_BASE_INL_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -36,13 +36,13 @@ #include #include -#include "mkldnn.hpp" +#include "dnnl.hpp" #include "mxnet/graph_attr_types.h" #include "mxnet/ndarray.h" #include "mxnet/op_attr_types.h" #include "mxnet/resource.h" -#define MKLDNN_REAL_TYPE_SWITCH(type, DType, ...) \ +#define DNNL_REAL_TYPE_SWITCH(type, DType, ...) \ switch (type) { \ case mshadow::kFloat32: { \ typedef float DType; \ @@ -64,7 +64,7 @@ class CpuEngine { public: static CpuEngine* Get() { // I's thread-safe in C++11. 
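CpuEngine (continued below) is a Meyers singleton: the function-local static is initialized exactly once even with concurrent callers, so every thread shares a single CPU engine. A minimal sketch of the same idea outside MXNet, with illustrative names:

```cpp
// Sketch of the Meyers-singleton pattern CpuEngine relies on; every memory and
// primitive in the process is tied to this one engine. Names are illustrative.
#include <dnnl.hpp>

dnnl::engine& global_cpu_engine() {
  static dnnl::engine eng(dnnl::engine::kind::cpu, 0);  // constructed once, thread-safe in C++11
  return eng;
}

dnnl::memory make_f32_scalar() {
  dnnl::memory::desc md({1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::a);
  return dnnl::memory(md, global_cpu_engine());  // engine owns no data; memory allocates here
}
```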
- // ensure same mkldnn engine is used across threads + // ensure same dnnl engine is used across threads static CpuEngine myInstance; return &myInstance; } @@ -73,16 +73,16 @@ class CpuEngine { CpuEngine& operator=(CpuEngine const&) = delete; // Copy assign CpuEngine& operator=(CpuEngine&&) = delete; // Move assign - mkldnn::engine& get_engine() { + dnnl::engine& get_engine() { return _cpu_engine; } protected: - CpuEngine() : _cpu_engine(mkldnn::engine::kind::cpu, 0) {} + CpuEngine() : _cpu_engine(dnnl::engine::kind::cpu, 0) {} ~CpuEngine() {} private: - mkldnn::engine _cpu_engine; + dnnl::engine _cpu_engine; }; // type enumerator @@ -91,30 +91,30 @@ struct data_type_enum {}; template <> struct data_type_enum { - enum { type = static_cast(mkldnn::memory::data_type::f32) }; + enum { type = static_cast(dnnl::memory::data_type::f32) }; }; template <> struct data_type_enum { - enum { type = static_cast(mkldnn::memory::data_type::bf16) }; + enum { type = static_cast(dnnl::memory::data_type::bf16) }; }; template <> struct data_type_enum { - enum { type = static_cast(mkldnn::memory::data_type::s32) }; + enum { type = static_cast(dnnl::memory::data_type::s32) }; }; template <> struct data_type_enum { - enum { type = static_cast(mkldnn::memory::data_type::s8) }; + enum { type = static_cast(dnnl::memory::data_type::s8) }; }; template <> struct data_type_enum { - enum { type = static_cast(mkldnn::memory::data_type::u8) }; + enum { type = static_cast(dnnl::memory::data_type::u8) }; }; -static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape& shape) { +static inline bool SupportDNNLArray(int dtype, const mxnet::TShape& shape) { int ndim = shape.ndim(); bool support = ndim == 1 || ndim == 2 || ndim == 4; support = support && @@ -123,37 +123,37 @@ static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape& shape) { return support; } -static inline bool SupportStorageMKLDNN(int stype) { +static inline bool SupportStorageDNNL(int stype) { return stype == kDefaultStorage; } -static inline bool SupportMKLDNN(int dtype, const mxnet::TShape& shape) { +static inline bool SupportDNNL(int dtype, const mxnet::TShape& shape) { int ndim = shape.ndim(); if (ndim == 0 || shape.Size() == 0) { - // MKLDNN currently does not support 0-dim Tensor and 0-size Tensor + // DNNL currently does not support 0-dim Tensor and 0-size Tensor return false; } return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) && (ndim == 1 || ndim == 2 || ndim == 4); } -static inline bool IsMKLDNNType(int dtype) { +static inline bool IsDNNLType(int dtype) { return dtype == mshadow::kFloat32 || dtype == mshadow::kInt8 || dtype == mshadow::kUint8 || dtype == mshadow::kBfloat16; } -static inline bool SupportMKLDNN(const NDArray& input) { - return SupportMKLDNN(input.dtype(), input.shape()) && SupportStorageMKLDNN(input.storage_type()); +static inline bool SupportDNNL(const NDArray& input) { + return SupportDNNL(input.dtype(), input.shape()) && SupportStorageDNNL(input.storage_type()); } -static inline bool MKLDNNEnvSet() { - static bool is_mkldnn_enabled = dmlc::GetEnv("MXNET_ONEDNN_ENABLED", true); - return is_mkldnn_enabled; +static inline bool DNNLEnvSet() { + static bool is_dnnl_enabled = dmlc::GetEnv("MXNET_ONEDNN_ENABLED", true); + return is_dnnl_enabled; } -static inline int GetMKLDNNCacheSize() { - static int mkldnn_cache_size = dmlc::GetEnv("MXNET_ONEDNN_CACHE_NUM", -1); - return mkldnn_cache_size; +static inline int GetDNNLCacheSize() { + static int dnnl_cache_size = dmlc::GetEnv("MXNET_ONEDNN_CACHE_NUM", 
-1); + return dnnl_cache_size; } // TODO(alex): (MXNET-1075) Will remove env variable and calculate cache size during runtime @@ -161,8 +161,8 @@ template static typename std::unordered_map::iterator AddToCache(std::unordered_map* cache, const S& key, const I& item) { - int mkldnn_cache_size = GetMKLDNNCacheSize(); - if (mkldnn_cache_size != -1 && static_cast(cache->size()) > mkldnn_cache_size) + int dnnl_cache_size = GetDNNLCacheSize(); + if (dnnl_cache_size != -1 && static_cast(cache->size()) > dnnl_cache_size) cache->erase(cache->begin()); auto ins_return = cache->insert(std::pair(key, item)); CHECK(ins_return.second); @@ -184,22 +184,20 @@ struct SoftmaxOutputParam; struct TransposeParam; struct ReshapeParam; struct LayerNormParam; -bool SupportMKLDNNAct(const ActivationParam& param); -bool SupportMKLDNNAct(const ActivationParam& param, const NDArray& input); -bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param); -bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray& input); -bool SupportQuantizedMKLDNNAct(const ActivationParam& param); -bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input); -bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input); -bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray& input, const NDArray& output); -bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param, - const NDArray& input, - const NDArray& output); -bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam& param); -bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray& data); -bool SupportMKLDNNBatchDot(const std::vector& inputs, const NDArray& output); -bool SupportMKLDNNLayerNorm(const LayerNormParam& param, const std::vector& inputs); -bool SupportMKLDNNReshape(const NDArray& input, const NDArray& output); +bool SupportDNNLAct(const ActivationParam& param); +bool SupportDNNLAct(const ActivationParam& param, const NDArray& input); +bool SupportDNNLLeakyRelu(const LeakyReLUParam& param); +bool SupportDNNLLeakyRelu(const LeakyReLUParam& param, const NDArray& input); +bool SupportQuantizedDNNLAct(const ActivationParam& param); +bool SupportDNNLConv(const ConvolutionParam& params, const NDArray& input); +bool SupportDNNLDeconv(const DeconvolutionParam& params, const NDArray& input); +bool SupportDNNLSoftmax(const SoftmaxParam& param, const NDArray& input, const NDArray& output); +bool SupportDNNLLogSoftmax(const SoftmaxParam& param, const NDArray& input, const NDArray& output); +bool SupportDNNLSoftmaxOutput(const SoftmaxOutputParam& param); +bool SupportDNNLTranspose(const TransposeParam& param, const NDArray& data); +bool SupportDNNLBatchDot(const std::vector& inputs, const NDArray& output); +bool SupportDNNLLayerNorm(const LayerNormParam& param, const std::vector& inputs); +bool SupportDNNLReshape(const NDArray& input, const NDArray& output); } // namespace op static int GetTypeSize(int dtype) { @@ -209,64 +207,64 @@ static int GetTypeSize(int dtype) { } static inline size_t GetArraySize(const NDArray& arr) { - if (arr.IsMKLDNNData()) { - return arr.GetMKLDNNData()->get_desc().get_size(); + if (arr.IsDNNLData()) { + return arr.GetDNNLData()->get_desc().get_size(); } return arr.shape().Size() * GetTypeSize(arr.dtype()); } -static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { +static inline dnnl::memory::data_type get_dnnl_type(int dtype) { switch (dtype) { case mshadow::kFloat32: - return mkldnn::memory::data_type::f32; + return dnnl::memory::data_type::f32; case 
mshadow::kBfloat16: - return mkldnn::memory::data_type::bf16; + return dnnl::memory::data_type::bf16; case mshadow::kInt32: - return mkldnn::memory::data_type::s32; + return dnnl::memory::data_type::s32; case mshadow::kInt8: - return mkldnn::memory::data_type::s8; + return dnnl::memory::data_type::s8; case mshadow::kUint8: - return mkldnn::memory::data_type::u8; + return dnnl::memory::data_type::u8; default: - LOG(FATAL) << "unknown type for MKLDNN :" << static_cast(dtype); - return mkldnn::memory::data_type::undef; + LOG(FATAL) << "unknown type for DNNL :" << static_cast(dtype); + return dnnl::memory::data_type::undef; } } template -static inline mkldnn::memory::data_type get_mkldnn_type() { - return static_cast(data_type_enum::type); +static inline dnnl::memory::data_type get_dnnl_type() { + return static_cast(data_type_enum::type); } -static inline mkldnn_data_type_t get_mkldnn_type_t(int dtype) { - return static_cast(get_mkldnn_type(dtype)); +static inline dnnl_data_type_t get_dnnl_type_t(int dtype) { + return static_cast(get_dnnl_type(dtype)); } template -static inline mkldnn_data_type_t get_mkldnn_type_t() { - return static_cast(data_type_enum::type); +static inline dnnl_data_type_t get_dnnl_type_t() { + return static_cast(data_type_enum::type); } -static inline int get_mxnet_type(mkldnn_data_type_t dtype) { - auto mkldnn_dtype = static_cast(dtype); - switch (mkldnn_dtype) { - case mkldnn::memory::data_type::f32: +static inline int get_mxnet_type(dnnl_data_type_t dtype) { + auto dnnl_dtype = static_cast(dtype); + switch (dnnl_dtype) { + case dnnl::memory::data_type::f32: return mshadow::kFloat32; - case mkldnn::memory::data_type::bf16: + case dnnl::memory::data_type::bf16: return mshadow::kBfloat16; - case mkldnn::memory::data_type::s32: + case dnnl::memory::data_type::s32: return mshadow::kInt32; - case mkldnn::memory::data_type::s8: + case dnnl::memory::data_type::s8: return mshadow::kInt8; - case mkldnn::memory::data_type::u8: + case dnnl::memory::data_type::u8: return mshadow::kUint8; default: - LOG(FATAL) << "unknown MKLDNN type"; + LOG(FATAL) << "unknown DNNL type"; return mshadow::kFloat32; } } -static inline size_t GetMemDescSize(const mkldnn::memory::desc& md) { +static inline size_t GetMemDescSize(const dnnl::memory::desc& md) { if (md.data.ndims == 0) return 0; @@ -279,53 +277,53 @@ static inline size_t GetMemDescSize(const mkldnn::memory::desc& md) { return ret; } -inline static mkldnn::memory::desc GetMemDesc(const NDArray& arr, int dtype = -1) { +inline static dnnl::memory::desc GetMemDesc(const NDArray& arr, int dtype = -1) { int ndim = arr.shape().ndim(); - mkldnn::memory::dims dims(ndim); + dnnl::memory::dims dims(ndim); dtype = (dtype == -1) ? 
arr.dtype() : dtype; for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i]; - return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any}; + return dnnl::memory::desc{dims, get_dnnl_type(dtype), dnnl::memory::format_tag::any}; } -inline static bool ChooseBRGEMMImpl(const mkldnn::memory::dims& weight_dims, size_t batch_size) { +inline static bool ChooseBRGEMMImpl(const dnnl::memory::dims& weight_dims, size_t batch_size) { // Conditions based on measurement results done on CLX8280 // https://github.com/apache/incubator-mxnet/pull/20533 return weight_dims[0] >= 1024 && weight_dims[1] >= 1024 && batch_size >= 16384 && weight_dims[0] % 64 == 0 && weight_dims[1] % 64 == 0; } -inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray& arr, - size_t batch_size, - int dtype = -1) { +inline static dnnl::memory::desc GetFCWeightDesc(const NDArray& arr, + size_t batch_size, + int dtype = -1) { int ndim = arr.shape().ndim(); - mkldnn::memory::dims dims(ndim); + dnnl::memory::dims dims(ndim); dtype = (dtype == -1) ? arr.dtype() : dtype; for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i]; - auto format = mkldnn::memory::format_tag::any; + auto format = dnnl::memory::format_tag::any; // for batch 256 alexnet benchmark test const bool force_fc_ab_format = dmlc::GetEnv("MXNET_ONEDNN_FORCE_FC_AB_FORMAT", false); if (dims.size() == 2) { if (force_fc_ab_format || !ChooseBRGEMMImpl(dims, batch_size)) { - format = mkldnn::memory::format_tag::ab; + format = dnnl::memory::format_tag::ab; } } - return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), format}; + return dnnl::memory::desc{dims, get_dnnl_type(dtype), format}; } -inline static mkldnn::memory::desc GetWeightDesc(const NDArray& arr, - int num_groups, - bool quantized = false) { +inline static dnnl::memory::desc GetWeightDesc(const NDArray& arr, + int num_groups, + bool quantized = false) { int dtype = quantized ? 
mshadow::kInt8 : arr.dtype(); if (num_groups == 1) { return GetMemDesc(arr, dtype); } else { const auto ndim = arr.shape().ndim(); CHECK((ndim == 3) || (ndim == 4) || (ndim == 5)) - << "MKL-DNN weight currently supports 3d or 4d or 5d layout"; - auto tz = mkldnn::memory::dims{0}; + << "DNNL weight currently supports 3d or 4d or 5d layout"; + auto tz = dnnl::memory::dims{0}; int N = 0, C = 1, H = 2, W = 3; int D = -1; if (ndim == 5) { @@ -335,39 +333,39 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray& arr, } switch (ndim) { case 3: - tz = mkldnn::memory::dims{ + tz = dnnl::memory::dims{ num_groups, arr.shape()[N] / num_groups, arr.shape()[C], arr.shape()[H]}; break; case 4: - tz = mkldnn::memory::dims{num_groups, - arr.shape()[N] / num_groups, - arr.shape()[C], - arr.shape()[H], - arr.shape()[W]}; + tz = dnnl::memory::dims{num_groups, + arr.shape()[N] / num_groups, + arr.shape()[C], + arr.shape()[H], + arr.shape()[W]}; break; case 5: - tz = mkldnn::memory::dims{num_groups, - arr.shape()[N] / num_groups, - arr.shape()[C], - arr.shape()[D], - arr.shape()[H], - arr.shape()[W]}; + tz = dnnl::memory::dims{num_groups, + arr.shape()[N] / num_groups, + arr.shape()[C], + arr.shape()[D], + arr.shape()[H], + arr.shape()[W]}; } - return mkldnn::memory::desc{tz, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any}; + return dnnl::memory::desc{tz, get_dnnl_type(dtype), dnnl::memory::format_tag::any}; } } -inline static bool CheckMKLDNNInputArrayIsView(const std::vector& inputs) { +inline static bool CheckDNNLInputArrayIsView(const std::vector& inputs) { for (const auto& in : inputs) { - if (in.IsView() && in.IsMKLDNNData()) { + if (in.IsView() && in.IsDNNLData()) { return true; } } return false; } -typedef std::shared_ptr mkldnn_mem_ptr; -typedef std::shared_ptr mkldnn_mem_const_ptr; +typedef std::shared_ptr dnnl_mem_ptr; +typedef std::shared_ptr dnnl_mem_const_ptr; /* * This is to manage the temporary memory provided by MXNet for operators. @@ -388,7 +386,7 @@ class TmpMemMgr { size_t curr_size; // This estimate the required temp memory size in an operator. size_t est_size; - const size_t alignment = kMKLDNNAlign; + const size_t alignment = kDNNLAlign; public: static TmpMemMgr* Get() { @@ -428,26 +426,26 @@ class TmpMemMgr { this->est_size = 0; } - mkldnn::memory* Alloc(const mkldnn::memory::desc& md); + dnnl::memory* Alloc(const dnnl::memory::desc& md); }; -typedef std::unordered_map mkldnn_args_map_t; -class MKLDNNStream { - std::vector > net_prim_args; +typedef std::unordered_map dnnl_args_map_t; +class DNNLStream { + std::vector > net_prim_args; // Here we hold all memory related to the operators in the stream. - std::vector > mem_holder; - mkldnn::stream s; + std::vector > mem_holder; + dnnl::stream s; public: - static MKLDNNStream* Get(); + static DNNLStream* Get(); - MKLDNNStream() : s(CpuEngine::Get()->get_engine()) {} + DNNLStream() : s(CpuEngine::Get()->get_engine()) {} - void RegisterPrimArgs(const mkldnn::primitive& prim, const mkldnn_args_map_t& args) { + void RegisterPrimArgs(const dnnl::primitive& prim, const dnnl_args_map_t& args) { net_prim_args.emplace_back(prim, args); } - void RegisterMem(std::shared_ptr mem) { + void RegisterMem(std::shared_ptr mem) { mem_holder.push_back(mem); } @@ -456,9 +454,9 @@ class MKLDNNStream { } /* - * After submitting mkldnn operations for execution, we need to + * After submitting dnnl operations for execution, we need to * clean up memory held by the stream. 
However, sometimes users - * might want to separate mkldnn execution and memory cleanup. + * might want to separate dnnl execution and memory cleanup. */ void Submit(bool cleanup = true) { if (!net_prim_args.empty()) { @@ -483,62 +481,61 @@ enum OutDataOp { AddBack, }; -typedef std::pair mkldnn_output_t; -void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem); +typedef std::pair dnnl_output_t; +void DNNLMemoryCopy(const dnnl::memory& mem, const dnnl::memory* this_mem); /* - * Here we want to get MKLDNN memory whose desc is exactly the same as + * Here we want to get DNNL memory whose desc is exactly the same as * the given one. operator== can't guarantee that. == can return true even if * the formats are different. I need to double check its format. */ -static inline mkldnn::memory* GetMKLDNNExact(const mkldnn::memory* mem, - const mkldnn::memory::desc& desc) { - mkldnn::memory::desc src_desc = mem->get_desc(); +static inline dnnl::memory* GetDNNLExact(const dnnl::memory* mem, const dnnl::memory::desc& desc) { + dnnl::memory::desc src_desc = mem->get_desc(); if (desc == src_desc) { - return const_cast(mem); + return const_cast(mem); } else { - std::shared_ptr ret( - new mkldnn::memory(desc, CpuEngine::Get()->get_engine(), mem->get_data_handle())); - MKLDNNStream::Get()->RegisterMem(ret); + std::shared_ptr ret( + new dnnl::memory(desc, CpuEngine::Get()->get_engine(), mem->get_data_handle())); + DNNLStream::Get()->RegisterMem(ret); return ret.get(); } } /* - * These two functions try to create MKLDNN memory in an NDArray based on `req'. - * The difference is that the first function can create MKLDNN memory with - * special layouts in an NDArray, while the second one can only create MKLDNN + * These two functions try to create DNNL memory in an NDArray based on `req'. + * The difference is that the first function can create DNNL memory with + * special layouts in an NDArray, while the second one can only create DNNL * memory with default layouts. * Also an optional in_arr parameter can be passed in the first function with - * the kWriteInPlace req to validate if mkldnn can support write in place; + * the kWriteInPlace req to validate if dnnl can support write in place; * otherwise new memory will be written to an copied back onto out_arr. * If these two functions are used, we have to call CommitOutput to write * the output back to the output NDArray. */ -mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr, - const mkldnn::memory::desc& desc, - OpReqType req, - const NDArray* in_arr = nullptr); -mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr, - const mkldnn::memory::desc& desc, - OpReqType req); +dnnl_output_t CreateDNNLMem(const NDArray& out_arr, + const dnnl::memory::desc& desc, + OpReqType req, + const NDArray* in_arr = nullptr); +dnnl_output_t CreateDNNLWeightGrad(const NDArray& out_arr, + const dnnl::memory::desc& desc, + OpReqType req); /* This function has to be used with one of the functions above. 
*/ -void CommitOutput(const NDArray& arr, const mkldnn_output_t& res); +void CommitOutput(const NDArray& arr, const dnnl_output_t& res); -const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups); +const dnnl::memory* GetWeights(const NDArray& arr, int num_groups); -const mkldnn::memory* GetWeights(const NDArray& arr, - const mkldnn::memory::desc& target_md, - int num_groups); +const dnnl::memory* GetWeights(const NDArray& arr, + const dnnl::memory::desc& target_md, + int num_groups); -bool IsDefaultFormat(const mkldnn::memory::desc& desc); -bool IsMKLDNN(const mkldnn::memory::desc& desc); +bool IsDefaultFormat(const dnnl::memory::desc& desc); +bool IsDNNL(const dnnl::memory::desc& desc); -mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& md); -mkldnn_format_tag_t GetDefaultFormat(int num_dims); -mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& md, const mkldnn_format_tag_t& format); +dnnl_format_tag_t GetDefaultFormat(const dnnl::memory::desc& md); +dnnl_format_tag_t GetDefaultFormat(int num_dims); +dnnl::memory::desc GetDesc(const dnnl::memory::desc& md, const dnnl_format_tag_t& format); -inline bool same_shape(const mxnet::TShape& shape, const mkldnn_dims_t dims, int ndims) { +inline bool same_shape(const mxnet::TShape& shape, const dnnl_dims_t dims, int ndims) { if (shape.ndim() != ndims) return false; for (int i = 0; i < ndims; i++) @@ -547,7 +544,7 @@ inline bool same_shape(const mxnet::TShape& shape, const mkldnn_dims_t dims, int return true; } -inline bool same_shape(const mkldnn::memory::desc& desc1, const mkldnn::memory::desc& desc2) { +inline bool same_shape(const dnnl::memory::desc& desc1, const dnnl::memory::desc& desc2) { if (desc1.data.ndims != desc2.data.ndims) return false; for (int i = 0; i < desc1.data.ndims; i++) @@ -556,28 +553,28 @@ inline bool same_shape(const mkldnn::memory::desc& desc1, const mkldnn::memory:: return true; } -inline bool same_shape(const mxnet::TShape& shape, int dtype, const mkldnn::memory::desc& desc) { +inline bool same_shape(const mxnet::TShape& shape, int dtype, const dnnl::memory::desc& desc) { return same_shape(shape, desc.data.dims, desc.data.ndims) && - get_mkldnn_type(dtype) == desc.data.data_type; + get_dnnl_type(dtype) == desc.data.data_type; } /* - * There is a large overhead of getting mkldnn::memory::desc from - * mkldnn::memory. This class is created to cache the metadata of mkldnn memory + * There is a large overhead of getting dnnl::memory::desc from + * dnnl::memory. This class is created to cache the metadata of dnnl memory * to provide a much more lightweight method to access them. */ -class MKLDNNMemory { - std::shared_ptr mem; - mkldnn::memory::desc desc; +class DNNLMemory { + std::shared_ptr mem; + dnnl::memory::desc desc; size_t size; // The number of bytes. 
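GetDNNLExact above hands out a second dnnl::memory that aliases the data handle of an existing one under a different descriptor, and DNNLMemory caches exactly that kind of metadata. A minimal standalone sketch of the aliasing trick, assuming plain oneDNN (<dnnl.hpp>) outside of MXNet; the dimensions and the NCHW/NHWC pair are illustrative only:

#include <vector>

#include <dnnl.hpp>

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::memory::dims dims = {2, 3, 4, 5};

  // Plain NCHW descriptor and a caller-owned buffer large enough for it.
  dnnl::memory::desc nchw_md(dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nchw);
  std::vector<float> buf(nchw_md.get_size() / sizeof(float));

  // First view of the buffer.
  dnnl::memory nchw_mem(nchw_md, eng, buf.data());

  // Second view: same bytes under a different logical layout. Only metadata changes,
  // which is why such a memory object can be returned without copying any data.
  dnnl::memory::desc nhwc_md(dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nhwc);
  dnnl::memory nhwc_view(nhwc_md, eng, nchw_mem.get_data_handle());

  return nchw_mem.get_data_handle() == nhwc_view.get_data_handle() ? 0 : 1;
}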
public: - MKLDNNMemory(mkldnn::memory::desc md, void* addr) : desc(md) { - mem.reset(new mkldnn::memory(md, CpuEngine::Get()->get_engine(), addr)); + DNNLMemory(dnnl::memory::desc md, void* addr) : desc(md) { + mem.reset(new dnnl::memory(md, CpuEngine::Get()->get_engine(), addr)); size = desc.get_size(); } - explicit MKLDNNMemory(std::shared_ptr mem) : desc(mem->get_desc()) { + explicit DNNLMemory(std::shared_ptr mem) : desc(mem->get_desc()) { this->mem = mem; size = desc.get_size(); } @@ -590,11 +587,11 @@ class MKLDNNMemory { return mem->get_data_handle(); } - std::shared_ptr GetMem() const { + std::shared_ptr GetMem() const { return mem; } - mkldnn::memory* GetRaw() const { + dnnl::memory* GetRaw() const { return mem.get(); } @@ -602,31 +599,31 @@ class MKLDNNMemory { return size; } - mkldnn::memory::desc GetDesc() const { + dnnl::memory::desc GetDesc() const { return mem->get_desc(); } - mkldnn::memory::desc GetDesc( - mkldnn_format_tag_t format, - mkldnn::memory::data_type data_type = mkldnn::memory::data_type::undef) const { - mkldnn::memory::dims dims(desc.data.dims, desc.data.dims + desc.data.ndims); - mkldnn::memory::data_type cpp_type = - (data_type == mkldnn::memory::data_type::undef) - ? static_cast(desc.data.data_type) + dnnl::memory::desc GetDesc( + dnnl_format_tag_t format, + dnnl::memory::data_type data_type = dnnl::memory::data_type::undef) const { + dnnl::memory::dims dims(desc.data.dims, desc.data.dims + desc.data.ndims); + dnnl::memory::data_type cpp_type = + (data_type == dnnl::memory::data_type::undef) + ? static_cast(desc.data.data_type) : data_type; - mkldnn::memory::desc data_md(dims, cpp_type, static_cast(format)); + dnnl::memory::desc data_md(dims, cpp_type, static_cast(format)); return data_md; } - mkldnn_format_tag_t GetDefaultFormat() const { + dnnl_format_tag_t GetDefaultFormat() const { return mxnet::GetDefaultFormat(desc); } - bool IsMKLDNN() const { - return mxnet::IsMKLDNN(desc); + bool IsDNNL() const { + return mxnet::IsDNNL(desc); } - bool SameFormat(mkldnn::memory::desc md) const { + bool SameFormat(dnnl::memory::desc md) const { return mem->get_desc() == md; } @@ -634,14 +631,14 @@ class MKLDNNMemory { return same_shape(shape, dtype, desc); } - void ReorderTo(mkldnn::memory* other) const { - mkldnn::stream s(CpuEngine::Get()->get_engine()); - mkldnn::reorder(*mem, *other).execute(s, *mem, *other); + void ReorderTo(dnnl::memory* other) const { + dnnl::stream s(CpuEngine::Get()->get_engine()); + dnnl::reorder(*mem, *other).execute(s, *mem, *other); } }; -// reorder mkldnn src to dst format dtype -void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst); +// reorder dnnl src to dst format dtype +void ReorderTo(const dnnl::memory* src, const dnnl::memory* dst); template void FallBackCompute(Compute fn, @@ -652,7 +649,7 @@ void FallBackCompute(Compute fn, const std::vector& outputs); /* - * This class is used to check the correctness of MKLDNN operators. + * This class is used to check the correctness of DNNL operators. 
*/ class OpCheck { std::vector inputs; @@ -679,39 +676,39 @@ class OpCheck { void CopyResult(const std::vector& outputs_, const std::vector& indice); }; -bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - bool support_mkldnn, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs); - -#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \ - static bool debug = dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false); \ - OpCheck check(backward, num_checks); \ - if (debug) \ - check.Init(inputs, outputs); +bool DNNLStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + bool support_dnnl, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs); -#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \ +#define DNNL_OPCHECK_INIT(backward, num_checks, inputs, outputs) \ + static bool debug = dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false); \ + OpCheck check(backward, num_checks); \ if (debug) \ + check.Init(inputs, outputs); + +#define DNNL_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \ + if (debug) \ check.Run(fn, attrs, ctx, inputs, req, outputs); -#define MKLDNN_OPCHECK_COPY_RESULT(outputs, indice) \ - if (debug) \ +#define DNNL_OPCHECK_COPY_RESULT(outputs, indice) \ + if (debug) \ check.CopyResult(outputs, indice); -struct MKLDNNPostEltwiseParam { - mkldnn::algorithm alg = mkldnn::algorithm::undef; - float scale = 1.f; - float alpha = 0.f; - float beta = 1.f; +struct DNNLPostEltwiseParam { + dnnl::algorithm alg = dnnl::algorithm::undef; + float scale = 1.f; + float alpha = 0.f; + float beta = 1.f; }; -void MKLDNNRun(mxnet::FComputeEx fn, - const nnvm::NodeAttrs& attrs, - const mxnet::OpContext& ctx, - const std::vector& inputs_, - const std::vector& req, - const std::vector& outputs_); +void DNNLRun(mxnet::FComputeEx fn, + const nnvm::NodeAttrs& attrs, + const mxnet::OpContext& ctx, + const std::vector& inputs_, + const std::vector& req, + const std::vector& outputs_); using FComputeExUnary = std::function; -void MKLDNNRun(FComputeExUnary fn, - const nnvm::NodeAttrs& attrs, - const mxnet::OpContext& ctx, - const mxnet::NDArray& inputs_, - const mxnet::OpReqType& req, - const mxnet::NDArray& outputs_); +void DNNLRun(FComputeExUnary fn, + const nnvm::NodeAttrs& attrs, + const mxnet::OpContext& ctx, + const mxnet::NDArray& inputs_, + const mxnet::OpReqType& req, + const mxnet::NDArray& outputs_); } // namespace mxnet #endif -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_BASE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/dnnl/dnnl_base.cc similarity index 57% rename from src/operator/nn/mkldnn/mkldnn_base.cc rename to src/operator/nn/dnnl/dnnl_base.cc index 5415e9e6eaea..d1e8918c3bde 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/dnnl/dnnl_base.cc @@ -21,19 +21,18 @@ #include -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" - #include "../../../common/exec_utils.h" #include "../../operator_common.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" namespace mxnet { -MKLDNNStream* MKLDNNStream::Get() { +DNNLStream* DNNLStream::Get() { #if DMLC_CXX11_THREAD_LOCAL - static thread_local MKLDNNStream stream; + static thread_local DNNLStream stream; #else - static MX_THREAD_LOCAL MKLDNNStream stream; + static MX_THREAD_LOCAL DNNLStream stream; #endif return &stream; } @@ -56,15 +55,15 @@ void* AlignMem(void* mem, size_t size, size_t alignment, size_t* space) { return 
reinterpret_cast(addr); } -mkldnn::memory* TmpMemMgr::Alloc(const mkldnn::memory::desc& md) { +dnnl::memory* TmpMemMgr::Alloc(const dnnl::memory::desc& md) { // We need to include the size of the memory used for alignment. this->est_size += md.get_size() + alignment; void* mem = AlignMem(this->curr_mem, md.get_size(), alignment, &this->curr_size); if (mem) { // The memory is allocated from the temporary memory space in the // operator. It'll only become invalid after we exit from the operator. - mkldnn_mem_ptr ret(new mkldnn::memory(md, CpuEngine::Get()->get_engine(), mem)); - MKLDNNStream::Get()->RegisterMem(ret); + dnnl_mem_ptr ret(new dnnl::memory(md, CpuEngine::Get()->get_engine(), mem)); + DNNLStream::Get()->RegisterMem(ret); CHECK_EQ(mem, mem); this->curr_size -= md.get_size(); this->curr_mem = static_cast(mem) + md.get_size(); @@ -73,170 +72,163 @@ mkldnn::memory* TmpMemMgr::Alloc(const mkldnn::memory::desc& md) { // If curr_mem has been initialized and we still reach here, it means the current // allocated memory isn't enough. But it doesn't matter for multiple invokes of a // operator, as the TmpMemMgr could estimate the space at the first iteration and - // then re-requests abundant space from MXNet resource. MKL-DNN could allocate + // then re-requests abundant space from MXNet resource. DNNL could allocate // the space by itself. Thus, we just let it continue for estimating the maximum // required space size. It will be allocated at next call. if (this->curr_mem && dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false)) { - LOG(WARNING) << "mkl-dnn debug message: The rest of the temporary space is not " - << "adequate for allocating " << md.get_size() << " bytes. Thus, mkl-dnn " + LOG(WARNING) << "DNNL debug message: The rest of the temporary space is not " + << "adequate for allocating " << md.get_size() << " bytes. Thus, DNNL " << "allocate the space by itself."; } - mkldnn_mem_ptr ret(new mkldnn::memory(md, CpuEngine::Get()->get_engine())); - MKLDNNStream::Get()->RegisterMem(ret); + dnnl_mem_ptr ret(new dnnl::memory(md, CpuEngine::Get()->get_engine())); + DNNLStream::Get()->RegisterMem(ret); return ret.get(); } } -void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem) { - MKLDNNStream* stream = MKLDNNStream::Get(); - mkldnn::memory::desc from_desc = mem.get_desc(); - mkldnn::memory::desc this_desc = this_mem->get_desc(); - mkldnn_format_tag_t from_def_format = GetDefaultFormat(from_desc); - mkldnn_format_tag_t this_def_format = GetDefaultFormat(this_desc); +void DNNLMemoryCopy(const dnnl::memory& mem, const dnnl::memory* this_mem) { + DNNLStream* stream = DNNLStream::Get(); + dnnl::memory::desc from_desc = mem.get_desc(); + dnnl::memory::desc this_desc = this_mem->get_desc(); + dnnl_format_tag_t from_def_format = GetDefaultFormat(from_desc); + dnnl_format_tag_t this_def_format = GetDefaultFormat(this_desc); if (!same_shape(this_desc, from_desc) && IsDefaultFormat(from_desc)) { - // In this case, we can simply create a new MKLDNN memory for the required + // In this case, we can simply create a new DNNL memory for the required // shape. 
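TmpMemMgr::Alloc above carves aligned dnnl::memory objects out of MXNet's temporary workspace and lets oneDNN allocate on its own when the workspace runs out. A standalone sketch of that bookkeeping, assuming <dnnl.hpp> and a 64-byte alignment in place of kDNNLAlign (both are assumptions for the example):

#include <cstddef>
#include <memory>
#include <vector>

#include <dnnl.hpp>

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::memory::desc md({8, 16}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);

  const std::size_t alignment = 64;                       // stand-in for kDNNLAlign
  std::vector<char> scratch(md.get_size() + alignment);   // operator-provided temp space
  void* ptr = scratch.data();
  std::size_t space = scratch.size();

  // std::align returns nullptr when the remaining space cannot hold an aligned block;
  // in that case the fallback is to let oneDNN allocate the buffer itself.
  dnnl::memory mem = std::align(alignment, md.get_size(), ptr, space)
                         ? dnnl::memory(md, eng, ptr)   // placed inside the scratch buffer
                         : dnnl::memory(md, eng);       // library-owned allocation
  return mem.get_data_handle() == nullptr;
}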
- mkldnn::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims); - auto this_dtype = static_cast(this_desc.data.data_type); - mkldnn::memory::desc data_md( - dims, this_dtype, static_cast(this_def_format)); + dnnl::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims); + auto this_dtype = static_cast(this_desc.data.data_type); + dnnl::memory::desc data_md( + dims, this_dtype, static_cast(this_def_format)); - mkldnn_mem_ptr tmp_mem(new mkldnn::memory(data_md, mem.get_engine(), mem.get_data_handle())); + dnnl_mem_ptr tmp_mem(new dnnl::memory(data_md, mem.get_engine(), mem.get_data_handle())); stream->RegisterMem(tmp_mem); - std::unordered_map args( - {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}}); - stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args); + std::unordered_map args( + {{DNNL_ARG_FROM, *tmp_mem}, {DNNL_ARG_TO, *this_mem}}); + stream->RegisterPrimArgs(dnnl::reorder(*tmp_mem, *this_mem), args); } else if (!same_shape(this_desc, from_desc)) { // In this case, the source memory stores data in a customized layout. We // need to reorganize the data in memory before we can reshape. - mkldnn::memory::desc def_desc = GetDesc(from_desc, from_def_format); - mkldnn::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc); - std::unordered_map args( - {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *def_mem}}); - stream->RegisterPrimArgs(mkldnn::reorder(mem, *def_mem), args); + dnnl::memory::desc def_desc = GetDesc(from_desc, from_def_format); + dnnl::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc); + std::unordered_map args({{DNNL_ARG_FROM, mem}, {DNNL_ARG_TO, *def_mem}}); + stream->RegisterPrimArgs(dnnl::reorder(mem, *def_mem), args); // Now we can reshape it - mkldnn_mem_ptr tmp_mem( - new mkldnn::memory(this_desc, mem.get_engine(), def_mem->get_data_handle())); + dnnl_mem_ptr tmp_mem(new dnnl::memory(this_desc, mem.get_engine(), def_mem->get_data_handle())); stream->RegisterMem(tmp_mem); - args = {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}}; - stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args); + args = {{DNNL_ARG_FROM, *tmp_mem}, {DNNL_ARG_TO, *this_mem}}; + stream->RegisterPrimArgs(dnnl::reorder(*tmp_mem, *this_mem), args); } else if (this_desc == from_desc) { - std::unordered_map args( - {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}}); + std::unordered_map args({{DNNL_ARG_FROM, mem}, {DNNL_ARG_TO, *this_mem}}); // If the layout is the same, we can just copy data. - stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args); + stream->RegisterPrimArgs(dnnl::reorder(mem, *this_mem), args); } else { // If both are not using the default layouts. There isn't much we can do, // other than reorder data layout directly. if (!IsDefaultFormat(this_desc) && !IsDefaultFormat(from_desc)) { - std::unordered_map args( - {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}}); - stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args); + std::unordered_map args({{DNNL_ARG_FROM, mem}, {DNNL_ARG_TO, *this_mem}}); + stream->RegisterPrimArgs(dnnl::reorder(mem, *this_mem), args); } else if (IsDefaultFormat(this_desc)) { // If the dest mem uses the default memory layout, we can simply use // the default format of the source memory to improve perf of reorder. 
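Every branch of DNNLMemoryCopy ultimately queues a dnnl::reorder driven by DNNL_ARG_FROM / DNNL_ARG_TO arguments; the same primitive covers both layout changes and plain copies. A standalone sketch of that pattern, executed eagerly here instead of being registered on DNNLStream (dimensions and layouts are illustrative):

#include <dnnl.hpp>

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  dnnl::memory::dims dims = {1, 8, 4, 4};
  dnnl::memory::desc src_md(dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nchw);
  dnnl::memory::desc dst_md(dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nhwc);

  dnnl::memory src(src_md, eng);  // oneDNN owns both buffers in this sketch
  dnnl::memory dst(dst_md, eng);

  // The layout change (or a plain copy, when the descriptors match) happens at execution time.
  dnnl::reorder(src, dst).execute(strm, {{DNNL_ARG_FROM, src}, {DNNL_ARG_TO, dst}});
  strm.wait();
  return 0;
}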
- mkldnn::memory::desc desc = GetDesc(from_desc, from_def_format); - mkldnn_mem_ptr tmp_mem( - new mkldnn::memory(desc, mem.get_engine(), this_mem->get_data_handle())); + dnnl::memory::desc desc = GetDesc(from_desc, from_def_format); + dnnl_mem_ptr tmp_mem(new dnnl::memory(desc, mem.get_engine(), this_mem->get_data_handle())); stream->RegisterMem(tmp_mem); - std::unordered_map args( - {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *tmp_mem}}); - stream->RegisterPrimArgs(mkldnn::reorder(mem, *tmp_mem), args); + std::unordered_map args({{DNNL_ARG_FROM, mem}, {DNNL_ARG_TO, *tmp_mem}}); + stream->RegisterPrimArgs(dnnl::reorder(mem, *tmp_mem), args); } else { // If the src mem uses the default memory layout, we can use // the default format of the source memory to improve perf. - mkldnn::memory::desc desc = GetDesc(this_desc, this_def_format); - mkldnn_mem_ptr tmp_mem( - new mkldnn::memory(desc, this_mem->get_engine(), mem.get_data_handle())); + dnnl::memory::desc desc = GetDesc(this_desc, this_def_format); + dnnl_mem_ptr tmp_mem(new dnnl::memory(desc, this_mem->get_engine(), mem.get_data_handle())); stream->RegisterMem(tmp_mem); - std::unordered_map args( - {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}}); - stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args); + std::unordered_map args( + {{DNNL_ARG_FROM, *tmp_mem}, {DNNL_ARG_TO, *this_mem}}); + stream->RegisterPrimArgs(dnnl::reorder(*tmp_mem, *this_mem), args); } } } -bool CanWriteTo(const NDArray& out_arr, const NDArray& in_arr, const mkldnn::memory::desc& desc) { - auto in_mem = in_arr.GetMKLDNNData(); - bool add_same = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle(); - bool pdesc_same = out_arr.GetMKLDNNData()->get_desc() == desc && in_mem->get_desc() == desc; +bool CanWriteTo(const NDArray& out_arr, const NDArray& in_arr, const dnnl::memory::desc& desc) { + auto in_mem = in_arr.GetDNNLData(); + bool add_same = in_mem->get_data_handle() == out_arr.GetDNNLData()->get_data_handle(); + bool pdesc_same = out_arr.GetDNNLData()->get_desc() == desc && in_mem->get_desc() == desc; return add_same && pdesc_same; } -mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr, - const mkldnn::memory::desc& desc, - OpReqType req, - const NDArray* in_arr) { +dnnl_output_t CreateDNNLMem(const NDArray& out_arr, + const dnnl::memory::desc& desc, + OpReqType req, + const NDArray* in_arr) { if (kAddTo == req) { auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::AddBack, tmp); + return dnnl_output_t(OutDataOp::AddBack, tmp); } else if (kWriteInplace == req && in_arr != nullptr && CanWriteTo(out_arr, *in_arr, desc)) { - mkldnn::memory* mem = const_cast(out_arr).CreateMKLDNNData(desc); - // mem is nullptr if out_arr is view and desc is MKLDNN format. - // need to Reorder2Default before calling CreateMKLDNNMem + dnnl::memory* mem = const_cast(out_arr).CreateDNNLData(desc); + // mem is nullptr if out_arr is view and desc is DNNL format. 
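CreateDNNLMem, CommitOutput and DNNLStream::Submit form one contract: the operator asks for output memory according to `req`, queues its primitive, commits the (possibly temporary) result, and only then submits the stream. A schematic sketch of that call sequence using the helpers from dnnl_base-inl.h shown earlier; it assumes MXNet's internal types and an already-built primitive, so it is not compilable on its own:

// Hypothetical wrapper, for illustration only.
void RunSomeDNNLOp(const mxnet::NDArray& in,
                   const mxnet::NDArray& out,
                   mxnet::OpReqType req,
                   const dnnl::primitive& prim,
                   const dnnl::memory::desc& out_desc) {
  // 1. Request output memory according to `req`. Depending on the request this returns
  //    either the NDArray's own memory (Noop) or a temporary buffer tagged CopyBack/AddBack.
  mxnet::dnnl_output_t out_mem = mxnet::CreateDNNLMem(out, out_desc, req, &in);

  // 2. Queue the primitive on the thread-local stream.
  mxnet::DNNLStream::Get()->RegisterPrimArgs(
      prim, {{DNNL_ARG_SRC, *in.GetDNNLData()}, {DNNL_ARG_DST, *out_mem.second}});

  // 3. Copy or accumulate the temporary result back into the output NDArray if needed...
  mxnet::CommitOutput(out, out_mem);

  // 4. ...and execute everything that was queued.
  mxnet::DNNLStream::Get()->Submit();
}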
+ // need to Reorder2Default before calling CreateDNNLMem CHECK(mem != nullptr); - return mkldnn_output_t(OutDataOp::Noop, mem); + return dnnl_output_t(OutDataOp::Noop, mem); } else if (kWriteInplace == req) { auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::CopyBack, tmp); + return dnnl_output_t(OutDataOp::CopyBack, tmp); } else if (kWriteTo == req) { - mkldnn::memory* mem = const_cast(out_arr).CreateMKLDNNData(desc); + dnnl::memory* mem = const_cast(out_arr).CreateDNNLData(desc); if (nullptr == mem) { auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::CopyBack, tmp); + return dnnl_output_t(OutDataOp::CopyBack, tmp); } - return mkldnn_output_t(OutDataOp::Noop, mem); + return dnnl_output_t(OutDataOp::Noop, mem); } auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::Noop, tmp); + return dnnl_output_t(OutDataOp::Noop, tmp); } -mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr, - const mkldnn::memory::desc& desc, - OpReqType req) { +dnnl_output_t CreateDNNLWeightGrad(const NDArray& out_arr, + const dnnl::memory::desc& desc, + OpReqType req) { if (kAddTo == req) { auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::AddBack, tmp); + return dnnl_output_t(OutDataOp::AddBack, tmp); } else if (kWriteInplace == req) { auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::CopyBack, tmp); + return dnnl_output_t(OutDataOp::CopyBack, tmp); } else { - mkldnn::memory* mem = nullptr; + dnnl::memory* mem = nullptr; if (IsDefaultFormat(desc)) { - mem = const_cast(out_arr).CreateMKLDNNData(desc); + mem = const_cast(out_arr).CreateDNNLData(desc); } if (mem == nullptr) { auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::CopyBack, tmp); + return dnnl_output_t(OutDataOp::CopyBack, tmp); } else { - return mkldnn_output_t(OutDataOp::Noop, mem); + return dnnl_output_t(OutDataOp::Noop, mem); } } } -void CommitOutput(const NDArray& arr, const mkldnn_output_t& res) { +void CommitOutput(const NDArray& arr, const dnnl_output_t& res) { if (res.first == CopyBack) { const_cast(arr).CopyFrom(*res.second); } else if (res.first == AddBack) { auto res_memory = res.second; - auto target_pd = arr.GetMKLDNNData()->get_desc(); - auto mem = arr.GetMKLDNNData(res.second->get_desc()); + auto target_pd = arr.GetDNNLData()->get_desc(); + auto mem = arr.GetDNNLData(res.second->get_desc()); if (mem == nullptr) { auto tmp_memory = TmpMemMgr::Get()->Alloc(target_pd); - MKLDNNMemoryCopy(*res_memory, tmp_memory); + DNNLMemoryCopy(*res_memory, tmp_memory); res_memory = tmp_memory; - mem = arr.GetMKLDNNData(); + mem = arr.GetDNNLData(); } - op::MKLDNNSum(*mem, *res_memory, *mem); + op::DNNLSum(*mem, *res_memory, *mem); } } -const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups) { - const auto type = get_mkldnn_type(arr.dtype()); - auto tz = mkldnn::memory::dims{0}; - auto format_tag = mkldnn::memory::format_tag::undef; +const dnnl::memory* GetWeights(const NDArray& arr, int num_groups) { + const auto type = get_dnnl_type(arr.dtype()); + auto tz = dnnl::memory::dims{0}; + auto format_tag = dnnl::memory::format_tag::undef; auto engine = CpuEngine::Get()->get_engine(); const int ndim = arr.shape().ndim(); int O = 0, I = 1, H = 2, W = 3; @@ -247,69 +239,67 @@ const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups) { W = 4; } if (ndim == 2) { - tz = mkldnn::memory::dims{arr.shape()[O], arr.shape()[I]}; - format_tag = mkldnn::memory::format_tag::oi; + tz 
= dnnl::memory::dims{arr.shape()[O], arr.shape()[I]}; + format_tag = dnnl::memory::format_tag::oi; } else if (ndim == 3) { - tz = num_groups > 1 ? mkldnn::memory::dims{num_groups, - arr.shape()[O] / num_groups, - arr.shape()[I], - arr.shape()[H]} - : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]}; - format_tag = - num_groups > 1 ? mkldnn::memory::format_tag::goiw : mkldnn::memory::format_tag::oiw; + tz = num_groups > 1 ? dnnl::memory::dims{num_groups, + arr.shape()[O] / num_groups, + arr.shape()[I], + arr.shape()[H]} + : dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]}; + format_tag = num_groups > 1 ? dnnl::memory::format_tag::goiw : dnnl::memory::format_tag::oiw; } else if (ndim == 4) { tz = num_groups > 1 - ? mkldnn::memory::dims{num_groups, - arr.shape()[O] / num_groups, - arr.shape()[I], - arr.shape()[H], - arr.shape()[W]} - : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]}; - format_tag = - num_groups > 1 ? mkldnn::memory::format_tag::goihw : mkldnn::memory::format_tag::oihw; + ? dnnl::memory::dims{num_groups, + arr.shape()[O] / num_groups, + arr.shape()[I], + arr.shape()[H], + arr.shape()[W]} + : dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]}; + format_tag = num_groups > 1 ? dnnl::memory::format_tag::goihw : dnnl::memory::format_tag::oihw; } else if (ndim == 5) { tz = num_groups > 1 - ? mkldnn::memory::dims{num_groups, - arr.shape()[O] / num_groups, - arr.shape()[I], - arr.shape()[D], - arr.shape()[H], - arr.shape()[W]} - : mkldnn::memory::dims{ + ? dnnl::memory::dims{num_groups, + arr.shape()[O] / num_groups, + arr.shape()[I], + arr.shape()[D], + arr.shape()[H], + arr.shape()[W]} + : dnnl::memory::dims{ arr.shape()[O], arr.shape()[I], arr.shape()[D], arr.shape()[H], arr.shape()[W]}; format_tag = - num_groups > 1 ? mkldnn::memory::format_tag::goidhw : mkldnn::memory::format_tag::oidhw; + num_groups > 1 ? dnnl::memory::format_tag::goidhw : dnnl::memory::format_tag::oidhw; } else { LOG(FATAL) << "The weight array has an unsupported number of dimensions"; } - const auto md = mkldnn::memory::desc{tz, type, format_tag}; - return arr.GetMKLDNNData(md); + const auto md = dnnl::memory::desc{tz, type, format_tag}; + return arr.GetDNNLData(md); } -const mkldnn::memory* GetWeights(const NDArray& arr, - const mkldnn::memory::desc& target_desc, - int num_groups) { - const mkldnn::memory* mem = arr.GetMKLDNNData(target_desc); +const dnnl::memory* GetWeights(const NDArray& arr, + const dnnl::memory::desc& target_desc, + int num_groups) { + const dnnl::memory* mem = arr.GetDNNLData(target_desc); // If the weight array already uses the target layout, simply return it directly. if (mem) return mem; mem = GetWeights(arr, num_groups); if (mem == nullptr) - mem = arr.GetMKLDNNDataReorder(target_desc); + mem = arr.GetDNNLDataReorder(target_desc); if (mem->get_desc() == target_desc) return mem; auto ret = TmpMemMgr::Get()->Alloc(target_desc); - std::unordered_map args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}}); - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args); + std::unordered_map args({{DNNL_ARG_FROM, *mem}, {DNNL_ARG_TO, *ret}}); + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(*mem, *ret), args); return ret; } // default: block and dims' stride increase monotonically -// mkldnn: 1.winograd 2.rnn packed 3. block and dims'stride is not increase monotonically -bool IsMKLDNN(const mkldnn::memory::desc& desc) { +// dnnl: 1.winograd 2.rnn packed 3. 
block and dims'stride is not increase monotonically +bool IsDNNL(const dnnl::memory::desc& desc) { bool rslt = true; - if (desc.data.format_kind == mkldnn_blocked) { + if (desc.data.format_kind == dnnl_blocked) { if (desc.data.format_desc.blocking.inner_nblks == 0) { int i = 0; for (i = 0; i < desc.data.ndims - 1; i++) { @@ -326,33 +316,33 @@ bool IsMKLDNN(const mkldnn::memory::desc& desc) { return rslt; } -mkldnn_format_tag_t GetDefaultFormat(int num_dims) { +dnnl_format_tag_t GetDefaultFormat(int num_dims) { switch (num_dims) { case 1: - return mkldnn_a; + return dnnl_a; case 2: - return mkldnn_ab; + return dnnl_ab; case 3: - return mkldnn_abc; + return dnnl_abc; case 4: - return mkldnn_abcd; + return dnnl_abcd; case 5: - return mkldnn_abcde; + return dnnl_abcde; case 6: - return mkldnn_abcdef; + return dnnl_abcdef; default: - LOG(FATAL) << "Not implemented dimension (" << num_dims << ") for MKLDNN"; - return mkldnn_format_tag_undef; + LOG(FATAL) << "Not implemented dimension (" << num_dims << ") for DNNL"; + return dnnl_format_tag_undef; } } -mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& desc) { +dnnl_format_tag_t GetDefaultFormat(const dnnl::memory::desc& desc) { return GetDefaultFormat(desc.data.ndims); } -bool IsDefaultFormat(const mkldnn::memory::desc& desc) { +bool IsDefaultFormat(const dnnl::memory::desc& desc) { bool rslt = false; - if (desc.data.format_kind == mkldnn_blocked) { + if (desc.data.format_kind == dnnl_blocked) { if (desc.data.format_desc.blocking.inner_nblks == 0) { int i = 0; for (i = 0; i < desc.data.ndims - 1; i++) { @@ -369,22 +359,22 @@ bool IsDefaultFormat(const mkldnn::memory::desc& desc) { return rslt; } -mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& desc, const mkldnn_format_tag_t& format) { - mkldnn::memory::dims dims(desc.data.ndims); +dnnl::memory::desc GetDesc(const dnnl::memory::desc& desc, const dnnl_format_tag_t& format) { + dnnl::memory::dims dims(desc.data.ndims); for (size_t i = 0; i < dims.size(); i++) dims[i] = desc.data.dims[i]; - mkldnn::memory::format_tag cpp_format = static_cast(format); - mkldnn::memory::data_type cpp_type = static_cast(desc.data.data_type); - mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); - return mkldnn::memory::desc(dims, cpp_type, cpp_format); + dnnl::memory::format_tag cpp_format = static_cast(format); + dnnl::memory::data_type cpp_type = static_cast(desc.data.data_type); + dnnl::memory::desc data_md(dims, cpp_type, cpp_format); + return dnnl::memory::desc(dims, cpp_type, cpp_format); } -// reorder mkldnn src to dst format dtype -void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst) { - mkldnn::stream s(CpuEngine::Get()->get_engine()); +// reorder dnnl src to dst format dtype +void ReorderTo(const dnnl::memory* src, const dnnl::memory* dst) { + dnnl::stream s(CpuEngine::Get()->get_engine()); auto new_src = *src; auto new_dst = *dst; - mkldnn::reorder(new_src, new_dst).execute(s, new_src, new_dst); + dnnl::reorder(new_src, new_dst).execute(s, new_src, new_dst); } template @@ -415,7 +405,7 @@ void FallBackCompute(Compute fn, in_blobs[i] = in_bufs.back().data(); } } - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); std::vector out_blobs(outputs.size()); std::vector temp_src, temp_dst; @@ -432,14 +422,14 @@ void FallBackCompute(Compute fn, new_req[i] = kWriteTo; } } else { - // ensure output does not use mkldnn mem. + // ensure output does not use dnnl mem. // for inplace, we already converted & copied input above. 
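The GetDefaultFormat switch above maps a tensor rank to the plain row-major tag (dnnl_a through dnnl_abcdef). A standalone sketch of the same mapping with the C++ enum, assuming <dnnl.hpp>; the helper name and sample dims are illustrative:

#include <dnnl.hpp>

// Mirrors the GetDefaultFormat(int num_dims) switch above, using the C++ format tags.
static dnnl::memory::format_tag PlainTag(int ndims) {
  switch (ndims) {
    case 1: return dnnl::memory::format_tag::a;
    case 2: return dnnl::memory::format_tag::ab;
    case 3: return dnnl::memory::format_tag::abc;
    case 4: return dnnl::memory::format_tag::abcd;
    case 5: return dnnl::memory::format_tag::abcde;
    case 6: return dnnl::memory::format_tag::abcdef;
    default: return dnnl::memory::format_tag::undef;
  }
}

int main() {
  dnnl::memory::dims dims = {2, 3, 4, 5};
  dnnl::memory::desc plain_md(
      dims, dnnl::memory::data_type::f32, PlainTag(static_cast<int>(dims.size())));

  // A descriptor built from the plain tag has no extra blocking, which is the kind of
  // layout IsDefaultFormat() above accepts. nchw is an alias of abcd, so these compare equal.
  dnnl::memory::desc nchw_md(dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nchw);
  return plain_md == nchw_md ? 0 : 1;
}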
if ((req[i] == kWriteTo) || (req[i] == kWriteInplace)) { - const_cast(output).InvalidateMKLDNNData(); + const_cast(output).InvalidateDNNLData(); if (req[i] == kWriteInplace) { new_req[i] = kWriteTo; } - } else if (req[i] == kAddTo && output.IsMKLDNNData()) { + } else if (req[i] == kAddTo && output.IsDNNLData()) { NDArray temp = outputs[i].Reorder2Default(); temp_src.emplace_back(temp); temp_dst.emplace_back(outputs[i]); @@ -452,11 +442,11 @@ void FallBackCompute(Compute fn, fn(attrs_states, ctx, in_blobs, new_req, out_blobs); for (size_t i = 0, bf16_pos = 0; i < out_blobs.size(); i++) { if (outputs[i].dtype() == mshadow::kBfloat16) { - auto src_mem = temp_bf16_src[bf16_pos].GetMKLDNNData(); - auto dst_mem = temp_bf16_dst[bf16_pos].GetMKLDNNData(); + auto src_mem = temp_bf16_src[bf16_pos].GetDNNLData(); + auto dst_mem = temp_bf16_dst[bf16_pos].GetDNNLData(); bf16_pos++; ReorderTo(src_mem, dst_mem); - } else if (req[i] == kAddTo && outputs[i].IsMKLDNNData()) { + } else if (req[i] == kAddTo && outputs[i].IsDNNLData()) { mxnet::common::CastNonDefaultStorage(temp_src, temp_dst, ctx, false); } } @@ -479,28 +469,28 @@ static bool SimilarArray(const mxnet::NDArray& arr1, if (arr1.shape().Size() != arr2.shape().Size()) return false; - // This function should be used outside an MKLDNN operator. + // This function should be used outside an DNNL operator. // There shouldn't be any operators in the stream. - CHECK(!MKLDNNStream::Get()->HasOps()); + CHECK(!DNNLStream::Get()->HasOps()); // We need to reorder data in the arrays to the default layout. // But we shouldn't reorder data in the original array. NDArray buf1, buf2; - if (arr1.IsMKLDNNData()) { + if (arr1.IsDNNLData()) { buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype()); - auto mem = arr1.GetMKLDNNData(); + auto mem = arr1.GetDNNLData(); buf1.CopyFrom(*mem); } - if (arr2.IsMKLDNNData()) { + if (arr2.IsDNNLData()) { buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype()); - auto mem = arr2.GetMKLDNNData(); + auto mem = arr2.GetDNNLData(); buf2.CopyFrom(*mem); } - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); DType* data1 = - reinterpret_cast(arr1.IsMKLDNNData() ? buf1.data().dptr_ : arr1.data().dptr_); + reinterpret_cast(arr1.IsDNNLData() ? buf1.data().dptr_ : arr1.data().dptr_); DType* data2 = - reinterpret_cast(arr2.IsMKLDNNData() ? buf2.data().dptr_ : arr2.data().dptr_); + reinterpret_cast(arr2.IsDNNLData() ? 
buf2.data().dptr_ : arr2.data().dptr_); std::atomic success(true); #pragma omp parallel for #ifdef _MSC_VER @@ -543,23 +533,23 @@ template void FallBackCompute(void (*)(OpStatePtr const&, void OpCheck::Init(const std::vector& inputs_, const std::vector& outputs_) { auto ctx = inputs_[0].ctx(); - CHECK(!MKLDNNStream::Get()->HasOps()); + CHECK(!DNNLStream::Get()->HasOps()); for (size_t i = 0; i < inputs_.size(); i++) { NDArray data = inputs_[i]; inputs.emplace_back(data.shape(), ctx, false, data.dtype()); - if (data.IsMKLDNNData() && data.IsView()) + if (data.IsDNNLData() && data.IsView()) data = data.Reorder2Default(); - auto mem = data.GetMKLDNNData(); + auto mem = data.GetDNNLData(); inputs[i].CopyFrom(*mem); } for (size_t i = 0; i < outputs_.size(); i++) { outputs.emplace_back(outputs_[i].shape(), ctx, false, outputs_[i].dtype()); if (backward) { - auto mem = outputs_[i].GetMKLDNNData(); + auto mem = outputs_[i].GetDNNLData(); outputs[i].CopyFrom(*mem); } } - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } void OpCheck::Run(mxnet::FCompute fn, @@ -568,9 +558,9 @@ void OpCheck::Run(mxnet::FCompute fn, const std::vector& inputs_, const std::vector& req, const std::vector& outputs_) { - static auto& is_excluded = Op::GetAttr("TExcludeMKLDNNDebug"); + static auto& is_excluded = Op::GetAttr("TExcludeDNNLDebug"); if (is_excluded.get(attrs.op, false)) { - LOG(WARNING) << attrs.op->name << " not checked. TExcludeMKLDNNDebug flag present"; + LOG(WARNING) << attrs.op->name << " not checked. TExcludeDNNLDebug flag present"; return; } std::vector in_blobs(inputs.size()); @@ -601,30 +591,30 @@ void OpCheck::Run(mxnet::FCompute fn, void OpCheck::CopyResult(const std::vector& outputs_, const std::vector& indice) { - CHECK(!MKLDNNStream::Get()->HasOps()); + CHECK(!DNNLStream::Get()->HasOps()); auto non_const_outputs_ = const_cast&>(outputs_); for (auto i = indice.begin(); i != indice.end(); ++i) { - auto mem = outputs[*i].GetMKLDNNData(); + auto mem = outputs[*i].GetDNNLData(); non_const_outputs_[*i].CopyFrom(*mem); } - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } -bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - bool support_mkldnn, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { +bool DNNLStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + bool support_dnnl, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { for (int& v : *in_attrs) if (v == -1) v = kDefaultStorage; DispatchMode wanted_mode; #if MXNET_USE_ONEDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet()) + if (dev_mask == mshadow::cpu::kDevMask && !DNNLEnvSet()) wanted_mode = DispatchMode::kFComputeFallback; - else if (dev_mask == mshadow::cpu::kDevMask && support_mkldnn) + else if (dev_mask == mshadow::cpu::kDevMask && support_dnnl) wanted_mode = DispatchMode::kFComputeEx; else #endif @@ -641,11 +631,11 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } -inline static const std::vector GetMKLDNNInputArray(const std::vector& inputs) { +inline static const std::vector GetDNNLInputArray(const std::vector& inputs) { std::vector ret; ret.reserve(inputs.size()); for (const auto& in : inputs) { - if (in.IsView() && in.IsMKLDNNData()) { + if (in.IsView() && in.IsDNNLData()) { ret.push_back(in.Reorder2Default()); } else { ret.push_back(in); @@ -654,30 +644,30 @@ inline static const std::vector GetMKLDNNInputArray(const std::vector& inputs, - const std::vector& req, - 
const std::vector& outputs) { - if (CheckMKLDNNInputArrayIsView(inputs)) { - const auto mkldnn_inputs = GetMKLDNNInputArray(inputs); - fn(attrs, ctx, mkldnn_inputs, req, outputs); +void DNNLRun(mxnet::FComputeEx fn, + const nnvm::NodeAttrs& attrs, + const mxnet::OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (CheckDNNLInputArrayIsView(inputs)) { + const auto dnnl_inputs = GetDNNLInputArray(inputs); + fn(attrs, ctx, dnnl_inputs, req, outputs); } else { fn(attrs, ctx, inputs, req, outputs); } } -void MKLDNNRun(FComputeExUnary fn, - const nnvm::NodeAttrs& attrs, - const mxnet::OpContext& ctx, - const mxnet::NDArray& input, - const mxnet::OpReqType& req, - const mxnet::NDArray& output) { - auto mkldnn_input = input; - if (input.IsView() && input.IsMKLDNNData()) { - mkldnn_input = input.Reorder2Default(); - fn(attrs, ctx, mkldnn_input, req, output); +void DNNLRun(FComputeExUnary fn, + const nnvm::NodeAttrs& attrs, + const mxnet::OpContext& ctx, + const mxnet::NDArray& input, + const mxnet::OpReqType& req, + const mxnet::NDArray& output) { + auto dnnl_input = input; + if (input.IsView() && input.IsDNNLData()) { + dnnl_input = input.Reorder2Default(); + fn(attrs, ctx, dnnl_input, req, output); } else { fn(attrs, ctx, input, req, output); } diff --git a/src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h b/src/operator/nn/dnnl/dnnl_batch_dot-inl.h similarity index 64% rename from src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h rename to src/operator/nn/dnnl/dnnl_batch_dot-inl.h index 2459ea1a91e4..2c07a32f2153 100644 --- a/src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h +++ b/src/operator/nn/dnnl/dnnl_batch_dot-inl.h @@ -18,12 +18,12 @@ */ /*! - * \file mkldnn_batch_dot-inl.h + * \file dnnl_batch_dot-inl.h * \author: Bartosz Kuncer, bartosz.kuncer@intel.com */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_DOT_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_DOT_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_DOT_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_DOT_INL_H_ #if MXNET_USE_ONEDNN == 1 @@ -31,28 +31,27 @@ #include #include -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" - #include "../../tensor/dot-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" namespace mxnet { namespace op { -using batch_dot_fwd_t = mkldnn::matmul; -using batch_dot_fwd_pd_t = mkldnn::matmul::primitive_desc; +using batch_dot_fwd_t = dnnl::matmul; +using batch_dot_fwd_pd_t = dnnl::matmul::primitive_desc; typedef ParamOpSign BatchDotSignature; -class MKLDNNBatchDotFwd { +class DNNLBatchDotFwd { public: - static MKLDNNBatchDotFwd& GetCached(const DotParam& param, - const std::vector& inputs, - const std::vector& outputs); + static DNNLBatchDotFwd& GetCached(const DotParam& param, + const std::vector& inputs, + const std::vector& outputs); - MKLDNNBatchDotFwd(const DotParam& param, - const std::vector& inputs, - const std::vector& outputs); + DNNLBatchDotFwd(const DotParam& param, + const std::vector& inputs, + const std::vector& outputs); void Execute(const std::vector& inputs, const std::vector& req, @@ -66,4 +65,4 @@ class MKLDNNBatchDotFwd { } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_DOT_INL_H__ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_DOT_INL_H__ diff --git a/src/operator/nn/dnnl/dnnl_batch_dot.cc b/src/operator/nn/dnnl/dnnl_batch_dot.cc new file mode 100644 index 000000000000..bb9f911ee8ec --- /dev/null +++ 
b/src/operator/nn/dnnl/dnnl_batch_dot.cc @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dnnl_batch_dot.cc + * \author: Bartosz Kuncer, bartosz.kuncer@intel.com + */ + +#if MXNET_USE_ONEDNN == 1 + +#include "./dnnl_batch_dot-inl.h" + +namespace mxnet { +namespace op { + +bool SupportDNNLBatchDot(const std::vector& inputs, const NDArray& output) { + return inputs[0].shape().Size() != 0 && inputs[1].shape().Size() != 0 && + output.shape().Size() != 0 && + (inputs[0].dtype() == mshadow::kFloat32 || inputs[0].dtype() == mshadow::kBfloat16); +} + +void DNNLBatchDotForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DotParam& param = nnvm::get(attrs.parsed); + DNNLBatchDotFwd& fwd = DNNLBatchDotFwd::GetCached(param, inputs, outputs); + fwd.Execute(inputs, req, outputs); +} + +DNNLBatchDotFwd& DNNLBatchDotFwd::GetCached(const DotParam& param, + const std::vector& inputs, + const std::vector& outputs) { + using batch_dot_fwd_map = std::unordered_map; +#if DMLC_CXX11_THREAD_LOCAL + static thread_local batch_dot_fwd_map fwds; +#else + static MX_THREAD_LOCAL batch_dot_fwd_map fwds; +#endif + + BatchDotSignature key(param); + key.AddSign(inputs[0]); + key.AddSign(inputs[1]); + key.AddSign(outputs[0]); + + auto it = fwds.find(key); + if (it == fwds.end()) { + const DNNLBatchDotFwd fwd(param, inputs, outputs); + it = AddToCache(&fwds, key, fwd); + } + return it->second; +} + +DNNLBatchDotFwd::DNNLBatchDotFwd(const DotParam& param, + const std::vector& inputs, + const std::vector& outputs) { + auto shape = inputs[0].shape(); + auto ndim = shape.ndim(); + auto bigDim = shape[0]; + for (size_t i = 1; i < ndim - 2; ++i) { + bigDim *= shape[i]; + } + + auto GetMemoryDesc = [&ndim, &bigDim](const NDArray& tensor, const bool transpose) { + auto shape = tensor.shape(); + if (transpose) { + return dnnl::memory::desc(dnnl::memory::dims{bigDim, shape[ndim - 1], shape[ndim - 2]}, + get_dnnl_type(tensor.dtype()), + dnnl::memory::format_tag::acb); + } else { + return dnnl::memory::desc(dnnl::memory::dims{bigDim, shape[ndim - 2], shape[ndim - 1]}, + get_dnnl_type(tensor.dtype()), + dnnl::memory::format_tag::any); + } + }; + + dnnl::memory::desc data_md = GetMemoryDesc(inputs[0], param.transpose_a); + dnnl::memory::desc weights_md = GetMemoryDesc(inputs[1], param.transpose_b); + dnnl::memory::desc out_md({bigDim, data_md.dims()[1], weights_md.dims()[2]}, + get_dnnl_type(outputs[0].dtype()), + dnnl::memory::format_tag::any); + dnnl::matmul::desc fwd_desc(data_md, weights_md, out_md); + fwd_pd = std::make_shared(fwd_desc, mxnet::CpuEngine::Get()->get_engine()); + fwd = std::make_shared(*fwd_pd); +} + +void 
DNNLBatchDotFwd::Execute(const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + auto engine = mxnet::CpuEngine::Get()->get_engine(); + auto data = + dnnl::memory(fwd_pd->src_desc(), engine, reinterpret_cast(inputs[0].data().dptr_)); + auto weights = + dnnl::memory(fwd_pd->weights_desc(), engine, reinterpret_cast(inputs[1].data().dptr_)); + dnnl_output_t out_mem = CreateDNNLMem(outputs[0], fwd_pd->dst_desc(), req[0], &inputs[0]); + + dnnl_args_map_t args = { + {DNNL_ARG_SRC, data}, + {DNNL_ARG_WEIGHTS, weights}, + {DNNL_ARG_DST, *out_mem.second}, + }; + + DNNLStream::Get()->RegisterPrimArgs(*fwd, args); + CommitOutput(outputs[0], out_mem); + DNNLStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_ONEDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/dnnl/dnnl_batch_norm-inl.h similarity index 59% rename from src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h rename to src/operator/nn/dnnl/dnnl_batch_norm-inl.h index 2a4b2bfdf5d5..f7dc97b58685 100644 --- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h +++ b/src/operator/nn/dnnl/dnnl_batch_norm-inl.h @@ -18,98 +18,96 @@ */ /*! - * \file mkldnn_batch_norm.cc + * \file dnnl_batch_norm.cc * \brief * \author Tao Lv */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_NORM_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_NORM_INL_H_ #if MXNET_USE_ONEDNN == 1 -#include - +#include #include #include -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" - #include "../batch_norm-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" namespace mxnet { namespace op { -typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc; -typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc; -typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc; -typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc; +typedef dnnl::batch_normalization_forward::primitive_desc t_bn_f_pdesc; +typedef dnnl::batch_normalization_forward::desc t_bn_f_desc; +typedef dnnl::batch_normalization_backward::primitive_desc t_bn_b_pdesc; +typedef dnnl::batch_normalization_backward::desc t_bn_b_desc; -inline static mkldnn::normalization_flags _GetFlags(const std::vector& in_data, - const std::vector& aux_states, - bool is_train_and_not_global_stats, - bool fuse_relu) { - mkldnn::normalization_flags flags = static_cast(0U); +inline static dnnl::normalization_flags _GetFlags(const std::vector& in_data, + const std::vector& aux_states, + bool is_train_and_not_global_stats, + bool fuse_relu) { + dnnl::normalization_flags flags = static_cast(0U); if (in_data.size() == 3U) { - flags |= mkldnn::normalization_flags::use_scale_shift; + flags |= dnnl::normalization_flags::use_scale_shift; } // aux_states[0]: inMean // aux_states[1]: inVariance if (aux_states.size() == 2U && !is_train_and_not_global_stats) { - flags |= mkldnn::normalization_flags::use_global_stats; + flags |= dnnl::normalization_flags::use_global_stats; } if (fuse_relu) { - flags |= mkldnn::normalization_flags::fuse_norm_relu; + flags |= dnnl::normalization_flags::fuse_norm_relu; } return flags; } -inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory& data_mem, +inline static t_bn_f_pdesc _GetFwd(const dnnl::memory& data_mem, bool is_train, float eps, - mkldnn::normalization_flags flags) { + dnnl::normalization_flags flags) { auto data_md = 
data_mem.get_desc(); auto engine = CpuEngine::Get()->get_engine(); if (is_train) { - t_bn_f_desc bnFwd_desc(mkldnn::prop_kind::forward_training, data_md, eps, flags); + t_bn_f_desc bnFwd_desc(dnnl::prop_kind::forward_training, data_md, eps, flags); return t_bn_f_pdesc(bnFwd_desc, engine); } else { - t_bn_f_desc bnFwd_desc(mkldnn::prop_kind::forward_inference, data_md, eps, flags); + t_bn_f_desc bnFwd_desc(dnnl::prop_kind::forward_inference, data_md, eps, flags); return t_bn_f_pdesc(bnFwd_desc, engine); } } -inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory& data_mem, - const mkldnn::memory& diff_mem, +inline static t_bn_b_pdesc _GetBwd(const dnnl::memory& data_mem, + const dnnl::memory& diff_mem, float eps, - mkldnn::normalization_flags flags) { + dnnl::normalization_flags flags) { auto data_md = data_mem.get_desc(); auto diff_md = diff_mem.get_desc(); auto engine = CpuEngine::Get()->get_engine(); - t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags); + t_bn_b_desc bnBwd_desc(dnnl::prop_kind::backward, diff_md, data_md, eps, flags); return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags)); } -typedef ParamOpSign MKLDNNBNSignature; +typedef ParamOpSign DNNLBNSignature; -class MKLDNNBNForward { - std::shared_ptr weight_m; - std::shared_ptr fwd; +class DNNLBNForward { + std::shared_ptr weight_m; + std::shared_ptr fwd; bool is_train_and_not_global_stats; t_bn_f_pdesc pd; public: - MKLDNNBNForward(const t_bn_f_pdesc& _pd, bool is_train_and_not_global_stats) : pd(_pd) { - weight_m.reset(new mkldnn::memory(pd.weights_desc(), CpuEngine::Get()->get_engine())); - fwd.reset(new mkldnn::batch_normalization_forward(pd)); + DNNLBNForward(const t_bn_f_pdesc& _pd, bool is_train_and_not_global_stats) : pd(_pd) { + weight_m.reset(new dnnl::memory(pd.weights_desc(), CpuEngine::Get()->get_engine())); + fwd.reset(new dnnl::batch_normalization_forward(pd)); this->is_train_and_not_global_stats = is_train_and_not_global_stats; } - const mkldnn::memory& GetWeight() const { + const dnnl::memory& GetWeight() const { return *weight_m; } @@ -117,22 +115,22 @@ class MKLDNNBNForward { return pd; } - const mkldnn::batch_normalization_forward& GetFwd() const { + const dnnl::batch_normalization_forward& GetFwd() const { return *fwd; } }; template -static MKLDNNBNForward& GetBNForward(const BatchNormParam& param, - const OpContext& ctx, - const mkldnn::memory* data_mem, - mkldnn::normalization_flags flags) { +static DNNLBNForward& GetBNForward(const BatchNormParam& param, + const OpContext& ctx, + const dnnl::memory* data_mem, + dnnl::normalization_flags flags) { #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map fwds; + static thread_local std::unordered_map fwds; #else - static MX_THREAD_LOCAL std::unordered_map fwds; + static MX_THREAD_LOCAL std::unordered_map fwds; #endif - MKLDNNBNSignature key(param); + DNNLBNSignature key(param); key.AddSign(ctx.is_train); key.AddSign(*data_mem); key.AddSign(static_cast(flags)); @@ -140,19 +138,19 @@ static MKLDNNBNForward& GetBNForward(const BatchNormParam& param, auto it = fwds.find(key); if (it == fwds.end()) { auto fwd_pd = _GetFwd(*data_mem, ctx.is_train, param.eps, flags); - MKLDNNBNForward fwd(fwd_pd, ctx.is_train && !param.use_global_stats); + DNNLBNForward fwd(fwd_pd, ctx.is_train && !param.use_global_stats); it = AddToCache(&fwds, key, fwd); } return it->second; } template -void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const 
std::vector& req, - const std::vector& outputs, - bool fuse_relu) { +void DNNLBatchNormForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + bool fuse_relu) { const BatchNormParam& param = nnvm::get(attrs.parsed); std::vector in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean); @@ -173,27 +171,27 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs, const std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end()); TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); - mkldnn::normalization_flags flags = + dnnl::normalization_flags flags = _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu); NDArray& data = in_data[batchnorm::kData]; - if (data.IsMKLDNNData() && data.IsView()) + if (data.IsDNNLData() && data.IsView()) data = data.Reorder2Default(); - auto data_mem = data.GetMKLDNNData(); + auto data_mem = data.GetDNNLData(); auto& fwd = GetBNForward(param, ctx, data_mem, flags); // for output memory - auto out_mem = const_cast(out).CreateMKLDNNData(fwd.GetPd().dst_desc()); + auto out_mem = const_cast(out).CreateDNNLData(fwd.GetPd().dst_desc()); // mxnet will always use scale shift. // But if fix_gamma is true, then all scale elements will be set to 1.0f - if (static_cast(flags) & static_cast(mkldnn::normalization_flags::use_scale_shift)) { + if (static_cast(flags) & static_cast(dnnl::normalization_flags::use_scale_shift)) { const NDArray& gamma = in_data[batchnorm::kGamma]; const NDArray& beta = in_data[batchnorm::kBeta]; CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage); CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage); - const mkldnn::memory& weight_mem = fwd.GetWeight(); - float* weight_buf = reinterpret_cast(weight_mem.get_data_handle()); + const dnnl::memory& weight_mem = fwd.GetWeight(); + float* weight_buf = reinterpret_cast(weight_mem.get_data_handle()); index_t channels_ = data.shape()[1]; CHECK(weight_mem.get_desc().get_size() == channels_ * sizeof(float) * 2); @@ -216,20 +214,20 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs, } } - mkldnn_args_map_t net_args; - net_args[MKLDNN_ARG_SRC] = *data_mem; - net_args[MKLDNN_ARG_SCALE_SHIFT] = weight_mem; - net_args[MKLDNN_ARG_DST] = *out_mem; + dnnl_args_map_t net_args; + net_args[DNNL_ARG_SRC] = *data_mem; + net_args[DNNL_ARG_SCALE_SHIFT] = weight_mem; + net_args[DNNL_ARG_DST] = *out_mem; if (fuse_relu) { const NDArray* workspace = nullptr; workspace = &outputs[3]; auto engine = CpuEngine::Get()->get_engine(); if (workspace == nullptr) { - LOG(FATAL) << "MKLDNN BatchNorm: incorrect workspace input"; + LOG(FATAL) << "DNNL BatchNorm: incorrect workspace input"; } - auto ws = std::make_shared( - fwd.GetPd().workspace_desc(), engine, workspace->GetMKLDNNData()->get_data_handle()); - net_args[MKLDNN_ARG_WORKSPACE] = *ws; + auto ws = std::make_shared( + fwd.GetPd().workspace_desc(), engine, workspace->GetDNNLData()->get_data_handle()); + net_args[DNNL_ARG_WORKSPACE] = *ws; } if (!ctx.is_train || param.use_global_stats) { float* omean = outputs[batchnorm::kMean].data().dptr(); @@ -241,17 +239,17 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs, omean[i] = inmean[i]; ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps); } - net_args[MKLDNN_ARG_MEAN] = *(aux_states[batchnorm::kMovingMean].GetMKLDNNData()); - net_args[MKLDNN_ARG_VARIANCE] = *(aux_states[batchnorm::kMovingVar].GetMKLDNNData()); - 
MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); - MKLDNNStream::Get()->Submit(); + net_args[DNNL_ARG_MEAN] = *(aux_states[batchnorm::kMovingMean].GetDNNLData()); + net_args[DNNL_ARG_VARIANCE] = *(aux_states[batchnorm::kMovingVar].GetDNNLData()); + DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); + DNNLStream::Get()->Submit(); } else { // training - const NDArray& outMean = outputs[batchnorm::kMean]; - const NDArray& outVar = outputs[batchnorm::kVar]; - net_args[MKLDNN_ARG_MEAN] = *(outMean.GetMKLDNNData()); - net_args[MKLDNN_ARG_VARIANCE] = *(outVar.GetMKLDNNData()); - MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); - MKLDNNStream::Get()->Submit(); + const NDArray& outMean = outputs[batchnorm::kMean]; + const NDArray& outVar = outputs[batchnorm::kVar]; + net_args[DNNL_ARG_MEAN] = *(outMean.GetDNNLData()); + net_args[DNNL_ARG_VARIANCE] = *(outVar.GetDNNLData()); + DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); + DNNLStream::Get()->Submit(); float* ovar = outVar.data().dptr(); for (index_t i = 0; i < channels_; i++) { @@ -259,52 +257,52 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs, } } } else { // no input gamma and beta - LOG(FATAL) << "MKLDNN batch normalization: should not reach here ..."; + LOG(FATAL) << "DNNL batch normalization: should not reach here ..."; } } -class MKLDNNBNBackward { - std::shared_ptr bwd; - const std::shared_ptr weight_m; - const std::shared_ptr gradw_m; +class DNNLBNBackward { + std::shared_ptr bwd; + const std::shared_ptr weight_m; + const std::shared_ptr gradw_m; public: const t_bn_b_pdesc pd; - explicit MKLDNNBNBackward(const t_bn_b_pdesc& _pd) - : weight_m(new mkldnn::memory(_pd.weights_desc(), CpuEngine::Get()->get_engine())), - gradw_m(new mkldnn::memory(_pd.diff_weights_desc(), CpuEngine::Get()->get_engine())), + explicit DNNLBNBackward(const t_bn_b_pdesc& _pd) + : weight_m(new dnnl::memory(_pd.weights_desc(), CpuEngine::Get()->get_engine())), + gradw_m(new dnnl::memory(_pd.diff_weights_desc(), CpuEngine::Get()->get_engine())), pd(_pd) { - bwd.reset(new mkldnn::batch_normalization_backward(pd)); + bwd.reset(new dnnl::batch_normalization_backward(pd)); } - const mkldnn::memory& GetWeight() const { + const dnnl::memory& GetWeight() const { return *weight_m; } - const mkldnn::memory& GetGradw() const { + const dnnl::memory& GetGradw() const { return *gradw_m; } - const mkldnn::batch_normalization_backward& GetBwd() const { + const dnnl::batch_normalization_backward& GetBwd() const { return *bwd; } }; template -static MKLDNNBNBackward& GetBNBackward(const BatchNormParam& param, - const OpContext& ctx, - const NDArray& in_data, - const mkldnn::memory& in_mem, - const NDArray& diff_data, - const mkldnn::memory& diff_mem, - mkldnn::normalization_flags flags) { +static DNNLBNBackward& GetBNBackward(const BatchNormParam& param, + const OpContext& ctx, + const NDArray& in_data, + const dnnl::memory& in_mem, + const NDArray& diff_data, + const dnnl::memory& diff_mem, + dnnl::normalization_flags flags) { #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map bwds; + static thread_local std::unordered_map bwds; #else - static MX_THREAD_LOCAL std::unordered_map bwds; + static MX_THREAD_LOCAL std::unordered_map bwds; #endif - MKLDNNBNSignature key(param); + DNNLBNSignature key(param); key.AddSign(in_data); key.AddSign(diff_data); key.AddSign(static_cast(flags)); @@ -312,19 +310,19 @@ static MKLDNNBNBackward& GetBNBackward(const BatchNormParam& param, auto it = bwds.find(key); if (it == 
bwds.end()) { auto bwd_pd = _GetBwd(in_mem, diff_mem, param.eps, flags); - MKLDNNBNBackward bwd(bwd_pd); + DNNLBNBackward bwd(bwd_pd); it = AddToCache(&bwds, key, bwd); } return it->second; } template -void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs, - bool fuse_relu) { +void DNNLBatchNormBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + bool fuse_relu) { if (fuse_relu) { CHECK_EQ(inputs.size(), 9U); } else { @@ -345,7 +343,7 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs, aux_states[batchnorm::kMovingVar] = inputs[7]; const std::vector& in_grad = outputs; TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); - mkldnn::normalization_flags flags = + dnnl::normalization_flags flags = _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu); NDArray data = in_data[batchnorm::kData]; @@ -376,19 +374,19 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs, gradIn = gradIn.Reshape(new_shape); } - auto data_mem = data.GetMKLDNNData(); - auto diff_mem = diff.GetMKLDNNData(); - // MKLDNN batchnorm should run on special layouts. If one of them isn't, we + auto data_mem = data.GetDNNLData(); + auto diff_mem = diff.GetDNNLData(); + // DNNL batchnorm should run on special layouts. If one of them isn't, we // should reorder them. if (data.IsDefaultData()) - data_mem = data.GetMKLDNNDataReorder(diff_mem->get_desc()); + data_mem = data.GetDNNLDataReorder(diff_mem->get_desc()); else if (diff.IsDefaultData()) - diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_desc()); + diff_mem = diff.GetDNNLDataReorder(data_mem->get_desc()); auto& bwd = GetBNBackward(param, ctx, data, *data_mem, diff, *diff_mem, flags); auto gradi_mem = - CreateMKLDNNMem(const_cast(gradIn), bwd.pd.diff_src_desc(), req[batchnorm::kData]); + CreateDNNLMem(const_cast(gradIn), bwd.pd.diff_src_desc(), req[batchnorm::kData]); - if (static_cast(flags) & static_cast(mkldnn::normalization_flags::use_scale_shift)) { + if (static_cast(flags) & static_cast(dnnl::normalization_flags::use_scale_shift)) { const NDArray& gamma = in_data[batchnorm::kGamma]; const NDArray& beta = in_data[batchnorm::kBeta]; DType* weight_buf = reinterpret_cast(bwd.GetWeight().get_data_handle()); @@ -405,18 +403,18 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs, } memcpy(&weight_buf[channels_], bias_ptr, copy_size); } - mkldnn_args_map_t net_args; - net_args[MKLDNN_ARG_SRC] = *data_mem; - net_args[MKLDNN_ARG_DIFF_SRC] = *gradi_mem.second; - net_args[MKLDNN_ARG_SCALE_SHIFT] = bwd.GetWeight(); - net_args[MKLDNN_ARG_DIFF_SCALE_SHIFT] = bwd.GetGradw(); - net_args[MKLDNN_ARG_DIFF_DST] = *diff_mem; + dnnl_args_map_t net_args; + net_args[DNNL_ARG_SRC] = *data_mem; + net_args[DNNL_ARG_DIFF_SRC] = *gradi_mem.second; + net_args[DNNL_ARG_SCALE_SHIFT] = bwd.GetWeight(); + net_args[DNNL_ARG_DIFF_SCALE_SHIFT] = bwd.GetGradw(); + net_args[DNNL_ARG_DIFF_DST] = *diff_mem; if (fuse_relu) { const NDArray* workspace = nullptr; workspace = &inputs[8]; if (workspace != nullptr) { - net_args[MKLDNN_ARG_WORKSPACE] = *(workspace->GetMKLDNNData()); + net_args[DNNL_ARG_WORKSPACE] = *(workspace->GetDNNLData()); } } @@ -426,7 +424,7 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs, DType* moving_var_ptr = moving_var.data().dptr(); DType* out_mean_ptr = out_mean.data().dptr(); DType* out_var_ptr = 
out_var.data().dptr(); - mkldnn::memory var_mem(bwd.pd.variance_desc(), CpuEngine::Get()->get_engine()); + dnnl::memory var_mem(bwd.pd.variance_desc(), CpuEngine::Get()->get_engine()); DType* tmp_var_ptr = reinterpret_cast(var_mem.get_data_handle()); DType minus_mom = (1.0f - param.momentum); @@ -436,15 +434,15 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs, tmp_var_ptr[i] = variance; moving_var_ptr[i] = moving_var_ptr[i] * param.momentum + variance * minus_mom; } - net_args[MKLDNN_ARG_MEAN] = *(out_mean.GetMKLDNNData()); - net_args[MKLDNN_ARG_VARIANCE] = var_mem; + net_args[DNNL_ARG_MEAN] = *(out_mean.GetDNNLData()); + net_args[DNNL_ARG_VARIANCE] = var_mem; } else { - net_args[MKLDNN_ARG_MEAN] = *(moving_mean.GetMKLDNNData()); - net_args[MKLDNN_ARG_VARIANCE] = *(moving_var.GetMKLDNNData()); + net_args[DNNL_ARG_MEAN] = *(moving_mean.GetDNNLData()); + net_args[DNNL_ARG_VARIANCE] = *(moving_var.GetDNNLData()); } - MKLDNNStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args); + DNNLStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args); CommitOutput(gradIn, gradi_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); // copy data from gradw_mem to in_grad[1] and in_grad[2] DType* gw_buf = reinterpret_cast(bwd.GetGradw().get_data_handle()); @@ -480,10 +478,10 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs, } } } else { - LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ..."; + LOG(FATAL) << "DNNL batch normalization backward: should not reach here ..."; } } } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_NORM_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_concat-inl.h b/src/operator/nn/dnnl/dnnl_concat-inl.h similarity index 58% rename from src/operator/nn/mkldnn/mkldnn_concat-inl.h rename to src/operator/nn/dnnl/dnnl_concat-inl.h index a78b5a65c797..4646137aa6d4 100644 --- a/src/operator/nn/mkldnn/mkldnn_concat-inl.h +++ b/src/operator/nn/dnnl/dnnl_concat-inl.h @@ -18,46 +18,45 @@ */ /*! 
- * \file mkldnn_concat-inl.h + * \file dnnl_concat-inl.h * \brief * \author */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_CONCAT_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_CONCAT_INL_H_ #if MXNET_USE_ONEDNN == 1 #include #include -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" - #include "../concat-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" namespace mxnet { namespace op { -class MKLDNNConcatFwd { +class DNNLConcatFwd { public: - mkldnn::concat::primitive_desc fwd_pd; + dnnl::concat::primitive_desc fwd_pd; - MKLDNNConcatFwd(int concat_dim, const std::vector& data_md); + DNNLConcatFwd(int concat_dim, const std::vector& data_md); - const mkldnn::concat& GetFwd() const { + const dnnl::concat& GetFwd() const { return *fwd_; } private: - std::shared_ptr fwd_; + std::shared_ptr fwd_; }; -static MKLDNNConcatFwd& GetConcatForward(int concat_dim, - const std::vector& in_data, - const std::vector& data_md) { +static DNNLConcatFwd& GetConcatForward(int concat_dim, + const std::vector& in_data, + const std::vector& data_md) { #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map fwds; + static thread_local std::unordered_map fwds; #else - static MX_THREAD_LOCAL std::unordered_map fwds; + static MX_THREAD_LOCAL std::unordered_map fwds; #endif OpSignature key; key.AddSign(concat_dim); @@ -65,7 +64,7 @@ static MKLDNNConcatFwd& GetConcatForward(int concat_dim, auto it = fwds.find(key); if (it == fwds.end()) { - MKLDNNConcatFwd fwd(concat_dim, data_md); + DNNLConcatFwd fwd(concat_dim, data_md); it = AddToCache(&fwds, key, fwd); } return it->second; @@ -75,4 +74,4 @@ static MKLDNNConcatFwd& GetConcatForward(int concat_dim, } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_CONCAT_INL_H_ diff --git a/src/operator/nn/dnnl/dnnl_concat.cc b/src/operator/nn/dnnl/dnnl_concat.cc new file mode 100644 index 000000000000..1214a31805c9 --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_concat.cc @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file dnnl_concat.cc + * \brief + * \author + */ + +#if MXNET_USE_ONEDNN == 1 +#include "dnnl_concat-inl.h" + +namespace mxnet { +namespace op { + +static inline bool IsUsingPadding(const dnnl::memory::desc& dst_md) { + // make sure a blocked format is used (at least one dimension is blocked) + bool is_blocked_format = + dst_md.data.format_kind == dnnl_blocked && dst_md.data.format_desc.blocking.inner_nblks > 0; + return is_blocked_format && + !std::equal( + dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims, dst_md.data.padded_dims); +} + +DNNLConcatFwd::DNNLConcatFwd(int concat_dim, const std::vector& data_md) + : fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) { + // DNNL introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. Currently, DNNL operators + // still reuse memory from memory planning, so here we need to select a + // format that has the expected memory size requirements (a plain format) + + // When fwd_pd uses padding, impose a plain format + const auto& dst_md = fwd_pd.dst_desc(); + if (IsUsingPadding(dst_md)) { + auto plain_dst_tag = static_cast(GetDefaultFormat(dst_md.data.ndims)); + auto plain_dst_md = dnnl::memory::desc(dst_md.dims(), dst_md.data_type(), plain_dst_tag); + fwd_pd = dnnl::concat::primitive_desc( + plain_dst_md, concat_dim, data_md, CpuEngine::Get()->get_engine()); + } + fwd_ = std::make_shared(fwd_pd); +} + +void DNNLConcatForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); + const ConcatParam& param = nnvm::get(attrs.parsed); + const int num_in_data = param.num_args; + const int concat_dim = param.dim; + std::vector data_md; + std::vector data_mem; + data_md.reserve(num_in_data); + data_mem.reserve(num_in_data); + for (int i = 0; i < num_in_data; i++) { + const dnnl::memory* tmp_mem = in_data[i].GetDNNLData(); + dnnl::memory::desc tmp_md = tmp_mem->get_desc(); + data_md.push_back(tmp_md); + data_mem.push_back(tmp_mem); + } + DNNLConcatFwd& fwd = GetConcatForward(concat_dim, in_data, data_md); + mxnet::dnnl_output_t out_mem = + CreateDNNLMem(out_data[concat_enum::kOut], fwd.fwd_pd.dst_desc(), req[concat_enum::kOut]); + std::unordered_map net_args; + net_args.insert({DNNL_ARG_DST, *out_mem.second}); + for (int i = 0; i < num_in_data; i++) { + net_args.insert({DNNL_ARG_MULTIPLE_SRC + i, *data_mem[i]}); + } + DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); + CommitOutput(out_data[concat_enum::kOut], out_mem); + DNNLStream::Get()->Submit(); +} + +void DNNLConcatBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); + const ConcatParam& param = nnvm::get(attrs.parsed); + const int num_in_data = param.num_args; + const int axis = param.dim; + const auto gradz_mem = inputs[0].GetDNNLData(); + /* init the offset */ + dnnl::memory::dims offsets(outputs[0].shape().ndim()); + for (auto& v : offsets) { + v = 0; + } + + for (int i = 0; i < num_in_data; i++) { + dnnl::memory::dims diff_src_tz(outputs[i].shape().begin(), outputs[i].shape().end()); + auto diff_src_md = outputs[i].GetDNNLData()->get_desc(); + auto gradi_mem = CreateDNNLMem(outputs[i], diff_src_md, req[i]); + + auto from_md = gradz_mem->get_desc().submemory_desc(diff_src_tz, offsets); + auto 
from_mem = + new dnnl::memory(from_md, gradz_mem->get_engine(), gradz_mem->get_data_handle()); + offsets[axis] += diff_src_tz[axis]; + + std::unordered_map net_args( + {{DNNL_ARG_FROM, *gradz_mem}, {DNNL_ARG_TO, *gradi_mem.second}}); + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(*from_mem, *gradi_mem.second), net_args); + CommitOutput(outputs[i], gradi_mem); + } + + DNNLStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_ONEDNN == 1 diff --git a/src/operator/nn/dnnl/dnnl_convolution-inl.h b/src/operator/nn/dnnl/dnnl_convolution-inl.h new file mode 100644 index 000000000000..529b6c3caa50 --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_convolution-inl.h @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dnnl_convolution-inl.h + * \brief + */ + +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_CONVOLUTION_INL_H_ + +#if MXNET_USE_ONEDNN == 1 + +#include +#include + +#include "../convolution-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" + +namespace mxnet { +namespace op { + +struct DNNLConvParam : public dmlc::Parameter { + bool with_bn; + bool with_act; + bool with_sum; + bool with_postsum_act; + bool quantized; + bool dedup_sum; + + dmlc::optional min_calib_range; // min float value calculated from calibration dataset + dmlc::optional max_calib_range; // max float value calculated from calibration dataset + + DMLC_DECLARE_PARAMETER(DNNLConvParam) { + DMLC_DECLARE_FIELD(with_bn).set_default(false).describe("Add post batchnorm."); + DMLC_DECLARE_FIELD(with_act).set_default(false).describe("Add post activation"); + DMLC_DECLARE_FIELD(with_sum).set_default(false).describe("Add post sum"); + DMLC_DECLARE_FIELD(with_postsum_act) + .set_default(false) + .describe("Add post activation after sum"); + DMLC_DECLARE_FIELD(quantized).set_default(false).describe("enable quantization"); + DMLC_DECLARE_FIELD(dedup_sum).set_default(false).describe("deduplicated sum input"); + DMLC_DECLARE_FIELD(min_calib_range) + .set_default(dmlc::optional()) + .describe( + "The minimum scalar value in the form of float32 obtained " + "through calibration. If present, it will be used to by " + "quantized convolution op to calculate primitive scale"); + DMLC_DECLARE_FIELD(max_calib_range) + .set_default(dmlc::optional()) + .describe( + "The maximum scalar value in the form of float32 obtained " + "through calibration. 
If present, it will be used to by " + "quantized convolution op to calculate primitive scale"); + } +}; + +struct DNNLConvFullParam { + ConvolutionParam conv_param; + DNNLConvParam dnnl_param; + float sum_scale = 1.f; + std::vector requantize_scales; + DNNLPostEltwiseParam act_param; + DNNLPostEltwiseParam postsum_act_param; +}; + +std::shared_ptr GetConvFwdImpl( + const ConvolutionParam& param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const NDArray& output); + +class DNNLConvForward { + public: + DNNLConvForward(const DNNLConvFullParam& param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const NDArray& output); + + const dnnl::convolution_forward& GetFwd() const { + return *fwd_; + } + + const dnnl::convolution_forward::primitive_desc& GetPd() const { + return *pd_; + } + + private: + std::shared_ptr fwd_; + std::shared_ptr pd_; +}; + +typedef ParamOpSign DNNLConvSignature; + +DNNLConvForward& GetConvFwd(const DNNLConvFullParam& param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const NDArray& output); + +void DNNLConvolutionForwardFullFeature(const DNNLConvFullParam& param, + const OpContext& ctx, + DNNLConvForward* fwd, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); + +void DNNLConvolutionForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); + +class DNNLConvBackward { + public: + DNNLConvBackward(const DNNLConvFullParam& param, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const NDArray& output); + + const dnnl::convolution_backward_data& GetBwdData() const { + return *bwd_data_; + } + + const dnnl::convolution_backward_weights& GetBwdWeights() const { + return *bwd_weight_; + } + + const dnnl::convolution_backward_data::primitive_desc& GetDataPd() const { + return *bwd_data_pd_; + } + + const dnnl::convolution_backward_weights::primitive_desc& GetWeightsPd() const { + return *bwd_weight_pd_; + } + + private: + std::shared_ptr bwd_data_pd_; + std::shared_ptr bwd_weight_pd_; + std::shared_ptr bwd_data_; + std::shared_ptr bwd_weight_; +}; + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_ONEDNN == 1 +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_CONVOLUTION_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/dnnl/dnnl_convolution.cc similarity index 54% rename from src/operator/nn/mkldnn/mkldnn_convolution.cc rename to src/operator/nn/dnnl/dnnl_convolution.cc index ef2c57e4a9b8..9754f7fa4505 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/dnnl/dnnl_convolution.cc @@ -18,50 +18,47 @@ */ /*! 
- * \file mkldnn_convolution.cc + * \file dnnl_convolution.cc * \brief * \author Da Zheng */ #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn_base-inl.h" -#include "./mkldnn_convolution-inl.h" -#include "./mkldnn_ops-inl.h" - #include "../convolution-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_convolution-inl.h" +#include "./dnnl_ops-inl.h" namespace mxnet { namespace op { -DMLC_REGISTER_PARAMETER(MKLDNNConvParam); +DMLC_REGISTER_PARAMETER(DNNLConvParam); -bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input) { +bool SupportDNNLConv(const ConvolutionParam& params, const NDArray& input) { if (params.kernel.ndim() > 3 || params.kernel.ndim() == 0) return false; - return IsMKLDNNType(input.dtype()) && - input.shape().ndim() >= 3 && input.shape().ndim() <= 5; + return IsDNNLType(input.dtype()) && input.shape().ndim() >= 3 && input.shape().ndim() <= 5; } -std::shared_ptr GetConvFwdImpl( - const MKLDNNConvFullParam& param, +std::shared_ptr GetConvFwdImpl( + const DNNLConvFullParam& param, const bool is_train, const NDArray& data, const NDArray& weights, const NDArray* bias, const NDArray& output) { - auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; + auto prop = is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring; auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, param.mkldnn_param.quantized); + auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, param.dnnl_param.quantized); auto out_md = GetMemDesc(output); auto bias_md = - bias ? (param.mkldnn_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias)) - : mkldnn::memory::desc{ - {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any}; + bias ? (param.dnnl_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias)) + : dnnl::memory::desc{{}, dnnl::memory::data_type::undef, dnnl::memory::format_tag::any}; auto bias_md_ptr = bias ? 
&bias_md : nullptr; - mkldnn::memory::dims strides(param.conv_param.kernel.ndim()); - mkldnn::memory::dims padding(param.conv_param.kernel.ndim()); + dnnl::memory::dims strides(param.conv_param.kernel.ndim()); + dnnl::memory::dims padding(param.conv_param.kernel.ndim()); if (param.conv_param.kernel.ndim() == 1) { CHECK_GE(param.conv_param.stride.ndim(), 1); CHECK_GE(param.conv_param.pad.ndim(), 1); @@ -87,48 +84,48 @@ std::shared_ptr GetConvFwdImpl( padding[1] = param.conv_param.pad[1]; padding[2] = param.conv_param.pad[2]; } else { - LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.conv_param.kernel.ndim() + LOG(FATAL) << "Unexpected DNNL Conv kernel size " << param.conv_param.kernel.ndim() << ", supporting only 1 or 2 or 3."; } - mkldnn::primitive_attr attr; - mkldnn::post_ops ops; - if (param.mkldnn_param.with_act) { + dnnl::primitive_attr attr; + dnnl::post_ops ops; + if (param.dnnl_param.with_act) { const auto& act_param = param.act_param; ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta); } - if (param.mkldnn_param.with_sum) { + if (param.dnnl_param.with_sum) { ops.append_sum(param.sum_scale); } - if (param.mkldnn_param.with_postsum_act) { + if (param.dnnl_param.with_postsum_act) { const auto& act_param = param.postsum_act_param; ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta); } attr.set_post_ops(ops); - if (param.mkldnn_param.quantized && param.requantize_scales.size()) { + if (param.dnnl_param.quantized && param.requantize_scales.size()) { int mask = (param.requantize_scales.size() > 1) ? 2 : 0; attr.set_output_scales(mask, param.requantize_scales); } auto GetConvFwdPd = - [¶m, &data, &weights, &output, &attr](const mkldnn::convolution_forward::desc& desc) { + [¶m, &data, &weights, &output, &attr](const dnnl::convolution_forward::desc& desc) { auto engine = CpuEngine::Get()->get_engine(); try { - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKL-DNN operators + // DNNL introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. Currently, DNNL operators // still reuse memory from memory planning, so here we need to select a // suboptimal kernel for computation that has the expected memory size requirements auto conv_pd = - std::make_shared(desc, attr, engine); + std::make_shared(desc, attr, engine); while (conv_pd->dst_desc().get_size() != GetArraySize(output) || conv_pd->src_desc().get_size() != GetArraySize(data) || - (!param.mkldnn_param.quantized && + (!param.dnnl_param.quantized && conv_pd->weights_desc().get_size() != GetArraySize(weights))) { // next_impl() will visit desc and engine, please make sure they are still alive here. 
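The try-block here keeps calling next_impl() until the selected oneDNN implementation reports the same memory sizes that MXNet's memory planner reserved for the source, weight, and destination arrays. A minimal sketch of that probing loop, assuming a forward primitive descriptor; SelectImplMatchingSizes is an illustrative name, not the actual MXNet helper, and the quantized path in the real code relaxes the weights check:

#include <dnnl.hpp>

// Advance through oneDNN's implementation list until the src/weights/dst
// descriptors match the byte sizes reserved by the memory planner.
// Returns false if the implementation list is exhausted first.
template <typename PrimitiveDesc>
bool SelectImplMatchingSizes(PrimitiveDesc* pd,
                             size_t src_bytes,
                             size_t wei_bytes,
                             size_t dst_bytes) {
  while (pd->src_desc().get_size() != src_bytes ||
         pd->weights_desc().get_size() != wei_bytes ||
         pd->dst_desc().get_size() != dst_bytes) {
    if (!pd->next_impl())
      return false;  // no implementation with plain-size memory requirements
  }
  return true;
}
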
CHECK(conv_pd->next_impl()) << "No convolution implementation for this request."; } return conv_pd; - } catch (mkldnn::error& e) { - if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) { + } catch (dnnl::error& e) { + if (e.status == dnnl_unimplemented && param.dnnl_param.quantized) { LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is " "required for int8 convolution"; } else { @@ -139,28 +136,28 @@ std::shared_ptr GetConvFwdImpl( }; if (param.conv_param.dilate.ndim() == 0 && bias_md_ptr == nullptr) { - mkldnn::convolution_forward::desc desc(prop, - mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - out_md, - strides, - padding, - padding); + dnnl::convolution_forward::desc desc(prop, + dnnl::algorithm::convolution_direct, + data_md, + weight_md, + out_md, + strides, + padding, + padding); return GetConvFwdPd(desc); } else if (param.conv_param.dilate.ndim() == 0) { - mkldnn::convolution_forward::desc desc(prop, - mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - *bias_md_ptr, - out_md, - strides, - padding, - padding); + dnnl::convolution_forward::desc desc(prop, + dnnl::algorithm::convolution_direct, + data_md, + weight_md, + *bias_md_ptr, + out_md, + strides, + padding, + padding); return GetConvFwdPd(desc); } else { - mkldnn::memory::dims dilates(param.conv_param.kernel.ndim()); + dnnl::memory::dims dilates(param.conv_param.kernel.ndim()); if (param.conv_param.dilate.ndim() == 1) { dilates[0] = param.conv_param.dilate[0] - 1; } else if (param.conv_param.dilate.ndim() == 2) { @@ -171,48 +168,48 @@ std::shared_ptr GetConvFwdImpl( dilates[1] = param.conv_param.dilate[1] - 1; dilates[2] = param.conv_param.dilate[2] - 1; } else { - LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.conv_param.dilate.ndim() + LOG(FATAL) << "Unexpected DNNL Conv dilate size " << param.conv_param.dilate.ndim() << ", supporting only 1 or 2 or 3."; } if (bias_md_ptr == nullptr) { - mkldnn::convolution_forward::desc desc(prop, - mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - out_md, - strides, - dilates, - padding, - padding); + dnnl::convolution_forward::desc desc(prop, + dnnl::algorithm::convolution_direct, + data_md, + weight_md, + out_md, + strides, + dilates, + padding, + padding); return GetConvFwdPd(desc); } else { - mkldnn::convolution_forward::desc desc(prop, - mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - *bias_md_ptr, - out_md, - strides, - dilates, - padding, - padding); + dnnl::convolution_forward::desc desc(prop, + dnnl::algorithm::convolution_direct, + data_md, + weight_md, + *bias_md_ptr, + out_md, + strides, + dilates, + padding, + padding); return GetConvFwdPd(desc); } } } -static std::shared_ptr GetConvBwdData( +static std::shared_ptr GetConvBwdData( const ConvolutionParam& param, const NDArray& data, const NDArray& weight, const NDArray& output, - const mkldnn::convolution_forward::primitive_desc& fwd_pd) { + const dnnl::convolution_forward::primitive_desc& fwd_pd) { auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weight, param.num_group); auto out_md = GetMemDesc(output); auto engine = CpuEngine::Get()->get_engine(); - mkldnn::memory::dims strides(param.kernel.ndim()); - mkldnn::memory::dims padding(param.kernel.ndim()); + dnnl::memory::dims strides(param.kernel.ndim()); + dnnl::memory::dims padding(param.kernel.ndim()); if (param.kernel.ndim() == 1) { CHECK_GE(param.stride.ndim(), 1); CHECK_GE(param.pad.ndim(), 1); @@ -238,20 +235,20 @@ static std::shared_ptr GetCon 
padding[1] = param.pad[1]; padding[2] = param.pad[2]; } else { - LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.kernel.ndim() + LOG(FATAL) << "Unexpected DNNL Conv kernel size " << param.kernel.ndim() << ", supporting only 1 or 2 or 3."; } auto GetConvBwdDataPd = [&data, &weight, &output, &fwd_pd]( - const mkldnn::convolution_backward_data::desc& desc) { + const dnnl::convolution_backward_data::desc& desc) { auto engine = CpuEngine::Get()->get_engine(); try { - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKL-DNN operators + // DNNL introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. Currently, DNNL operators // still reuse memory from memory planning, so here we need to select a // suboptimal kernel for computation that has the expected memory size requirements auto conv_pd = - std::make_shared(desc, engine, fwd_pd); + std::make_shared(desc, engine, fwd_pd); while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) || conv_pd->diff_src_desc().get_size() != GetArraySize(data) || conv_pd->weights_desc().get_size() != GetArraySize(weight)) { @@ -259,23 +256,18 @@ static std::shared_ptr GetCon CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request."; } return conv_pd; - } catch (mkldnn::error& e) { + } catch (dnnl::error& e) { LOG(ERROR) << e.message; throw; } }; if (param.dilate.ndim() == 0) { - mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - out_md, - strides, - padding, - padding); + dnnl::convolution_backward_data::desc desc( + dnnl::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding); return GetConvBwdDataPd(desc); } else { - mkldnn::memory::dims dilates(param.kernel.ndim()); + dnnl::memory::dims dilates(param.kernel.ndim()); if (param.dilate.ndim() == 1) { dilates[0] = param.dilate[0] - 1; } else if (param.dilate.ndim() == 2) { @@ -286,34 +278,34 @@ static std::shared_ptr GetCon dilates[1] = param.dilate[1] - 1; dilates[2] = param.dilate[2] - 1; } else { - LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim() + LOG(FATAL) << "Unexpected DNNL Conv dilate size " << param.dilate.ndim() << ", supporting only 1 or 2 or 3."; } - mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - out_md, - strides, - dilates, - padding, - padding); + dnnl::convolution_backward_data::desc desc(dnnl::algorithm::convolution_direct, + data_md, + weight_md, + out_md, + strides, + dilates, + padding, + padding); return GetConvBwdDataPd(desc); } } -static std::shared_ptr GetConvBwdWeights( +static std::shared_ptr GetConvBwdWeights( const ConvolutionParam& param, const NDArray& data, const NDArray& weight, const NDArray* bias, const NDArray& output, - const mkldnn::convolution_forward::primitive_desc& fwd_pd) { + const dnnl::convolution_forward::primitive_desc& fwd_pd) { auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weight, param.num_group); auto out_md = GetMemDesc(output); auto engine = CpuEngine::Get()->get_engine(); - mkldnn::memory::dims strides(param.kernel.ndim()); - mkldnn::memory::dims padding(param.kernel.ndim()); + dnnl::memory::dims strides(param.kernel.ndim()); + dnnl::memory::dims padding(param.kernel.ndim()); if (param.kernel.ndim() == 1) { CHECK_GE(param.stride.ndim(), 1); CHECK_GE(param.pad.ndim(), 1); @@ 
-339,19 +331,19 @@ static std::shared_ptr Get padding[1] = param.pad[1]; padding[2] = param.pad[2]; } else { - LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.kernel.ndim() + LOG(FATAL) << "Unexpected DNNL Conv kernel size " << param.kernel.ndim() << ", supporting only 1 or 2 or 3."; } auto GetConvBwdWeightsPd = [&data, &weight, &output, &fwd_pd]( - const mkldnn::convolution_backward_weights::desc& desc) { + const dnnl::convolution_backward_weights::desc& desc) { auto engine = CpuEngine::Get()->get_engine(); try { - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKL-DNN operators + // DNNL introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. Currently, DNNL operators // still reuse memory from memory planning, so here we need to select a // suboptimal kernel for computation that has the expected memory size requirements - auto conv_pd = std::make_shared( + auto conv_pd = std::make_shared( desc, engine, fwd_pd); while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) || conv_pd->src_desc().get_size() != GetArraySize(data) || @@ -360,34 +352,29 @@ static std::shared_ptr Get CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request."; } return conv_pd; - } catch (mkldnn::error& e) { + } catch (dnnl::error& e) { LOG(ERROR) << e.message; throw; } }; if (param.dilate.ndim() == 0 && bias == nullptr) { - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - out_md, - strides, - padding, - padding); + dnnl::convolution_backward_weights::desc desc( + dnnl::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding); return GetConvBwdWeightsPd(desc); } else if (param.dilate.ndim() == 0) { auto bias_md = GetMemDesc(*bias); - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - bias_md, - out_md, - strides, - padding, - padding); + dnnl::convolution_backward_weights::desc desc(dnnl::algorithm::convolution_direct, + data_md, + weight_md, + bias_md, + out_md, + strides, + padding, + padding); return GetConvBwdWeightsPd(desc); } else { - mkldnn::memory::dims dilates(param.kernel.ndim()); + dnnl::memory::dims dilates(param.kernel.ndim()); if (param.dilate.ndim() == 1) { dilates[0] = param.dilate[0] - 1; } else if (param.dilate.ndim() == 2) { @@ -398,52 +385,52 @@ static std::shared_ptr Get dilates[1] = param.dilate[1] - 1; dilates[2] = param.dilate[2] - 1; } else { - LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim() + LOG(FATAL) << "Unexpected DNNL Conv dilate size " << param.dilate.ndim() << ", supporting only 1 or 2 or 3."; } if (bias == nullptr) { - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - out_md, - strides, - dilates, - padding, - padding); + dnnl::convolution_backward_weights::desc desc(dnnl::algorithm::convolution_direct, + data_md, + weight_md, + out_md, + strides, + dilates, + padding, + padding); return GetConvBwdWeightsPd(desc); } else { auto bias_md = GetMemDesc(*bias); - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - bias_md, - out_md, - strides, - dilates, - padding, - padding); + dnnl::convolution_backward_weights::desc desc(dnnl::algorithm::convolution_direct, + data_md, + weight_md, + 
bias_md, + out_md, + strides, + dilates, + padding, + padding); return GetConvBwdWeightsPd(desc); } } } -MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam& param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const NDArray& output) +DNNLConvForward::DNNLConvForward(const DNNLConvFullParam& param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const NDArray& output) : pd_(GetConvFwdImpl(param, is_train, data, weight, bias, output)) { - fwd_ = std::make_shared(GetPd()); + fwd_ = std::make_shared(GetPd()); } -MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const NDArray& output) { - using conv_fwd_map = std::unordered_map; +DNNLConvForward& GetConvFwd(const DNNLConvFullParam& param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const NDArray& output) { + using conv_fwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL static thread_local conv_fwd_map fwds; #else @@ -451,7 +438,7 @@ MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param, #endif // TODO(zhennan): Hash conv_param for now, need to hash full param if we want to enable cache for // fused conv - MKLDNNConvSignature key(param.conv_param); + DNNLConvSignature key(param.conv_param); key.AddSign(is_train); // Here we can sign the conv op with NDArray because conv primitive will decide the right layout // for the, so we only need to get the shape and the data type of the arrays. @@ -463,30 +450,30 @@ MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param, auto it = fwds.find(key); if (it == fwds.end()) { - auto fwd = MKLDNNConvForward(param, is_train, data, weight, bias, output); + auto fwd = DNNLConvForward(param, is_train, data, weight, bias, output); it = AddToCache(&fwds, key, fwd); } return it->second; } -void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param, - const OpContext& ctx, - MKLDNNConvForward* fwd, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { +void DNNLConvolutionForwardFullFeature(const DNNLConvFullParam& param, + const OpContext& ctx, + DNNLConvForward* fwd, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); auto& data = in_data[conv::kData]; auto& weight = in_data[conv::kWeight]; - bool no_bias = param.conv_param.no_bias && !param.mkldnn_param.with_bn; + bool no_bias = param.conv_param.no_bias && !param.dnnl_param.with_bn; - auto data_mem = data.GetMKLDNNDataReorder(fwd->GetPd().src_desc()); - const mkldnn::memory* weight_mem; + auto data_mem = data.GetDNNLDataReorder(fwd->GetPd().src_desc()); + const dnnl::memory* weight_mem; if (ctx.is_train) { - // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it to the default format + // TODO(zhengda) kvstore doesn't handle DNNL correctly. Let's reorder it to the default format // for now. - if (weight.IsMKLDNNData()) + if (weight.IsDNNLData()) // This asks the engine to change the layout of the weight array after it's used. 
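The weight handling in this hunk, whether reordering to the default layout for training or to the primitive's preferred layout for inference, comes down to a oneDNN reorder between two memory descriptors. A hedged sketch of such a reorder; ReorderTo is an illustrative name rather than the MXNet helper, and the engine/stream are assumed to be supplied by the caller:

#include <dnnl.hpp>

// Copy src into a buffer with the requested layout (no-op if it already matches).
dnnl::memory ReorderTo(dnnl::memory src,
                       const dnnl::memory::desc& dst_md,
                       const dnnl::engine& eng,
                       dnnl::stream& strm) {
  if (src.get_desc() == dst_md)
    return src;                                   // layout already matches
  dnnl::memory dst(dst_md, eng);                  // allocate destination buffer
  dnnl::reorder(src, dst).execute(strm, src, dst);
  strm.wait();                                    // block until the reorder finishes
  return dst;
}
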
weight.Reorder2DefaultAsync(); weight_mem = GetWeights(weight, fwd->GetPd().weights_desc(), param.conv_param.num_group); @@ -496,77 +483,77 @@ void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param, if (weight.IsDefaultData()) { // We also need to modify the layout on the original weight array. The data conversion happens // after the weight array is used. - weight.MKLDNNDataReorderAsync(fwd->GetPd().weights_desc()); + weight.DNNLDataReorderAsync(fwd->GetPd().weights_desc()); weight_mem = GetWeights(weight, fwd->GetPd().weights_desc(), param.conv_param.num_group); } else { - weight_mem = weight.GetMKLDNNDataReorder(fwd->GetPd().weights_desc()); + weight_mem = weight.GetDNNLDataReorder(fwd->GetPd().weights_desc()); } } - mkldnn_output_t out_mem; - if (param.mkldnn_param.with_sum) { - out_mem = mkldnn_output_t(OutDataOp::Noop, - const_cast(out_data[conv::kOut].GetMKLDNNData())); + dnnl_output_t out_mem; + if (param.dnnl_param.with_sum) { + out_mem = dnnl_output_t(OutDataOp::Noop, + const_cast(out_data[conv::kOut].GetDNNLData())); } else { - out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd->GetPd().dst_desc(), req[conv::kOut]); + out_mem = CreateDNNLMem(out_data[conv::kOut], fwd->GetPd().dst_desc(), req[conv::kOut]); } - mkldnn_args_map_t net_args; + dnnl_args_map_t net_args; if (!no_bias) { - const mkldnn::memory* bias_mem = in_data[conv::kBias].GetMKLDNNData(); - net_args.insert({MKLDNN_ARG_BIAS, *bias_mem}); + const dnnl::memory* bias_mem = in_data[conv::kBias].GetDNNLData(); + net_args.insert({DNNL_ARG_BIAS, *bias_mem}); } - net_args.insert({MKLDNN_ARG_SRC, *data_mem}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *weight_mem}); - net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); - MKLDNNStream::Get()->RegisterPrimArgs(fwd->GetFwd(), net_args); + net_args.insert({DNNL_ARG_SRC, *data_mem}); + net_args.insert({DNNL_ARG_WEIGHTS, *weight_mem}); + net_args.insert({DNNL_ARG_DST, *out_mem.second}); + DNNLStream::Get()->RegisterPrimArgs(fwd->GetFwd(), net_args); CommitOutput(out_data[conv::kOut], out_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } -void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - MKLDNNConvFullParam param; +void DNNLConvolutionForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + DNNLConvFullParam param; param.conv_param = nnvm::get(attrs.parsed); - param.mkldnn_param.Init(std::unordered_map()); + param.dnnl_param.Init(std::unordered_map()); auto& fwd = GetConvFwd(param, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], param.conv_param.no_bias ? 
nullptr : &in_data[conv::kBias], out_data[conv::kOut]); - MKLDNNConvolutionForwardFullFeature(param, ctx, &fwd, in_data, req, out_data); + DNNLConvolutionForwardFullFeature(param, ctx, &fwd, in_data, req, out_data); } -MKLDNNConvBackward::MKLDNNConvBackward(const MKLDNNConvFullParam& param, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const NDArray& output) { +DNNLConvBackward::DNNLConvBackward(const DNNLConvFullParam& param, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const NDArray& output) { const auto fwd_pd = GetConvFwdImpl(param, true, data, weight, bias, output); bwd_data_pd_ = GetConvBwdData(param.conv_param, data, weight, output, *fwd_pd); bwd_weight_pd_ = GetConvBwdWeights(param.conv_param, data, weight, bias, output, *fwd_pd); - bwd_data_ = std::make_shared(GetDataPd()); - bwd_weight_ = std::make_shared(GetWeightsPd()); + bwd_data_ = std::make_shared(GetDataPd()); + bwd_weight_ = std::make_shared(GetWeightsPd()); } -static inline MKLDNNConvBackward& GetConvBwd(const MKLDNNConvFullParam& param, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const NDArray& output) { - using mkldnn_conv_bwd_map = std::unordered_map; +static inline DNNLConvBackward& GetConvBwd(const DNNLConvFullParam& param, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const NDArray& output) { + using dnnl_conv_bwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL - static thread_local mkldnn_conv_bwd_map bwds; + static thread_local dnnl_conv_bwd_map bwds; #else - static MX_THREAD_LOCAL mkldnn_conv_bwd_map bwds; + static MX_THREAD_LOCAL dnnl_conv_bwd_map bwds; #endif // TODO(zhennan): Hash conv_param for now, need to hash full param if we want to enable cache for // fused conv - MKLDNNConvSignature key(param.conv_param); + DNNLConvSignature key(param.conv_param); // Here we can sign the conv op with NDArray because conv primitive will decide the right layout // for the, so we only need to get the shape and the data type of the arrays. 
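GetConvFwd and GetConvBwd share the same caching idiom: a thread-local map from an op signature (parameters plus input shapes and dtypes, here DNNLConvSignature) to a constructed primitive wrapper. A generic sketch of that idiom in plain C++, assuming the key type is hashable and the cached type is copyable; GetOrCreate and Builder are illustrative names:

#include <unordered_map>

// One thread-local cache per primitive type; build() is invoked only on a miss,
// so the oneDNN primitive is constructed once per distinct signature per thread.
template <typename Primitive, typename Key, typename Builder>
Primitive& GetOrCreate(const Key& key, Builder build) {
  static thread_local std::unordered_map<Key, Primitive> cache;
  auto it = cache.find(key);
  if (it == cache.end())
    it = cache.emplace(key, build()).first;
  return it->second;
}
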
key.AddSign(data); @@ -577,22 +564,22 @@ static inline MKLDNNConvBackward& GetConvBwd(const MKLDNNConvFullParam& param, auto it = bwds.find(key); if (it == bwds.end()) { - auto bwd = MKLDNNConvBackward(param, data, weight, bias, output); + auto bwd = DNNLConvBackward(param, data, weight, bias, output); it = AddToCache(&bwds, key, bwd); } return it->second; } -void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void DNNLConvolutionBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); const std::vector& in_grad = outputs; - MKLDNNConvFullParam full_param; + DNNLConvFullParam full_param; full_param.conv_param = nnvm::get(attrs.parsed); - full_param.mkldnn_param.Init(std::unordered_map()); + full_param.dnnl_param.Init(std::unordered_map()); auto& data = inputs[conv::kData + 1]; auto& weight = inputs[conv::kWeight + 1]; @@ -602,16 +589,16 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const ConvolutionParam& param = full_param.conv_param; CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace"; - MKLDNNConvBackward& convBwd = GetConvBwd(full_param, data, weight, bias, out_grad); - auto out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetDataPd().diff_dst_desc()); + DNNLConvBackward& convBwd = GetConvBwd(full_param, data, weight, bias, out_grad); + auto out_grad_mem = out_grad.GetDNNLDataReorder(convBwd.GetDataPd().diff_dst_desc()); if (req[conv::kData]) { - auto weight_mem = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group); - auto in_grad_mem = CreateMKLDNNMem( - in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(), req[conv::kData]); - MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdData(), - {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_WEIGHTS, *weight_mem}, - {MKLDNN_ARG_DIFF_SRC, *in_grad_mem.second}}); + auto weight_mem = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group); + auto in_grad_mem = + CreateDNNLMem(in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(), req[conv::kData]); + DNNLStream::Get()->RegisterPrimArgs(convBwd.GetBwdData(), + {{DNNL_ARG_DIFF_DST, *out_grad_mem}, + {DNNL_ARG_WEIGHTS, *weight_mem}, + {DNNL_ARG_DIFF_SRC, *in_grad_mem.second}}); CommitOutput(in_grad[conv::kData], in_grad_mem); } @@ -619,28 +606,28 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, auto req_bias = req.size() > conv::kBias ? 
req.at(conv::kBias) : kNullOp; if (req_weight || req_bias) { if (convBwd.GetDataPd().diff_dst_desc() != convBwd.GetWeightsPd().diff_dst_desc()) - out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetWeightsPd().diff_dst_desc()); - auto data_mem = data.GetMKLDNNDataReorder(convBwd.GetWeightsPd().src_desc()); - auto in_grad_weight = CreateMKLDNNWeightGrad( + out_grad_mem = out_grad.GetDNNLDataReorder(convBwd.GetWeightsPd().diff_dst_desc()); + auto data_mem = data.GetDNNLDataReorder(convBwd.GetWeightsPd().src_desc()); + auto in_grad_weight = CreateDNNLWeightGrad( in_grad[conv::kWeight], convBwd.GetWeightsPd().diff_weights_desc(), req[conv::kWeight]); - mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_SRC, *data_mem}, - {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; - mkldnn_output_t in_grad_bias; + dnnl_args_map_t net_args = {{DNNL_ARG_DIFF_DST, *out_grad_mem}, + {DNNL_ARG_SRC, *data_mem}, + {DNNL_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; + dnnl_output_t in_grad_bias; if (!param.no_bias) { - in_grad_bias = CreateMKLDNNMem( + in_grad_bias = CreateDNNLMem( in_grad[conv::kBias], convBwd.GetWeightsPd().diff_bias_desc(), req[conv::kBias]); - net_args.insert({MKLDNN_ARG_DIFF_BIAS, *in_grad_bias.second}); + net_args.insert({DNNL_ARG_DIFF_BIAS, *in_grad_bias.second}); } - MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdWeights(), net_args); + DNNLStream::Get()->RegisterPrimArgs(convBwd.GetBwdWeights(), net_args); CommitOutput(in_grad[conv::kWeight], in_grad_weight); // CommitOutput Should run after RegisterPrimArgs for memory dependency if (!param.no_bias) { CommitOutput(in_grad[conv::kBias], in_grad_bias); } } - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/dnnl/dnnl_copy.cc similarity index 69% rename from src/operator/nn/mkldnn/mkldnn_copy.cc rename to src/operator/nn/dnnl/dnnl_copy.cc index 813016d264d3..2b78103a02ee 100644 --- a/src/operator/nn/mkldnn/mkldnn_copy.cc +++ b/src/operator/nn/dnnl/dnnl_copy.cc @@ -18,40 +18,40 @@ */ /*! - * \file mkldnn_copy.cc + * \file dnnl_copy.cc * \brief * \author */ -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" #if MXNET_USE_ONEDNN == 1 namespace mxnet { namespace op { -void MKLDNNCopy(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data) { +void DNNLCopy(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data) { if (req == kNullOp || req == kWriteInplace) return; TmpMemMgr::Get()->Init(ctx.requested[0]); - auto in_mem = in_data.GetMKLDNNData(); + auto in_mem = in_data.GetDNNLData(); if (req == kAddTo) { TmpMemMgr::Get()->Init(ctx.requested[0]); // We should try and force the input memory has the same format // as the input output. If not, we'll have to reorder memory. 
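
(Illustrative aside, not part of this patch.) In the kAddTo branch that follows, the input is reordered into the output's layout and then accumulated into the existing output through DNNLSum. The standalone sketch below shows roughly what that accumulation amounts to with the raw oneDNN sum primitive; the real operator does not call execute() directly but registers the primitive on its stream and submits later.

    // Illustrative only: dst += src using a oneDNN sum primitive with scales {1, 1}.
    #include "oneapi/dnnl/dnnl.hpp"

    int main() {
      dnnl::engine eng(dnnl::engine::kind::cpu, 0);
      dnnl::stream strm(eng);

      dnnl::memory::desc md({2, 3}, dnnl::memory::data_type::f32,
                            dnnl::memory::format_tag::nc);
      dnnl::memory dst(md, eng), src(md, eng);  // assume both already hold data

      // dst = 1.0 * dst + 1.0 * src; oneDNN allows dst to alias the first source.
      dnnl::sum::primitive_desc sum_pd(md, {1.0f, 1.0f}, {md, md}, eng);
      dnnl::sum(sum_pd).execute(strm, {{DNNL_ARG_MULTIPLE_SRC + 0, dst},
                                       {DNNL_ARG_MULTIPLE_SRC + 1, src},
                                       {DNNL_ARG_DST, dst}});
      strm.wait();
      return 0;
    }
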
- auto out_mem = out_data.GetMKLDNNData(); - in_mem = in_data.GetMKLDNNData(out_mem->get_desc()); + auto out_mem = out_data.GetDNNLData(); + in_mem = in_data.GetDNNLData(out_mem->get_desc()); if (in_mem == nullptr) - in_mem = in_data.GetMKLDNNDataReorder(out_mem->get_desc()); - MKLDNNSum(*out_mem, *in_mem, *out_mem); + in_mem = in_data.GetDNNLDataReorder(out_mem->get_desc()); + DNNLSum(*out_mem, *in_mem, *out_mem); } else { const_cast(out_data).CopyFrom(*in_mem); } - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/dnnl/dnnl_deconvolution-inl.h similarity index 56% rename from src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h rename to src/operator/nn/dnnl/dnnl_deconvolution-inl.h index a66d3a887326..301537967df3 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h +++ b/src/operator/nn/dnnl/dnnl_deconvolution-inl.h @@ -18,7 +18,7 @@ */ /*! - * \file mkldnn_deconvolution-inl.h + * \file dnnl_deconvolution-inl.h * Naming convention: * ________ * (src) data --->|Deconv| @@ -31,10 +31,10 @@ * |______|<--- bias * * "out" in this (and .cc) file will always refer to the output of Deconv FWD and - * "out_grad" to its gradient. The corresponding MKLDNN names are in parentheses. + * "out_grad" to its gradient. The corresponding DNNL names are in parentheses. */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_DECONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_DECONVOLUTION_INL_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -42,25 +42,25 @@ #include #include "../deconvolution-inl.h" -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" namespace mxnet { namespace op { -using deconv_fwd_t = mkldnn::deconvolution_forward; -using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc; +using deconv_fwd_t = dnnl::deconvolution_forward; +using deconv_fwd_pd_t = dnnl::deconvolution_forward::primitive_desc; -using deconv_bwd_data_t = mkldnn::deconvolution_backward_data; -using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc; +using deconv_bwd_data_t = dnnl::deconvolution_backward_data; +using deconv_bwd_data_pd_t = dnnl::deconvolution_backward_data::primitive_desc; -using deconv_bwd_weights_t = mkldnn::deconvolution_backward_weights; -using deconv_bwd_weights_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc; +using deconv_bwd_weights_t = dnnl::deconvolution_backward_weights; +using deconv_bwd_weights_pd_t = dnnl::deconvolution_backward_weights::primitive_desc; // Swaps the logical order of dimensions that in plain format would correspond to input and output // channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). 
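
(Illustrative aside, not part of this patch.) Before the helper itself, the toy program below shows what dnnl::memory::desc::permute_axes(), which the swap is built on, does with such a permutation: the logical dimensions are reordered while the underlying physical layout is left untouched, so no data is moved.

    // Illustrative only: swapping the first two logical axes of a weights descriptor.
    #include <iostream>
    #include "oneapi/dnnl/dnnl.hpp"

    int main() {
      using dt  = dnnl::memory::data_type;
      using tag = dnnl::memory::format_tag;
      // Weights in logical oihw order: O=8, I=4, H=3, W=3.
      dnnl::memory::desc oihw({8, 4, 3, 3}, dt::f32, tag::oihw);
      // Swapping the first two logical axes yields an iohw view: (4, 8, 3, 3).
      dnnl::memory::desc iohw = oihw.permute_axes({1, 0, 2, 3});
      for (int i = 0; i < iohw.data.ndims; ++i)
        std::cout << iohw.data.dims[i] << ' ';  // prints: 4 8 3 3
      std::cout << '\n';
      return 0;
    }
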
-inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc& desc, - const uint32_t num_group) { +inline dnnl::memory::desc IOLogicalSwapDesc(const dnnl::memory::desc& desc, + const uint32_t num_group) { std::vector order(desc.data.ndims); std::iota(std::begin(order), std::end(order), 0); const int offset = static_cast(num_group > 1); @@ -68,29 +68,29 @@ inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc& desc, return desc.permute_axes(order); } -// Applies IOLogicalSwapDesc to MKLDNN memory of arr -inline void IOLogicalSwapMKLDNNMem(const NDArray& arr, const uint32_t num_group) { - mkldnn::memory::desc desc; - if (arr.IsMKLDNNData()) { - desc = arr.GetMKLDNNData()->get_desc(); +// Applies IOLogicalSwapDesc to DNNL memory of arr +inline void IOLogicalSwapDNNLMem(const NDArray& arr, const uint32_t num_group) { + dnnl::memory::desc desc; + if (arr.IsDNNLData()) { + desc = arr.GetDNNLData()->get_desc(); } else { - // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use + // GetDNNLData won't take groups into account when creating dnnl::memory, we need to use // descriptor from GetWeightDesc but with default format const auto& temp = GetWeightDesc(arr, num_group); - desc = mkldnn::memory::desc( + desc = dnnl::memory::desc( temp.dims(), temp.data_type(), - static_cast(GetDefaultFormat(temp.data.ndims))); + static_cast(GetDefaultFormat(temp.data.ndims))); } - const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_group)); + const_cast(arr).UpdateDNNLMemDesc(IOLogicalSwapDesc(desc, num_group)); } // Version of GetWeightsDesc for deconvolution (with swap) -inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray& weights, const uint32_t num_group) { +inline dnnl::memory::desc GetDeconvWeightsDesc(const NDArray& weights, const uint32_t num_group) { return IOLogicalSwapDesc(GetWeightDesc(weights, num_group), num_group); } -class MKLDNNDeconvFwd { +class DNNLDeconvFwd { public: struct Tensors { Tensors(const NDArray& data, @@ -107,65 +107,65 @@ class MKLDNNDeconvFwd { const NDArray& out; }; - static MKLDNNDeconvFwd& GetCached(const DeconvolutionParam& param, const Tensors& tensors); + static DNNLDeconvFwd& GetCached(const DeconvolutionParam& param, const Tensors& tensors); static std::shared_ptr CreatePrimitiveDesc(const DeconvolutionParam& param, const Tensors& tensors); - MKLDNNDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors); + DNNLDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors); void ControlWeightsFormat(const uint32_t num_group, const bool is_train, const NDArray& weights) const; void Execute(const uint32_t num_group, const OpReqType req, const Tensors& tensors) const; private: - const mkldnn::memory* DataMem(const NDArray& data) const; - const mkldnn::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const; - const mkldnn::memory* BiasMem(const NDArray& bias) const; + const dnnl::memory* DataMem(const NDArray& data) const; + const dnnl::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const; + const dnnl::memory* BiasMem(const NDArray& bias) const; - mkldnn_output_t OutMem(const OpReqType req, const NDArray& out) const; + dnnl_output_t OutMem(const OpReqType req, const NDArray& out) const; private: std::shared_ptr fwd; std::shared_ptr fwd_pd; }; -MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias, - const std::vector& inputs, - const std::vector& outputs) +DNNLDeconvFwd::Tensors::Tensors(const bool no_bias, + const std::vector& 
inputs, + const std::vector& outputs) : data(inputs[deconv::kData]), weights(inputs[deconv::kWeight]), bias(no_bias ? nullptr : &inputs[deconv::kBias]), out(outputs[deconv::kOut]) {} -MKLDNNDeconvFwd::Tensors::Tensors(const NDArray& data, - const NDArray& weights, - const NDArray* const bias, - const NDArray& out) +DNNLDeconvFwd::Tensors::Tensors(const NDArray& data, + const NDArray& weights, + const NDArray* const bias, + const NDArray& out) : data(data), weights(weights), bias(bias), out(out) {} -MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors) +DNNLDeconvFwd::DNNLDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors) : fwd_pd(CreatePrimitiveDesc(param, tensors)) { fwd = std::make_shared(*fwd_pd); } -inline const mkldnn::memory* MKLDNNDeconvFwd::DataMem(const NDArray& data) const { - return data.GetMKLDNNDataReorder(fwd_pd->src_desc()); +inline const dnnl::memory* DNNLDeconvFwd::DataMem(const NDArray& data) const { + return data.GetDNNLDataReorder(fwd_pd->src_desc()); } -inline const mkldnn::memory* MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group, - const NDArray& weights) const { +inline const dnnl::memory* DNNLDeconvFwd::WeightsMem(const uint32_t num_group, + const NDArray& weights) const { return GetWeights(weights, fwd_pd->weights_desc(), num_group); } -inline const mkldnn::memory* MKLDNNDeconvFwd::BiasMem(const NDArray& bias) const { - return bias.GetMKLDNNData(); +inline const dnnl::memory* DNNLDeconvFwd::BiasMem(const NDArray& bias) const { + return bias.GetDNNLData(); } -inline mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray& out) const { - return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req); +inline dnnl_output_t DNNLDeconvFwd::OutMem(const OpReqType req, const NDArray& out) const { + return CreateDNNLMem(out, fwd_pd->dst_desc(), req); } -class MKLDNNDeconvBwd { +class DNNLDeconvBwd { public: struct ReadTensors { ReadTensors(const bool no_bias, const std::vector& inputs); @@ -181,8 +181,7 @@ class MKLDNNDeconvBwd { const NDArray* const bias_grad; }; - static MKLDNNDeconvBwd& GetCached(const DeconvolutionParam& param, - const ReadTensors& read_tensors); + static DNNLDeconvBwd& GetCached(const DeconvolutionParam& param, const ReadTensors& read_tensors); static std::shared_ptr CreateDataPrimitiveDesc( const DeconvolutionParam& param, @@ -194,7 +193,7 @@ class MKLDNNDeconvBwd { const ReadTensors& read_tensors, const deconv_fwd_pd_t& fwd_pd); - MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors); + DNNLDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors); void Execute(const uint32_t num_group, const std::vector& req, @@ -209,31 +208,31 @@ class MKLDNNDeconvBwd { // returns the output gradient memory used to calculate the data (input) gradient, // which might be reused when calculating the gradient of weights - const mkldnn::memory* ScheduleBwdData(const uint32_t num_group, - const OpReqType req, - const ReadTensors& read_tensors, - const WriteTensors& write_tensors) const; + const dnnl::memory* ScheduleBwdData(const uint32_t num_group, + const OpReqType req, + const ReadTensors& read_tensors, + const WriteTensors& write_tensors) const; void ScheduleBwdWeights(const uint32_t num_group, const std::vector& req, const ReadTensors& read_tensors, const WriteTensors& write_tensors, - const mkldnn::memory* const out_grad_mem) const; + const dnnl::memory* const out_grad_mem) const; - const mkldnn::memory* DataMem(const NDArray& data) const; - const 
mkldnn::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const; + const dnnl::memory* DataMem(const NDArray& data) const; + const dnnl::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const; // for calculating the gradient of data (input) - const mkldnn::memory* OutGradMem(const NDArray& out_grad) const; + const dnnl::memory* OutGradMem(const NDArray& out_grad) const; // for calculating the gradient of weights - const mkldnn::memory* OutGradMem(const NDArray& out_grad, - const mkldnn::memory* const out_grad_mem) const; + const dnnl::memory* OutGradMem(const NDArray& out_grad, + const dnnl::memory* const out_grad_mem) const; - mkldnn_output_t DataGradMem(const OpReqType req, const NDArray& data_grad) const; - mkldnn_output_t WeightsGradMem(const uint32_t num_group, - const OpReqType req, - const NDArray& weights_grad) const; - mkldnn_output_t BiasGradMem(const OpReqType req, const NDArray* const bias) const; + dnnl_output_t DataGradMem(const OpReqType req, const NDArray& data_grad) const; + dnnl_output_t WeightsGradMem(const uint32_t num_group, + const OpReqType req, + const NDArray& weights_grad) const; + dnnl_output_t BiasGradMem(const OpReqType req, const NDArray* const bias) const; std::shared_ptr bwd_data_pd; std::shared_ptr bwd_weights_pd; @@ -241,21 +240,21 @@ class MKLDNNDeconvBwd { std::shared_ptr bwd_weights; }; -MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector& inputs) +DNNLDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector& inputs) : data(inputs[deconv::kData + 1]), weights(inputs[deconv::kWeight + 1]), bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]), out_grad(inputs[deconv::kOut]) {} -MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector& outputs) +DNNLDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector& outputs) : data_grad(outputs[deconv::kData]), weights_grad(outputs[deconv::kWeight]), bias_grad(no_bias ? 
nullptr : &outputs[deconv::kBias]) {} -MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors) { - const auto& fwd_pd = MKLDNNDeconvFwd::CreatePrimitiveDesc( +DNNLDeconvBwd::DNNLDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors) { + const auto& fwd_pd = DNNLDeconvFwd::CreatePrimitiveDesc( param, - MKLDNNDeconvFwd::Tensors( + DNNLDeconvFwd::Tensors( read_tensors.data, read_tensors.weights, read_tensors.bias, read_tensors.out_grad)); bwd_data_pd = CreateDataPrimitiveDesc(param, read_tensors, *fwd_pd); bwd_weights_pd = CreateWeightsPrimitiveDesc(param, read_tensors, *fwd_pd); @@ -263,62 +262,61 @@ MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTens bwd_weights = std::make_shared(*bwd_weights_pd); } -inline void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group, - const std::vector& req, - const NDArray& weights, - const NDArray& weights_grad) const { +inline void DNNLDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group, + const std::vector& req, + const NDArray& weights, + const NDArray& weights_grad) const { if (req[deconv::kData]) { - IOLogicalSwapMKLDNNMem(weights, num_group); + IOLogicalSwapDNNLMem(weights, num_group); } if (req[deconv::kWeight] || (req.size() < deconv::kBias && req[deconv::kBias])) { - IOLogicalSwapMKLDNNMem(weights_grad, num_group); + IOLogicalSwapDNNLMem(weights_grad, num_group); } } -inline const mkldnn::memory* MKLDNNDeconvBwd::DataMem(const NDArray& data) const { - return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc()); +inline const dnnl::memory* DNNLDeconvBwd::DataMem(const NDArray& data) const { + return data.GetDNNLDataReorder(bwd_weights_pd->src_desc()); } -inline const mkldnn::memory* MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group, - const NDArray& weights) const { +inline const dnnl::memory* DNNLDeconvBwd::WeightsMem(const uint32_t num_group, + const NDArray& weights) const { return GetWeights(weights, bwd_data_pd->weights_desc(), num_group); } -inline const mkldnn::memory* MKLDNNDeconvBwd::OutGradMem(const NDArray& out_grad) const { - return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc()); +inline const dnnl::memory* DNNLDeconvBwd::OutGradMem(const NDArray& out_grad) const { + return out_grad.GetDNNLDataReorder(bwd_data_pd->diff_dst_desc()); } -inline const mkldnn::memory* MKLDNNDeconvBwd::OutGradMem( - const NDArray& out_grad, - const mkldnn::memory* const out_grad_mem) const { +inline const dnnl::memory* DNNLDeconvBwd::OutGradMem(const NDArray& out_grad, + const dnnl::memory* const out_grad_mem) const { return (out_grad_mem && out_grad_mem->get_desc() == bwd_weights_pd->diff_dst_desc()) ? out_grad_mem - : out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc()); + : out_grad.GetDNNLDataReorder(bwd_weights_pd->diff_dst_desc()); } -inline mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req, - const NDArray& data_grad) const { - return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req); +inline dnnl_output_t DNNLDeconvBwd::DataGradMem(const OpReqType req, + const NDArray& data_grad) const { + return CreateDNNLMem(data_grad, bwd_data_pd->diff_src_desc(), req); } -inline mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group, - const OpReqType req, - const NDArray& weights_grad) const { - // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because - // of the logical swap - explained in MKLDNNDeconvFwd::Execute). 
We try to reuse weights_grad +inline dnnl_output_t DNNLDeconvBwd::WeightsGradMem(const uint32_t num_group, + const OpReqType req, + const NDArray& weights_grad) const { + // CreateDNNLWeightGrad always creates a new tensor as IsDefaultFormat always fails (because + // of the logical swap - explained in DNNLDeconvFwd::Execute). We try to reuse weights_grad // memory (which, when not swapped, is always in default format), so here we check if after a // swap, weights_md will have a default format const auto& weights_md = bwd_weights_pd->diff_weights_desc(); if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(weights_md, num_group))) { - return {OutDataOp::Noop, const_cast(weights_grad).CreateMKLDNNData(weights_md)}; + return {OutDataOp::Noop, const_cast(weights_grad).CreateDNNLData(weights_md)}; } - return CreateMKLDNNWeightGrad(weights_grad, weights_md, req); + return CreateDNNLWeightGrad(weights_grad, weights_md, req); } -inline mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req, - const NDArray* const bias) const { - return bias ? CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req) - : mkldnn_output_t(OutDataOp::Noop, nullptr); +inline dnnl_output_t DNNLDeconvBwd::BiasGradMem(const OpReqType req, + const NDArray* const bias) const { + return bias ? CreateDNNLMem(*bias, bwd_weights_pd->diff_bias_desc(), req) + : dnnl_output_t(OutDataOp::Noop, nullptr); } // Utility class for creating operation descriptors of deconvolution primitives @@ -349,21 +347,21 @@ class DeconvDescCreator { deconv_bwd_weights_t::desc CreateBwdWeightsDesc() const; private: - mkldnn::memory::desc data_md; - mkldnn::memory::desc weights_md; - mkldnn::memory::desc bias_md; - mkldnn::memory::desc out_md; - - mkldnn::memory::dims strides; - mkldnn::memory::dims padding; - mkldnn::memory::dims dilates; + dnnl::memory::desc data_md; + dnnl::memory::desc weights_md; + dnnl::memory::desc bias_md; + dnnl::memory::desc out_md; + + dnnl::memory::dims strides; + dnnl::memory::dims padding; + dnnl::memory::dims dilates; }; inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const size_t weights_size, const size_t out_size) const { - // MKLDNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKLDNN operators + // DNNL introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. 
Currently, DNNL operators // still reuse memory from memory planning, so here we need to accept only a // kernel that has the expected memory size requirements (which is suboptimal) return (data_size == GetMemDescSize(data_md) && weights_size == GetMemDescSize(weights_md) && @@ -371,8 +369,8 @@ inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, } inline deconv_fwd_t::desc DeconvDescCreator::CreateFwdDesc() const { - return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::deconvolution_direct, + return deconv_fwd_t::desc(dnnl::prop_kind::forward_training, + dnnl::algorithm::deconvolution_direct, data_md, weights_md, bias_md, @@ -384,7 +382,7 @@ inline deconv_fwd_t::desc DeconvDescCreator::CreateFwdDesc() const { } inline deconv_bwd_data_t::desc DeconvDescCreator::CreateBwdDataDesc() const { - return deconv_bwd_data_t::desc(mkldnn::algorithm::deconvolution_direct, + return deconv_bwd_data_t::desc(dnnl::algorithm::deconvolution_direct, data_md, weights_md, out_md, @@ -395,7 +393,7 @@ inline deconv_bwd_data_t::desc DeconvDescCreator::CreateBwdDataDesc() const { } inline deconv_bwd_weights_t::desc DeconvDescCreator::CreateBwdWeightsDesc() const { - return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct, + return deconv_bwd_weights_t::desc(dnnl::algorithm::deconvolution_direct, data_md, weights_md, bias_md, @@ -409,4 +407,4 @@ inline deconv_bwd_weights_t::desc DeconvDescCreator::CreateBwdWeightsDesc() cons } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H__ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_DECONVOLUTION_INL_H__ diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/dnnl/dnnl_deconvolution.cc similarity index 69% rename from src/operator/nn/mkldnn/mkldnn_deconvolution.cc rename to src/operator/nn/dnnl/dnnl_deconvolution.cc index 7621a510a0fa..f4766a12c7f3 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/dnnl/dnnl_deconvolution.cc @@ -18,40 +18,39 @@ */ /*! 
- * \file mkldnn_deconvolution.cc + * \file dnnl_deconvolution.cc */ #if MXNET_USE_ONEDNN == 1 #include "../deconvolution-inl.h" -#include "./mkldnn_deconvolution-inl.h" +#include "./dnnl_deconvolution-inl.h" namespace mxnet { namespace op { -bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input) { +bool SupportDNNLDeconv(const DeconvolutionParam& params, const NDArray& input) { return params.kernel.ndim() >= 1 && params.kernel.ndim() <= 3 && input.shape().ndim() == (params.kernel.ndim() + 2) && (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16); } -void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void DNNLDeconvolutionForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const auto& param = nnvm::get(attrs.parsed); - const auto tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs); - const auto& fwd = MKLDNNDeconvFwd::GetCached(param, tensors); + const auto tensors = DNNLDeconvFwd::Tensors(param.no_bias, inputs, outputs); + const auto& fwd = DNNLDeconvFwd::GetCached(param, tensors); fwd.ControlWeightsFormat(param.num_group, ctx.is_train, tensors.weights); fwd.Execute(param.num_group, req[deconv::kOut], tensors); } -MKLDNNDeconvFwd& MKLDNNDeconvFwd::GetCached(const DeconvolutionParam& param, - const Tensors& tensors) { - using deconv_fwd_map = std::unordered_map; +DNNLDeconvFwd& DNNLDeconvFwd::GetCached(const DeconvolutionParam& param, const Tensors& tensors) { + using deconv_fwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL static thread_local deconv_fwd_map fwds; #else @@ -67,15 +66,14 @@ MKLDNNDeconvFwd& MKLDNNDeconvFwd::GetCached(const DeconvolutionParam& param, auto it = fwds.find(key); if (it == fwds.end()) { - const MKLDNNDeconvFwd fwd(param, tensors); + const DNNLDeconvFwd fwd(param, tensors); it = AddToCache(&fwds, key, fwd); } return it->second; } -std::shared_ptr MKLDNNDeconvFwd::CreatePrimitiveDesc( - const DeconvolutionParam& param, - const Tensors& tensors) { +std::shared_ptr DNNLDeconvFwd::CreatePrimitiveDesc(const DeconvolutionParam& param, + const Tensors& tensors) { DeconvDescCreator ddc(param, tensors.data, tensors.weights, tensors.bias, tensors.out); const auto& engine = CpuEngine::Get()->get_engine(); const auto pd = std::make_shared(ddc.CreateFwdDesc(), engine); @@ -93,13 +91,13 @@ std::shared_ptr MKLDNNDeconvFwd::CreatePrimitiveDesc( return pd; } -void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, - const bool is_train, - const NDArray& weights) const { +void DNNLDeconvFwd::ControlWeightsFormat(const uint32_t num_group, + const bool is_train, + const NDArray& weights) const { if (is_train) { - // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it + // TODO(zhengda) kvstore doesn't handle DNNL correctly. Let's reorder it // to the default format for now. - if (weights.IsMKLDNNData()) { + if (weights.IsDNNLData()) { // This asks the engine to change the layout of the weights array after it's used. weights.Reorder2DefaultAsync(); } @@ -109,17 +107,17 @@ void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, if (weights.IsDefaultData()) { // We also need to modify the layout on the original weights array. // The data conversion happens after the weights array is used. 
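
(Illustrative aside, not part of this patch.) The asynchronous reorder requested on the next line boils down to the plain oneDNN pattern sketched here with a hypothetical helper: compare the current weights layout with the layout the primitive prefers and convert only when they differ, then keep the converted copy around for subsequent inference calls.

    // Illustrative only: reorder user weights into the layout a primitive prefers.
    #include "oneapi/dnnl/dnnl.hpp"

    dnnl::memory MaybeReorder(dnnl::memory user_weights,  // cheap handle copy
                              const dnnl::memory::desc& wanted_md,
                              const dnnl::engine& eng,
                              dnnl::stream& strm) {
      if (user_weights.get_desc() == wanted_md)
        return user_weights;                     // already in the preferred layout
      dnnl::memory converted(wanted_md, eng);
      dnnl::reorder(user_weights, converted).execute(strm, user_weights, converted);
      strm.wait();
      return converted;                          // worth caching for later iterations
    }
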
- weights.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); + weights.DNNLDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } else { - CHECK(weights.GetMKLDNNData()->get_desc() == + CHECK(weights.GetDNNLData()->get_desc() == IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } } } -void MKLDNNDeconvFwd::Execute(const uint32_t num_group, - const OpReqType req, - const Tensors& tensors) const { +void DNNLDeconvFwd::Execute(const uint32_t num_group, + const OpReqType req, + const Tensors& tensors) const { // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. // For that, we would pass input tensor in place of output and output tensor in place of input // (for appropriate convolution primitives: deconvolution forward = convolution backward data, @@ -129,56 +127,56 @@ void MKLDNNDeconvFwd::Execute(const uint32_t num_group, // primitive_out_channels = deconv_in_channels, primitive_in_channels = deconv_out_channels, // so it becomes (deconv_in_channels, deconv_out_channels, h, w) and MXNet provides such tensor. // - // MKLDNN deconvolution primitive also (as convolution) expects weights tensor with the shape of + // DNNL deconvolution primitive also (as convolution) expects weights tensor with the shape of // (primitive_out_channels, primitive_in_channels, h, w), but this time we don't swap input and // output tensors, so: // primitive_out_channels = deconv_out_channels, primitive_in_channels = deconv_in_channels, // thus the current weights tensor won't fit (when deconv_out_channels != deconv_in_channels). - // However, underneath deconvolution MKLDNN also uses convolution, so even though it expects the + // However, underneath deconvolution DNNL also uses convolution, so even though it expects the // weights tensor with the logical order of oihw, it wants its physical representation to // match the order of iohw, which is the same as current weights tensor. // // So here we swap logical order of input and output dimensions for weights tensor just for - // MKLDNN operations. - IOLogicalSwapMKLDNNMem(tensors.weights, num_group); + // DNNL operations. 
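
(Illustrative aside, not part of this patch; shapes below are arbitrary.) Once the weights are in the logical order oneDNN expects, the execution itself is the standard pattern of assembling a {DNNL_ARG_*, memory} argument map and handing it to the primitive, which the operator does next via DNNLStream::RegisterPrimArgs and Submit. A standalone equivalent against the raw oneDNN API:

    // Illustrative only: a minimal deconvolution forward run with an argument map.
    #include "oneapi/dnnl/dnnl.hpp"

    int main() {
      using tag = dnnl::memory::format_tag;
      using dt  = dnnl::memory::data_type;
      dnnl::engine eng(dnnl::engine::kind::cpu, 0);
      dnnl::stream strm(eng);

      // src: 1x4x8x8, weights: (oc=8, ic=4, 3, 3), dst: 1x8x10x10 (stride 1, no padding).
      dnnl::memory::desc src_md({1, 4, 8, 8}, dt::f32, tag::nchw);
      dnnl::memory::desc wei_md({8, 4, 3, 3}, dt::f32, tag::oihw);
      dnnl::memory::desc dst_md({1, 8, 10, 10}, dt::f32, tag::nchw);

      dnnl::deconvolution_forward::desc desc(dnnl::prop_kind::forward_inference,
                                             dnnl::algorithm::deconvolution_direct,
                                             src_md, wei_md, dst_md,
                                             {1, 1}, {0, 0}, {0, 0});
      dnnl::deconvolution_forward::primitive_desc pd(desc, eng);

      dnnl::memory src(src_md, eng), wei(wei_md, eng), dst(dst_md, eng);
      dnnl::deconvolution_forward(pd).execute(
          strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, wei}, {DNNL_ARG_DST, dst}});
      strm.wait();
      return 0;
    }
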
+ IOLogicalSwapDNNLMem(tensors.weights, num_group); { - mkldnn_args_map_t net_args; + dnnl_args_map_t net_args; const auto& out_mem = OutMem(req, tensors.out); - net_args.insert({MKLDNN_ARG_SRC, *DataMem(tensors.data)}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, tensors.weights)}); - net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); + net_args.insert({DNNL_ARG_SRC, *DataMem(tensors.data)}); + net_args.insert({DNNL_ARG_WEIGHTS, *WeightsMem(num_group, tensors.weights)}); + net_args.insert({DNNL_ARG_DST, *out_mem.second}); if (tensors.bias) { - net_args.insert({MKLDNN_ARG_BIAS, *BiasMem(*tensors.bias)}); + net_args.insert({DNNL_ARG_BIAS, *BiasMem(*tensors.bias)}); } // CommitOutput should run after RegisterPrimArgs for memory dependency - MKLDNNStream::Get()->RegisterPrimArgs(*fwd, net_args); + DNNLStream::Get()->RegisterPrimArgs(*fwd, net_args); CommitOutput(tensors.out, out_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } - IOLogicalSwapMKLDNNMem(tensors.weights, num_group); // swap back from oihw to iohw + IOLogicalSwapDNNLMem(tensors.weights, num_group); // swap back from oihw to iohw } -void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void DNNLDeconvolutionBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_NE(req[deconv::kWeight], kWriteInplace) << "Cannot write weights inplace"; TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const auto& param = nnvm::get(attrs.parsed); - const auto read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); - const auto write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); - MKLDNNDeconvBwd& bwd = MKLDNNDeconvBwd::GetCached(param, read_tensors); + const auto read_tensors = DNNLDeconvBwd::ReadTensors(param.no_bias, inputs); + const auto write_tensors = DNNLDeconvBwd::WriteTensors(param.no_bias, outputs); + DNNLDeconvBwd& bwd = DNNLDeconvBwd::GetCached(param, read_tensors); bwd.Execute(param.num_group, req, read_tensors, write_tensors); } -MKLDNNDeconvBwd& MKLDNNDeconvBwd::GetCached(const DeconvolutionParam& param, - const ReadTensors& read_tensors) { - using deconv_bwd_map = std::unordered_map; +DNNLDeconvBwd& DNNLDeconvBwd::GetCached(const DeconvolutionParam& param, + const ReadTensors& read_tensors) { + using deconv_bwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL static thread_local deconv_bwd_map bwds; #else @@ -194,13 +192,13 @@ MKLDNNDeconvBwd& MKLDNNDeconvBwd::GetCached(const DeconvolutionParam& param, auto it = bwds.find(key); if (it == bwds.end()) { - const MKLDNNDeconvBwd bwd(param, read_tensors); + const DNNLDeconvBwd bwd(param, read_tensors); it = AddToCache(&bwds, key, bwd); } return it->second; } -std::shared_ptr MKLDNNDeconvBwd::CreateDataPrimitiveDesc( +std::shared_ptr DNNLDeconvBwd::CreateDataPrimitiveDesc( const DeconvolutionParam& param, const ReadTensors& read_tensors, const deconv_fwd_pd_t& fwd_pd) { @@ -222,7 +220,7 @@ std::shared_ptr MKLDNNDeconvBwd::CreateDataPrimitiveDesc( return pd; } -std::shared_ptr MKLDNNDeconvBwd::CreateWeightsPrimitiveDesc( +std::shared_ptr DNNLDeconvBwd::CreateWeightsPrimitiveDesc( const DeconvolutionParam& param, const ReadTensors& read_tensors, const deconv_fwd_pd_t& fwd_pd) { @@ -245,64 +243,64 @@ std::shared_ptr MKLDNNDeconvBwd::CreateWeightsPrimitive return pd; } -void 
MKLDNNDeconvBwd::Execute(const uint32_t num_group, - const std::vector& req, - const ReadTensors& read_tensors, - const WriteTensors& write_tensors) const { - // swaps are explained in MKLDNNDeconvFwd::Execute +void DNNLDeconvBwd::Execute(const uint32_t num_group, + const std::vector& req, + const ReadTensors& read_tensors, + const WriteTensors& write_tensors) const { + // swaps are explained in DNNLDeconvFwd::Execute IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); { auto* const out_grad_mem = ScheduleBwdData(num_group, req[deconv::kData], read_tensors, write_tensors); ScheduleBwdWeights(num_group, req, read_tensors, write_tensors, out_grad_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); } -const mkldnn::memory* MKLDNNDeconvBwd::ScheduleBwdData(const uint32_t num_group, - const OpReqType req, - const ReadTensors& read_tensors, - const WriteTensors& write_tensors) const { +const dnnl::memory* DNNLDeconvBwd::ScheduleBwdData(const uint32_t num_group, + const OpReqType req, + const ReadTensors& read_tensors, + const WriteTensors& write_tensors) const { if (req) { - mkldnn_args_map_t net_args; + dnnl_args_map_t net_args; auto* const out_grad_mem = OutGradMem(read_tensors.out_grad); const auto& data_grad_mem = DataGradMem(req, write_tensors.data_grad); - net_args.insert({MKLDNN_ARG_DIFF_DST, *out_grad_mem}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, read_tensors.weights)}); - net_args.insert({MKLDNN_ARG_DIFF_SRC, *data_grad_mem.second}); + net_args.insert({DNNL_ARG_DIFF_DST, *out_grad_mem}); + net_args.insert({DNNL_ARG_WEIGHTS, *WeightsMem(num_group, read_tensors.weights)}); + net_args.insert({DNNL_ARG_DIFF_SRC, *data_grad_mem.second}); // CommitOutput should run after RegisterPrimArgs for memory dependency - MKLDNNStream::Get()->RegisterPrimArgs(*bwd_data, net_args); + DNNLStream::Get()->RegisterPrimArgs(*bwd_data, net_args); CommitOutput(write_tensors.data_grad, data_grad_mem); return out_grad_mem; } return nullptr; } -void MKLDNNDeconvBwd::ScheduleBwdWeights(const uint32_t num_group, - const std::vector& req, - const ReadTensors& read_tensors, - const WriteTensors& write_tensors, - const mkldnn::memory* const out_grad_mem) const { +void DNNLDeconvBwd::ScheduleBwdWeights(const uint32_t num_group, + const std::vector& req, + const ReadTensors& read_tensors, + const WriteTensors& write_tensors, + const dnnl::memory* const out_grad_mem) const { OpReqType weight_req = req[deconv::kWeight]; OpReqType bias_req = req.size() > deconv::kBias ? 
req[deconv::kBias] : OpReqType::kNullOp; if (weight_req || bias_req) { - mkldnn_args_map_t net_args; + dnnl_args_map_t net_args; const auto& weights_grad_mem = WeightsGradMem(num_group, weight_req, write_tensors.weights_grad); const auto& bias_grad_mem = BiasGradMem(bias_req, write_tensors.bias_grad); - net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(read_tensors.out_grad, out_grad_mem)}); - net_args.insert({MKLDNN_ARG_SRC, *DataMem(read_tensors.data)}); - net_args.insert({MKLDNN_ARG_DIFF_WEIGHTS, *weights_grad_mem.second}); + net_args.insert({DNNL_ARG_DIFF_DST, *OutGradMem(read_tensors.out_grad, out_grad_mem)}); + net_args.insert({DNNL_ARG_SRC, *DataMem(read_tensors.data)}); + net_args.insert({DNNL_ARG_DIFF_WEIGHTS, *weights_grad_mem.second}); if (bias_grad_mem.second) { - net_args.insert({MKLDNN_ARG_DIFF_BIAS, *bias_grad_mem.second}); + net_args.insert({DNNL_ARG_DIFF_BIAS, *bias_grad_mem.second}); } // CommitOutput should run after RegisterPrimArgs for memory dependency - MKLDNNStream::Get()->RegisterPrimArgs(*bwd_weights, net_args); + DNNLStream::Get()->RegisterPrimArgs(*bwd_weights, net_args); CommitOutput(write_tensors.weights_grad, weights_grad_mem); if (bias_grad_mem.second) { CommitOutput(*write_tensors.bias_grad, bias_grad_mem); @@ -317,7 +315,7 @@ DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam& param, const NDArray& out) : data_md(GetMemDesc(data)), weights_md(GetDeconvWeightsDesc(weights, param.num_group)), - bias_md(bias ? GetMemDesc(*bias) : mkldnn::memory::desc()), + bias_md(bias ? GetMemDesc(*bias) : dnnl::memory::desc()), out_md(GetMemDesc(out)), strides(param.stride.ndim()), padding(param.pad.ndim()), diff --git a/src/operator/nn/dnnl/dnnl_fully_connected-inl.h b/src/operator/nn/dnnl/dnnl_fully_connected-inl.h new file mode 100644 index 000000000000..980b931851f3 --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_fully_connected-inl.h @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file dnnl_fully_connected-inl.h + * \brief Common functions used by DNNL (Quantized) FullyConnected operator + * \author Ciyong Chen + */ + +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_FULLY_CONNECTED_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_FULLY_CONNECTED_INL_H_ + +#if MXNET_USE_ONEDNN == 1 + +#include +#include + +#include "../fully_connected-inl.h" +#include "./dnnl_base-inl.h" + +namespace mxnet { +namespace op { + +struct DNNLFCParam : public dmlc::Parameter { + bool quantized; + bool enable_float_output; + bool with_eltwise; + dmlc::optional min_calib_range; // min float value calculated from calibration dataset + dmlc::optional max_calib_range; // max float value calculated from calibration dataset + dmlc::optional channel_wise_quantize; + + DMLC_DECLARE_PARAMETER(DNNLFCParam) { + DMLC_DECLARE_FIELD(quantized).set_default(false).describe( + "Whether it's a quantized FullyConnected operator"); + DMLC_DECLARE_FIELD(enable_float_output) + .set_default(false) + .describe("Whether to enable float32 output"); + DMLC_DECLARE_FIELD(with_eltwise) + .set_default(false) + .describe("Whether there's a post with_eltwise after FullyConnected operator"); + DMLC_DECLARE_FIELD(min_calib_range) + .set_default(dmlc::optional()) + .describe( + "The minimum scalar value in the form of float32 obtained " + "through calibration. If present, it will be used to by " + "quantized fullyconnected op to calculate primitive scale"); + DMLC_DECLARE_FIELD(max_calib_range) + .set_default(dmlc::optional()) + .describe( + "The maximum scalar value in the form of float32 obtained " + "through calibration. If present, it will be used to by " + "quantized fullyconnected op to calculate primitive scale"); + DMLC_DECLARE_FIELD(channel_wise_quantize) + .set_default(dmlc::optional()) + .describe("Whether support channel-wise-quantize for weight."); + } +}; + +struct DNNLFCFullParam { + FullyConnectedParam default_param; + DNNLFCParam dnnl_param; + DNNLPostEltwiseParam eltwise_param; + std::vector output_scales = {0.0f}; +}; + +dnnl::inner_product_forward::primitive_desc GetFCFwdImpl(const DNNLFCFullParam& full_param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const dnnl::memory::desc& out_md); + +class DNNLFullyConnectedForward { + public: + dnnl::inner_product_forward::primitive_desc fwd_pd; + + DNNLFullyConnectedForward(const DNNLFCFullParam& full_param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const dnnl::memory::desc& out_md) + : fwd_pd(GetFCFwdImpl(full_param, is_train, data, weight, bias, out_md)) { + fwd_ = std::make_shared(fwd_pd); + } + + const dnnl::inner_product_forward& GetFwd() const { + return *fwd_; + } + + private: + std::shared_ptr fwd_; +}; + +typedef ParamOpSign DNNLFullyconSignature; + +DNNLFullyConnectedForward& GetFCFwd(const FullyConnectedParam& param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const dnnl::memory::desc& out_md); + +void DNNLFCFlattenData(const FullyConnectedParam& param, + const NDArray& out_data, + NDArray* in_data, + dnnl::memory::desc* out_md); + +void DNNLFCForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); + +void DNNLFCForwardFullFeature(const DNNLFCFullParam& param, + const OpContext& ctx, + DNNLFullyConnectedForward* fwd, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); + +} // 
namespace op +} // namespace mxnet + +#endif // MXNET_USE_ONEDNN == 1 +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_FULLY_CONNECTED_INL_H_ diff --git a/src/operator/nn/dnnl/dnnl_fully_connected.cc b/src/operator/nn/dnnl/dnnl_fully_connected.cc new file mode 100644 index 000000000000..5bb3c9d79ec0 --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_fully_connected.cc @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dnnl_fully_connected.cc + * \brief DNNL FullyConnected operator + * \author Da Zheng, Ciyong Chen + */ + +#if MXNET_USE_ONEDNN == 1 +#include "dnnl_fully_connected-inl.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(DNNLFCParam); + +dnnl::inner_product_forward::primitive_desc GetFCFwdImpl(const DNNLFCFullParam& full_param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const dnnl::memory::desc& out_md) { + auto engine = CpuEngine::Get()->get_engine(); + auto data_md = GetMemDesc(data); + auto weight_md = full_param.dnnl_param.quantized + ? GetFCWeightDesc(weight, data.shape()[0], mshadow::kInt8) + : GetFCWeightDesc(weight, data.shape()[0]); + auto propagation = + is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring; + + dnnl::primitive_attr attr; + dnnl::post_ops ops; + if (full_param.dnnl_param.with_eltwise) { + ops.append_eltwise(full_param.eltwise_param.scale, + full_param.eltwise_param.alg, + full_param.eltwise_param.alpha, + full_param.eltwise_param.beta); + } + attr.set_post_ops(ops); + + if (full_param.dnnl_param.quantized && full_param.output_scales.size()) { + int mask = (full_param.output_scales.size() == 1) ? 0 : (1 << 1); + attr.set_output_scales(mask, full_param.output_scales); + } + + auto GetFCFwdPd = [&full_param, &attr, &engine](const dnnl::inner_product_forward::desc& desc) { + try { + return dnnl::inner_product_forward::primitive_desc(desc, attr, engine); + } catch (dnnl::error& e) { + if (e.status == dnnl_unimplemented && full_param.dnnl_param.quantized) { + LOG(ERROR) << "AVX512-BW support or DNNL v0.18 is required for INT8 fully_connected."; + } else { + LOG(ERROR) << e.message; + } + throw; + } + }; + + if (bias) { + if ((*bias).shape().ndim() != 1) + LOG(FATAL) << "Unexpected shape for bias " << (*bias).shape(); + auto bias_md = + full_param.dnnl_param.quantized ? 
GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias); + dnnl::inner_product_forward::desc desc(propagation, data_md, weight_md, bias_md, out_md); + return GetFCFwdPd(desc); + } else { + dnnl::inner_product_forward::desc desc(propagation, data_md, weight_md, out_md); + return GetFCFwdPd(desc); + } +} + +inline static dnnl::inner_product_backward_data::primitive_desc GetFCBwdData( + const NDArray& data, + const NDArray& weight, + const NDArray& output, + dnnl::inner_product_forward::primitive_desc fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetFCWeightDesc(weight, data.shape()[0]); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + dnnl::inner_product_backward_data::desc desc(data_md, weight_md, out_md); + return dnnl::inner_product_backward_data::primitive_desc(desc, engine, fwd_pd); +} + +inline static dnnl::inner_product_backward_weights::primitive_desc GetFCBwdWeights( + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const NDArray& output, + dnnl::inner_product_forward::primitive_desc fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetFCWeightDesc(weight, data.shape()[0]); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + if (bias) { + auto bias_md = GetMemDesc(*bias); + dnnl::inner_product_backward_weights::desc desc(data_md, weight_md, bias_md, out_md); + return dnnl::inner_product_backward_weights::primitive_desc(desc, engine, fwd_pd); + } else { + dnnl::inner_product_backward_weights::desc desc(data_md, weight_md, out_md); + return dnnl::inner_product_backward_weights::primitive_desc(desc, engine, fwd_pd); + } +} + +DNNLFullyConnectedForward& GetFCFwd(const FullyConnectedParam& param, + const bool is_train, + const NDArray& data, + const NDArray& weight, + const NDArray* bias, + const dnnl::memory::desc& out_md) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map + fcFwds; +#else + static MX_THREAD_LOCAL + std::unordered_map + fcFwds; +#endif + DNNLFullyconSignature key(param); + key.AddSign(is_train); + key.AddSign(data); + key.AddSign(weight); + if (bias) + key.AddSign(*bias); + + auto it = fcFwds.find(key); + if (it == fcFwds.end()) { + DNNLFCFullParam full_param; + full_param.default_param = param; + full_param.dnnl_param.Init(std::unordered_map()); + DNNLFullyConnectedForward fcFwd(full_param, is_train, data, weight, bias, out_md); + it = AddToCache(&fcFwds, key, fcFwd); + } + return it->second; +} + +void DNNLFCFlattenData(const FullyConnectedParam& param, + const NDArray& out_data, + NDArray* in_data, + dnnl::memory::desc* out_md) { + const mxnet::TShape ishape = in_data->shape(); + const mxnet::TShape oshape = out_data.shape(); + if (ishape.ndim() != 2) { + if (!param.flatten) { + *in_data = in_data->DNNLDataReshape( + Shape2(ishape.ProdShape(0, ishape.ndim() - 1), ishape[ishape.ndim() - 1])); + dnnl::memory::dims out_dims{static_cast(oshape.ProdShape(0, oshape.ndim() - 1)), + static_cast(oshape[ishape.ndim() - 1])}; + *out_md = dnnl::memory::desc( + out_dims, get_dnnl_type(out_data.dtype()), dnnl::memory::format_tag::any); + } else { + *in_data = in_data->DNNLDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); + dnnl::memory::dims out_dims{static_cast(oshape[0]), + static_cast(oshape.ProdShape(1, oshape.ndim()))}; + *out_md = dnnl::memory::desc( + out_dims, get_dnnl_type(out_data.dtype()), dnnl::memory::format_tag::any); + } + } +} + +void DNNLFCForwardFullFeature(const DNNLFCFullParam& full_param, + const 
OpContext& ctx, + DNNLFullyConnectedForward* fwd, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); + NDArray weight = in_data[fullc::kWeight]; + NDArray data = in_data[fullc::kData]; + + auto data_mem = data.GetDNNLDataReorder(fwd->fwd_pd.src_desc()); + const dnnl::memory* weight_mem; + if (ctx.is_train) { + if (weight.IsDNNLData()) { + weight.Reorder2DefaultAsync(); + } + weight_mem = GetWeights(weight, fwd->fwd_pd.weights_desc(), 1); + } else { + weight_mem = weight.GetDNNLData(); + if (weight_mem->get_desc() != fwd->fwd_pd.weights_desc()) { + weight.DNNLDataReorderAsync(fwd->fwd_pd.weights_desc()); + weight_mem = GetWeights(weight, fwd->fwd_pd.weights_desc(), 1); + } + } + auto out_mem = + CreateDNNLMem(out_data[fullc::kOut], fwd->fwd_pd.dst_desc(), req[fullc::kOut], &data); + + dnnl_args_map_t args = { + {DNNL_ARG_SRC, *data_mem}, + {DNNL_ARG_WEIGHTS, *weight_mem}, + {DNNL_ARG_DST, *out_mem.second}, + }; + if (!full_param.default_param.no_bias) { + auto bias_mem = in_data[fullc::kBias].GetDNNLDataReorder(fwd->fwd_pd.bias_desc()); + args[DNNL_ARG_BIAS] = *bias_mem; + } + DNNLStream::Get()->RegisterPrimArgs(fwd->GetFwd(), args); + CommitOutput(out_data[fullc::kOut], out_mem); + DNNLStream::Get()->Submit(); +} + +void DNNLFCForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + DNNLFCFullParam full_param; + full_param.default_param = nnvm::get(attrs.parsed); + full_param.dnnl_param.Init(std::unordered_map()); + + NDArray data = in_data[fullc::kData]; + dnnl::memory::desc out_md = GetMemDesc(out_data[fullc::kOut]); + DNNLFCFlattenData(full_param.default_param, out_data[fullc::kOut], &data, &out_md); + auto& fwd = GetFCFwd(full_param.default_param, + ctx.is_train, + data, + in_data[fullc::kWeight], + full_param.default_param.no_bias ? 
nullptr : &in_data[fullc::kBias], + out_md); + std::vector new_inputs; + if (full_param.default_param.no_bias) + new_inputs = {data, in_data[fullc::kWeight]}; + else + new_inputs = {data, in_data[fullc::kWeight], in_data[fullc::kBias]}; + DNNLFCForwardFullFeature(full_param, ctx, &fwd, new_inputs, req, out_data); +} + +void DNNLFCBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); + const std::vector& in_grad = outputs; + DNNLFCFullParam full_param; + full_param.default_param = nnvm::get(attrs.parsed); + full_param.dnnl_param.Init(std::unordered_map()); + const FullyConnectedParam& param = full_param.default_param; + const mxnet::TShape& ishape = inputs[fullc::kData + 1].shape(); + const mxnet::TShape& oshape = inputs[fullc::kOut].shape(); + + NDArray weight = inputs[fullc::kWeight + 1]; + NDArray data = inputs[fullc::kData + 1]; + if (data.shape().ndim() != 2 && !param.flatten) + data = data.DNNLDataReshape( + Shape2(ishape.ProdShape(0, ishape.ndim() - 1), ishape[ishape.ndim() - 1])); + else if (data.shape().ndim() != 2) + data = data.DNNLDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); + NDArray out_grad = inputs[fullc::kOut]; + if (out_grad.shape().ndim() != 2 && !param.flatten) + out_grad = out_grad.DNNLDataReshape( + Shape2(oshape.ProdShape(0, oshape.ndim() - 1), oshape[oshape.ndim() - 1])); + else if (out_grad.shape().ndim() != 2) + out_grad = out_grad.DNNLDataReshape(Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim()))); + + dnnl::inner_product_forward::primitive_desc fwd_pd = + GetFCFwdImpl(full_param, + ctx.is_train, + data, + weight, + param.no_bias ? nullptr : &in_grad[fullc::kBias], + GetMemDesc(out_grad)); + + CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; + if (req[fullc::kWeight]) { + dnnl::inner_product_backward_weights::primitive_desc ipBwdWeights_pd = GetFCBwdWeights( + data, weight, param.no_bias ? 
nullptr : &in_grad[fullc::kBias], out_grad, fwd_pd); + auto out_grad_mem = out_grad.GetDNNLDataReorder(ipBwdWeights_pd.diff_dst_desc()); + auto data_mem = data.GetDNNLDataReorder(ipBwdWeights_pd.src_desc()); + auto in_grad_weight = CreateDNNLWeightGrad( + in_grad[fullc::kWeight], ipBwdWeights_pd.diff_weights_desc(), req[fullc::kWeight]); + dnnl_args_map_t args = { + {DNNL_ARG_DIFF_DST, *out_grad_mem}, + {DNNL_ARG_SRC, *data_mem}, + {DNNL_ARG_DIFF_WEIGHTS, *in_grad_weight.second}, + }; + + dnnl_output_t in_grad_bias; + if (!param.no_bias) { + in_grad_bias = + CreateDNNLMem(in_grad[fullc::kBias], ipBwdWeights_pd.diff_bias_desc(), req[fullc::kBias]); + args[DNNL_ARG_DIFF_BIAS] = *in_grad_bias.second; + } + DNNLStream::Get()->RegisterPrimArgs(dnnl::inner_product_backward_weights(ipBwdWeights_pd), + args); + CommitOutput(in_grad[fullc::kWeight], in_grad_weight); + if (!param.no_bias) { + CommitOutput(in_grad[fullc::kBias], in_grad_bias); + } + } + if (req[fullc::kData]) { + dnnl::inner_product_backward_data::primitive_desc ipBwdData_pd = + GetFCBwdData(data, weight, out_grad, fwd_pd); + auto out_grad_mem = out_grad.GetDNNLDataReorder(ipBwdData_pd.diff_dst_desc()); + auto weight_mem = weight.GetDNNLDataReorder(ipBwdData_pd.weights_desc()); + auto in_grad_mem = + CreateDNNLMem(in_grad[fullc::kData], ipBwdData_pd.diff_src_desc(), req[fullc::kData]); + dnnl_args_map_t args = {{DNNL_ARG_DIFF_DST, *out_grad_mem}, + {DNNL_ARG_WEIGHTS, *weight_mem}, + {DNNL_ARG_DIFF_SRC, *in_grad_mem.second}}; + + DNNLStream::Get()->RegisterPrimArgs(dnnl::inner_product_backward_data(ipBwdData_pd), args); + CommitOutput(in_grad[fullc::kData], in_grad_mem); + } + DNNLStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_ONEDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_layer_norm-inl.h b/src/operator/nn/dnnl/dnnl_layer_norm-inl.h similarity index 57% rename from src/operator/nn/mkldnn/mkldnn_layer_norm-inl.h rename to src/operator/nn/dnnl/dnnl_layer_norm-inl.h index a14673b140db..ccd3e9cb00ea 100644 --- a/src/operator/nn/mkldnn/mkldnn_layer_norm-inl.h +++ b/src/operator/nn/dnnl/dnnl_layer_norm-inl.h @@ -18,11 +18,11 @@ */ /*! 
- * \file mkldnn_layer_norm-inl.h + * \file dnnl_layer_norm-inl.h * \author: Bartosz Kuncer, bartosz.kuncer@intel.com */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LAYER_NORM_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LAYER_NORM_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_LAYER_NORM_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_LAYER_NORM_INL_H_ #if MXNET_USE_ONEDNN == 1 @@ -30,31 +30,30 @@ #include #include "../layer_norm-inl.h" -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" namespace mxnet { namespace op { -using layernorm_fwd_t = mkldnn::layer_normalization_forward; -using layernorm_fwd_pd_t = mkldnn::layer_normalization_forward::primitive_desc; +using layernorm_fwd_t = dnnl::layer_normalization_forward; +using layernorm_fwd_pd_t = dnnl::layer_normalization_forward::primitive_desc; -using layernorm_bwd_t = mkldnn::layer_normalization_backward; -using layernorm_bwd_pd_t = mkldnn::layer_normalization_backward::primitive_desc; +using layernorm_bwd_t = dnnl::layer_normalization_backward; +using layernorm_bwd_pd_t = dnnl::layer_normalization_backward::primitive_desc; typedef ParamOpSign LayerNormSignature; -class MKLDNNLayerNormFwd { +class DNNLLayerNormFwd { public: - static MKLDNNLayerNormFwd& GetCached(const LayerNormParam& param, - const OpContext& ctx, - const NDArray& data); + static DNNLLayerNormFwd& GetCached(const LayerNormParam& param, + const OpContext& ctx, + const NDArray& data); - MKLDNNLayerNormFwd(const LayerNormParam& param, const NDArray& data); + DNNLLayerNormFwd(const LayerNormParam& param, const NDArray& data); - static std::shared_ptr CreatePrimitiveDesc( - const LayerNormParam& param, - const mkldnn::memory::desc& src_md); + static std::shared_ptr CreatePrimitiveDesc(const LayerNormParam& param, + const dnnl::memory::desc& src_md); void Execute(const LayerNormParam& param, const OpContext& ctx, @@ -62,34 +61,34 @@ class MKLDNNLayerNormFwd { const OpReqType& req, const std::vector& outputs) const; - ~MKLDNNLayerNormFwd() {} + ~DNNLLayerNormFwd() {} private: std::shared_ptr fwd; std::shared_ptr fwd_pd; }; -class MKLDNNLayerNormBwd { +class DNNLLayerNormBwd { public: - static MKLDNNLayerNormBwd& GetCached(const LayerNormParam& param, - const std::vector& inputs); + static DNNLLayerNormBwd& GetCached(const LayerNormParam& param, + const std::vector& inputs); - MKLDNNLayerNormBwd(const LayerNormParam& param, - const std::vector& inputs, - const mkldnn::memory::desc& data_md, - const mkldnn::memory::desc& diff_md); + DNNLLayerNormBwd(const LayerNormParam& param, + const std::vector& inputs, + const dnnl::memory::desc& data_md, + const dnnl::memory::desc& diff_md); static std::shared_ptr CreatePrimitiveDesc( const LayerNormParam& param, - const mkldnn::memory::desc& data_md, - const mkldnn::memory::desc& diff_md, + const dnnl::memory::desc& data_md, + const dnnl::memory::desc& diff_md, const layernorm_fwd_pd_t& layernorm_fwd_pd); void Execute(const std::vector& inputs, const std::vector& outputs, const std::vector& req) const; - ~MKLDNNLayerNormBwd() {} + ~DNNLLayerNormBwd() {} private: std::shared_ptr bwd; @@ -100,4 +99,4 @@ class MKLDNNLayerNormBwd { } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LAYER_NORM_INL_H__ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_LAYER_NORM_INL_H__ diff --git a/src/operator/nn/mkldnn/mkldnn_layer_norm.cc b/src/operator/nn/dnnl/dnnl_layer_norm.cc similarity index 52% rename from 
src/operator/nn/mkldnn/mkldnn_layer_norm.cc rename to src/operator/nn/dnnl/dnnl_layer_norm.cc index 2b63319b0fef..2e720d084bed 100644 --- a/src/operator/nn/mkldnn/mkldnn_layer_norm.cc +++ b/src/operator/nn/dnnl/dnnl_layer_norm.cc @@ -18,29 +18,29 @@ */ /*! - * \file mkldnn_layer_norm.cc + * \file dnnl_layer_norm.cc * \author: Bartosz Kuncer, bartosz.kuncer@intel.com */ #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn_layer_norm-inl.h" +#include "./dnnl_layer_norm-inl.h" namespace mxnet { namespace op { -bool SupportMKLDNNLayerNorm(const LayerNormParam& param, const std::vector& inputs) { +bool SupportDNNLLayerNorm(const LayerNormParam& param, const std::vector& inputs) { const mxnet::TShape& shape = inputs[layernorm::kData].shape(); // Native implementation (which can be found in function LayerNormCPU) is faster than oneDNN's one // for small tensors. Below is the heuristic based on measurements on clx machine deciding whether // the shape is better for oneDNN or native implementation. - auto ShapeBetterForMKLDNN = [](const mxnet::TShape& shape) { + auto ShapeBetterForDNNL = [](const mxnet::TShape& shape) { constexpr size_t shapeLimit = 1024; return shape.Size() / shape[0] >= shapeLimit && shape[0] >= shapeLimit; }; - return (ShapeBetterForMKLDNN(shape) && + return (ShapeBetterForDNNL(shape) && (GetRealAxis(param.axis, shape.ndim()) == shape.ndim() - 1) && (shape.ndim() >= 2) && (shape.ndim() <= 5) && (inputs[layernorm::kData].dtype() == mshadow::kFloat32 || @@ -49,20 +49,20 @@ bool SupportMKLDNNLayerNorm(const LayerNormParam& param, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void DNNLLayerNormForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const LayerNormParam& param = nnvm::get(attrs.parsed); - const auto& fwd = MKLDNNLayerNormFwd::GetCached(param, ctx, inputs[layernorm::kData]); + const auto& fwd = DNNLLayerNormFwd::GetCached(param, ctx, inputs[layernorm::kData]); fwd.Execute(param, ctx, inputs, req[layernorm::kOut], outputs); } -MKLDNNLayerNormFwd& MKLDNNLayerNormFwd::GetCached(const LayerNormParam& param, - const OpContext& ctx, - const NDArray& data) { - using layernorm_fwd_map = std::unordered_map; +DNNLLayerNormFwd& DNNLLayerNormFwd::GetCached(const LayerNormParam& param, + const OpContext& ctx, + const NDArray& data) { + using layernorm_fwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL static thread_local layernorm_fwd_map layer_norm_fwds; #else @@ -74,52 +74,52 @@ MKLDNNLayerNormFwd& MKLDNNLayerNormFwd::GetCached(const LayerNormParam& param, auto it = layer_norm_fwds.find(key); if (it == layer_norm_fwds.end()) { - MKLDNNLayerNormFwd fwd(param, data); + DNNLLayerNormFwd fwd(param, data); it = AddToCache(&layer_norm_fwds, key, fwd); } return it->second; } -MKLDNNLayerNormFwd::MKLDNNLayerNormFwd(const LayerNormParam& param, const NDArray& data) { - const mkldnn::memory::desc data_md = data.GetMKLDNNData()->get_desc(); - fwd_pd = CreatePrimitiveDesc(param, data_md); - fwd = std::make_shared(*fwd_pd); +DNNLLayerNormFwd::DNNLLayerNormFwd(const LayerNormParam& param, const NDArray& data) { + const dnnl::memory::desc data_md = data.GetDNNLData()->get_desc(); + fwd_pd = CreatePrimitiveDesc(param, data_md); + fwd = std::make_shared(*fwd_pd); } -std::shared_ptr MKLDNNLayerNormFwd::CreatePrimitiveDesc( +std::shared_ptr DNNLLayerNormFwd::CreatePrimitiveDesc( const LayerNormParam& param, - const mkldnn::memory::desc& src_md) { - 
layernorm_fwd_t::desc fwd_desc(mkldnn::prop_kind::forward_training, + const dnnl::memory::desc& src_md) { + layernorm_fwd_t::desc fwd_desc(dnnl::prop_kind::forward_training, src_md, param.eps, dnnl::normalization_flags::use_scale_shift); - mkldnn::engine& engine = CpuEngine::Get()->get_engine(); + dnnl::engine& engine = CpuEngine::Get()->get_engine(); return std::make_shared(fwd_desc, engine); } -inline mkldnn::memory::desc GetMeanVarDesc(const mkldnn::memory::data_type& dtype, - const mxnet::TShape& _shape) { +inline dnnl::memory::desc GetMeanVarDesc(const dnnl::memory::data_type& dtype, + const mxnet::TShape& _shape) { const auto ndim = _shape.ndim(); - mkldnn::memory::dims shape(ndim, 1), strides(ndim, 1); + dnnl::memory::dims shape(ndim, 1), strides(ndim, 1); shape[0] = _shape[0]; for (int i = ndim - 1; i > 0; --i) { shape[i] = _shape[i]; strides[i - 1] = strides[i] * shape[i]; } - return mkldnn::memory::desc{shape, dtype, strides}; + return dnnl::memory::desc{shape, dtype, strides}; } -inline mkldnn::memory GetScaleShiftMem(const NDArray& gamma, const NDArray& beta) { +inline dnnl::memory GetScaleShiftMem(const NDArray& gamma, const NDArray& beta) { // OneDNN takes gamma and beta as one SCALE_SHIFT tensor when both scale and shift are used. In // mxnet scale is called gamma and shift is called beta. constexpr size_t gammaAndBeta = 2; CHECK_EQ(gamma.shape()[0], beta.shape()[0]); - const mkldnn::memory::desc scale_shift_md(mkldnn::memory::dims{gammaAndBeta, gamma.shape()[0]}, - get_mkldnn_type(gamma.dtype()), - mkldnn::memory::format_tag::nc); - auto scale_shift_mem = mkldnn::memory(scale_shift_md, CpuEngine::Get()->get_engine()); + const dnnl::memory::desc scale_shift_md(dnnl::memory::dims{gammaAndBeta, gamma.shape()[0]}, + get_dnnl_type(gamma.dtype()), + dnnl::memory::format_tag::nc); + auto scale_shift_mem = dnnl::memory(scale_shift_md, CpuEngine::Get()->get_engine()); char* ptr = reinterpret_cast(scale_shift_mem.get_data_handle()); const size_t bytes = scale_shift_md.get_size() / gammaAndBeta; memcpy(ptr, gamma.data().dptr_, bytes); @@ -127,62 +127,60 @@ inline mkldnn::memory GetScaleShiftMem(const NDArray& gamma, const NDArray& beta return scale_shift_mem; } -void MKLDNNLayerNormFwd::Execute(const LayerNormParam& param, - const OpContext& ctx, - const std::vector& inputs, - const OpReqType& req, - const std::vector& outputs) const { - auto mean_var_md = GetMeanVarDesc(get_mkldnn_type(outputs[layernorm::kMean].dtype()), +void DNNLLayerNormFwd::Execute(const LayerNormParam& param, + const OpContext& ctx, + const std::vector& inputs, + const OpReqType& req, + const std::vector& outputs) const { + auto mean_var_md = GetMeanVarDesc(get_dnnl_type(outputs[layernorm::kMean].dtype()), outputs[layernorm::kMean].shape()); - auto mean_mem = mkldnn_output_t( - OutDataOp::Noop, - const_cast(outputs[layernorm::kMean]).CreateMKLDNNData(mean_var_md)); - auto variance_mem = - mkldnn_output_t(OutDataOp::Noop, - const_cast(outputs[layernorm::kStd]).CreateMKLDNNData(mean_var_md)); + auto mean_mem = dnnl_output_t( + OutDataOp::Noop, const_cast(outputs[layernorm::kMean]).CreateDNNLData(mean_var_md)); + auto variance_mem = dnnl_output_t( + OutDataOp::Noop, const_cast(outputs[layernorm::kStd]).CreateDNNLData(mean_var_md)); - auto output_mem = CreateMKLDNNMem(outputs[layernorm::kOut], fwd_pd->dst_desc(), req); + auto output_mem = CreateDNNLMem(outputs[layernorm::kOut], fwd_pd->dst_desc(), req); auto scale_shift_mem = GetScaleShiftMem(inputs[layernorm::kGamma], inputs[layernorm::kBeta]); - mkldnn_args_map_t 
args = {{MKLDNN_ARG_SRC, *inputs[layernorm::kData].GetMKLDNNData()}, - {MKLDNN_ARG_DST, *output_mem.second}, - {MKLDNN_ARG_MEAN, *mean_mem.second}, - {MKLDNN_ARG_VARIANCE, *variance_mem.second}, - {MKLDNN_ARG_SCALE_SHIFT, scale_shift_mem}}; + dnnl_args_map_t args = {{DNNL_ARG_SRC, *inputs[layernorm::kData].GetDNNLData()}, + {DNNL_ARG_DST, *output_mem.second}, + {DNNL_ARG_MEAN, *mean_mem.second}, + {DNNL_ARG_VARIANCE, *variance_mem.second}, + {DNNL_ARG_SCALE_SHIFT, scale_shift_mem}}; - MKLDNNStream::Get()->RegisterPrimArgs(*fwd, args); + DNNLStream::Get()->RegisterPrimArgs(*fwd, args); CommitOutput(outputs[layernorm::kOut], output_mem); CommitOutput(outputs[layernorm::kMean], mean_mem); CommitOutput(outputs[layernorm::kStd], variance_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } -MKLDNNLayerNormBwd::MKLDNNLayerNormBwd(const LayerNormParam& param, - const std::vector& inputs, - const mkldnn::memory::desc& data_md, - const mkldnn::memory::desc& diff_md) - : fwd_pd(MKLDNNLayerNormFwd::CreatePrimitiveDesc(param, data_md)), +DNNLLayerNormBwd::DNNLLayerNormBwd(const LayerNormParam& param, + const std::vector& inputs, + const dnnl::memory::desc& data_md, + const dnnl::memory::desc& diff_md) + : fwd_pd(DNNLLayerNormFwd::CreatePrimitiveDesc(param, data_md)), bwd_pd(CreatePrimitiveDesc(param, data_md, diff_md, *fwd_pd)) { bwd = std::make_shared(*bwd_pd); } -std::shared_ptr MKLDNNLayerNormBwd::CreatePrimitiveDesc( +std::shared_ptr DNNLLayerNormBwd::CreatePrimitiveDesc( const LayerNormParam& param, - const mkldnn::memory::desc& data_md, - const mkldnn::memory::desc& diff_md, + const dnnl::memory::desc& data_md, + const dnnl::memory::desc& diff_md, const layernorm_fwd_pd_t& layernorm_fwd_pd) { layernorm_bwd_t::desc layernorm_bwd_desc(dnnl::prop_kind::backward, diff_md, data_md, param.eps, dnnl::normalization_flags::use_scale_shift); - mkldnn::engine& engine = CpuEngine::Get()->get_engine(); + dnnl::engine& engine = CpuEngine::Get()->get_engine(); return std::make_shared(layernorm_bwd_desc, engine, layernorm_fwd_pd); } -void MKLDNNLayerNormBwd::Execute(const std::vector& inputs, - const std::vector& outputs, - const std::vector& req) const { +void DNNLLayerNormBwd::Execute(const std::vector& inputs, + const std::vector& outputs, + const std::vector& req) const { auto scale_shift_mem = GetScaleShiftMem(inputs[layernorm::kBwdGamma], inputs[layernorm::kBwdBeta]); auto diff_weights_ndarray = NDArray(scale_shift_mem.get_desc()); @@ -197,21 +195,21 @@ void MKLDNNLayerNormBwd::Execute(const std::vector& inputs, outputs[layernorm::kBwdBetaGrad].data().dptr_, bytes); } - mkldnn_output_t diff_src_mem = CreateMKLDNNMem( + dnnl_output_t diff_src_mem = CreateDNNLMem( outputs[layernorm::kBwdDataGrad], bwd_pd->diff_src_desc(), req[layernorm::kBwdDataGrad]); - mkldnn_output_t diff_weights_mem = CreateMKLDNNMem( + dnnl_output_t diff_weights_mem = CreateDNNLMem( diff_weights_ndarray, bwd_pd->diff_weights_desc(), req[layernorm::kBwdGammaGrad]); - mkldnn_args_map_t args = {{MKLDNN_ARG_DIFF_DST, *inputs[layernorm::kBwdOutGrad].GetMKLDNNData()}, - {MKLDNN_ARG_SRC, *inputs[layernorm::kBwdData].GetMKLDNNData()}, - {MKLDNN_ARG_SCALE_SHIFT, scale_shift_mem}, - {MKLDNN_ARG_MEAN, *inputs[layernorm::kBwdMean].GetMKLDNNData()}, - {MKLDNN_ARG_VARIANCE, *inputs[layernorm::kBwdStd].GetMKLDNNData()}, - {MKLDNN_ARG_DIFF_SRC, *diff_src_mem.second}, - {MKLDNN_ARG_DIFF_SCALE_SHIFT, *diff_weights_mem.second}}; - MKLDNNStream::Get()->RegisterPrimArgs(*bwd, args); + dnnl_args_map_t args = {{DNNL_ARG_DIFF_DST, 
*inputs[layernorm::kBwdOutGrad].GetDNNLData()}, + {DNNL_ARG_SRC, *inputs[layernorm::kBwdData].GetDNNLData()}, + {DNNL_ARG_SCALE_SHIFT, scale_shift_mem}, + {DNNL_ARG_MEAN, *inputs[layernorm::kBwdMean].GetDNNLData()}, + {DNNL_ARG_VARIANCE, *inputs[layernorm::kBwdStd].GetDNNLData()}, + {DNNL_ARG_DIFF_SRC, *diff_src_mem.second}, + {DNNL_ARG_DIFF_SCALE_SHIFT, *diff_weights_mem.second}}; + DNNLStream::Get()->RegisterPrimArgs(*bwd, args); CommitOutput(outputs[layernorm::kBwdDataGrad], diff_src_mem); CommitOutput(diff_weights_ndarray, diff_weights_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); // Commit scale_shift diff memcpy(outputs[layernorm::kBwdGammaGrad].data().dptr_, diff_weights_ndarray.data().dptr_, bytes); memcpy(outputs[layernorm::kBwdBetaGrad].data().dptr_, @@ -219,9 +217,9 @@ void MKLDNNLayerNormBwd::Execute(const std::vector& inputs, bytes); } -MKLDNNLayerNormBwd& MKLDNNLayerNormBwd::GetCached(const LayerNormParam& param, - const std::vector& inputs) { - using layernorm_bwd_map = std::unordered_map; +DNNLLayerNormBwd& DNNLLayerNormBwd::GetCached(const LayerNormParam& param, + const std::vector& inputs) { + using layernorm_bwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL static thread_local layernorm_bwd_map layer_norm_bwds; #else @@ -237,21 +235,21 @@ MKLDNNLayerNormBwd& MKLDNNLayerNormBwd::GetCached(const LayerNormParam& param, auto it = layer_norm_bwds.find(key); if (it == layer_norm_bwds.end()) { - const mkldnn::memory::desc data_md = inputs[layernorm::kBwdData].GetMKLDNNData()->get_desc(); - const mkldnn::memory::desc diff_md = inputs[layernorm::kBwdOutGrad].GetMKLDNNData()->get_desc(); - MKLDNNLayerNormBwd bwd(param, inputs, data_md, diff_md); + const dnnl::memory::desc data_md = inputs[layernorm::kBwdData].GetDNNLData()->get_desc(); + const dnnl::memory::desc diff_md = inputs[layernorm::kBwdOutGrad].GetDNNLData()->get_desc(); + DNNLLayerNormBwd bwd(param, inputs, data_md, diff_md); it = AddToCache(&layer_norm_bwds, key, bwd); } return it->second; } -void MKLDNNLayerNormBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void DNNLLayerNormBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const LayerNormParam& param = nnvm::get(attrs.parsed); - MKLDNNLayerNormBwd& bwd = MKLDNNLayerNormBwd::GetCached(param, inputs); + DNNLLayerNormBwd& bwd = DNNLLayerNormBwd::GetCached(param, inputs); bwd.Execute(inputs, outputs, req); } diff --git a/src/operator/nn/dnnl/dnnl_log_softmax.cc b/src/operator/nn/dnnl/dnnl_log_softmax.cc new file mode 100644 index 000000000000..9408e6019610 --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_log_softmax.cc @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dnnl_log_softmax.cc + * \brief Implementation of log_softmax function with DNNL support + */ + +#include "../softmax-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" + +#if MXNET_USE_ONEDNN == 1 +namespace mxnet { +namespace op { + +static dnnl::logsoftmax_forward::primitive_desc GetLogSoftmaxFwdPd(bool is_train, + const int axis, + const dnnl::memory& input_mem) { + dnnl::memory::desc data_md = input_mem.get_desc(); + auto cpu_engine = CpuEngine::Get()->get_engine(); + auto prop = is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring; + auto desc = dnnl::logsoftmax_forward::desc(prop, data_md, axis); + return dnnl::logsoftmax_forward::primitive_desc(desc, cpu_engine); +} + +static dnnl::logsoftmax_backward::primitive_desc GetLogSoftmaxBwdPd( + const dnnl::memory& diff_mem, + const dnnl::memory& data_mem, + const int axis, + const dnnl::logsoftmax_forward::primitive_desc& hint_fwd_pd) { + dnnl::memory::desc diff_md = diff_mem.get_desc(); + dnnl::memory::desc data_md = data_mem.get_desc(); + auto cpu_engine = CpuEngine::Get()->get_engine(); + auto desc = dnnl::logsoftmax_backward::desc(diff_md, data_md, axis); + return dnnl::logsoftmax_backward::primitive_desc(desc, cpu_engine, hint_fwd_pd); +} + +bool SupportDNNLLogSoftmax(const SoftmaxParam& param, const NDArray& data, const NDArray& output) { + const int ndim = data.shape().ndim(); + const int in_dtype = data.dtype(); + const int out_dtype = output.dtype(); + const int axis = CheckAxis(param.axis, ndim); + // DNNL does not support temperature argument in their log_softmax function + // now. Need update this once they start to support it. + // Currently, DNNL shows bad performance when log_softmax is not performed on the last dimension + if (param.temperature.has_value() || in_dtype != mshadow::kFloat32 || in_dtype != out_dtype || + axis != (ndim - 1)) { + return false; + } + + // only supports ndim = 1, 2, 3, 4 for now + return (ndim >= 1 && ndim <= 4); +} + +class DNNLLogSoftmaxFwd { + public: + dnnl::logsoftmax_forward::primitive_desc pd; + + DNNLLogSoftmaxFwd(const bool is_train, const int axis, const dnnl::memory& input) + : pd(GetLogSoftmaxFwdPd(is_train, axis, input)) { + fwd_ = std::make_shared(pd); + } + + const dnnl::logsoftmax_forward& GetFwd() const { + return *fwd_; + } + + private: + std::shared_ptr fwd_; +}; + +typedef ParamOpSign DNNLSoftmaxSignature; + +static DNNLLogSoftmaxFwd& GetLogSoftmaxFwd(const SoftmaxParam& param, + const int real_axis, + const bool is_train, + const NDArray& data, + const NDArray& output) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map fwds; +#else + static MX_THREAD_LOCAL std::unordered_map fwds; +#endif + + DNNLSoftmaxSignature key(param); + key.AddSign(real_axis); + key.AddSign(is_train); + key.AddSign(data); + key.AddSign(output); + + auto it = fwds.find(key); + if (it == fwds.end()) { + DNNLLogSoftmaxFwd fwd(is_train, real_axis, *(data.GetDNNLData())); + it = AddToCache(&fwds, key, fwd); + } + return it->second; +} + +void DNNLLogSoftmaxForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data) { + if (req == kNullOp) + return; + // same as the FCompute path, log_softmax only supports kWriteTo and kWriteInplace for now. 
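For context, the cached DNNLLogSoftmaxFwd wrapper above ultimately drives a plain oneDNN logsoftmax_forward primitive. Below is a minimal, self-contained sketch of that primitive using the same dnnl C++ calls that appear in this file; the engine/stream setup, the 2x4 fp32 shape, the axis value and the include path are illustrative assumptions, not part of this change.

// Hypothetical standalone example: run oneDNN log-softmax over the last axis of a 2x4 tensor.
#include <oneapi/dnnl/dnnl.hpp>
#include <vector>

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);   // CPU engine, analogous to CpuEngine::Get()->get_engine()
  stream strm(eng);

  memory::desc data_md({2, 4}, memory::data_type::f32, memory::format_tag::nc);
  std::vector<float> src_buf = {0.f, 1.f, 2.f, 3.f, 3.f, 2.f, 1.f, 0.f};
  std::vector<float> dst_buf(src_buf.size());
  memory src(data_md, eng, src_buf.data());
  memory dst(data_md, eng, dst_buf.data());

  // axis = 1: normalise over the last dimension, the case SupportDNNLLogSoftmax accepts.
  logsoftmax_forward::desc desc(prop_kind::forward_scoring, data_md, /*axis=*/1);
  logsoftmax_forward::primitive_desc pd(desc, eng);

  logsoftmax_forward(pd).execute(strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}});
  strm.wait();   // dst_buf now holds the log-softmax of each row
  return 0;
}

The wrapper in this file layers the per-thread primitive cache and MXNet's request handling (kWriteTo/kWriteInplace) on top of exactly this call sequence.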
+ CHECK_NE(req, kAddTo); + + const SoftmaxParam& param = nnvm::get(attrs.parsed); + int axis = CheckAxis(param.axis, in_data.shape().ndim()); + auto fwd = GetLogSoftmaxFwd(param, axis, ctx.is_train, in_data, out_data); + + auto in_mem = in_data.GetDNNLData(); + auto out_mem = out_data.GetDNNLData(fwd.pd.dst_desc()); + DNNLStream* stream = DNNLStream::Get(); + stream->RegisterPrimArgs(fwd.GetFwd(), {{DNNL_ARG_SRC, *in_mem}, {DNNL_ARG_DST, *out_mem}}); + stream->Submit(); +} + +class DNNLLogSoftmaxBwd { + public: + dnnl::logsoftmax_backward::primitive_desc pd; + + DNNLLogSoftmaxBwd(const dnnl::memory& diff_mem, + const dnnl::memory& data_mem, + const int axis, + const dnnl::logsoftmax_forward::primitive_desc& hint_fwd_pd) + : pd(GetLogSoftmaxBwdPd(diff_mem, data_mem, axis, hint_fwd_pd)) { + bwd_ = std::make_shared(pd); + } + + const dnnl::logsoftmax_backward& GetBwd() const { + return *bwd_; + } + + private: + std::shared_ptr bwd_; +}; + +static DNNLLogSoftmaxBwd& GetLogSoftmaxBwd(const SoftmaxParam& param, + const int real_axis, + const std::vector& data, + const std::vector& output) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map bwds; +#else + static MX_THREAD_LOCAL std::unordered_map bwds; +#endif + + DNNLSoftmaxSignature key(param); + key.AddSign(real_axis); + key.AddSign(data); + key.AddSign(output); + + auto it = bwds.find(key); + if (it == bwds.end()) { + auto diff_mem = data[0].GetDNNLData(); + auto data_mem = data[1].GetDNNLData(); + auto fwd_pd = GetLogSoftmaxFwdPd(true, real_axis, *data_mem); + DNNLLogSoftmaxBwd bwd(*diff_mem, *data_mem, real_axis, fwd_pd); + it = AddToCache(&bwds, key, bwd); + } + return it->second; +} + +void DNNLLogSoftmaxBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + if (req[0] == kNullOp) + return; + CHECK_EQ(in_data.size(), 2U); + const SoftmaxParam& param = nnvm::get(attrs.parsed); + int axis = CheckAxis(param.axis, in_data[1].shape().ndim()); + auto diff_mem = in_data[0].GetDNNLData(); + auto data_mem = in_data[1].GetDNNLData(); + auto bwd = GetLogSoftmaxBwd(param, axis, in_data, out_data); + + auto out_mem = CreateDNNLMem(out_data[0], bwd.pd.diff_src_desc(), req[0]); + DNNLStream* stream = DNNLStream::Get(); + dnnl_args_map_t args = {{DNNL_ARG_DST, *data_mem}, + {DNNL_ARG_DIFF_DST, *diff_mem}, + {DNNL_ARG_DIFF_SRC, *out_mem.second}}; + + stream->RegisterPrimArgs(bwd.GetBwd(), args); + CommitOutput(out_data[0], out_mem); + stream->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif diff --git a/src/operator/nn/dnnl/dnnl_lrn-inl.h b/src/operator/nn/dnnl/dnnl_lrn-inl.h new file mode 100644 index 000000000000..842705b254ee --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_lrn-inl.h @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dnnl_lrn-inl.h + * \brief + * \Author: Patric Zhao, patric.zhao@intel.com + */ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_LRN_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_LRN_INL_H_ + +#if MXNET_USE_ONEDNN == 1 +#include +#include +#include + +#include "../lrn-inl.h" +#include "./dnnl_base-inl.h" + +namespace mxnet { +namespace op { + +inline dnnl::algorithm GetDNNLLRNAlgo(const LRNParam& param) { + // TODO(Patric): lrn_within_channel will cause core dump in DNNL backward + // Need to confirm with DNNL team and fix later + return dnnl::algorithm::lrn_across_channels; +} + +inline dnnl::lrn_forward::primitive_desc GetLRNFwdDesc(const LRNParam& param, + const bool is_train, + const dnnl::memory::desc& src_md) { + dnnl::engine& engine = CpuEngine::Get()->get_engine(); + const dnnl::algorithm alg = GetDNNLLRNAlgo(param); + const float alpha = param.alpha; + const float beta = param.beta; + const int nsize = param.nsize; + const float k = param.knorm; + auto kind = dnnl::prop_kind::forward_training; + if (is_train) { + kind = dnnl::prop_kind::forward_training; + } else { + kind = dnnl::prop_kind::forward_scoring; + } + dnnl::lrn_forward::desc fwd_desc(kind, alg, src_md, nsize, alpha, beta, k); + return dnnl::lrn_forward::primitive_desc(fwd_desc, engine); +} + +inline dnnl::lrn_backward::primitive_desc GetLRNBwdDesc( + const LRNParam& param, + const dnnl::memory::desc& data_in_md, + const dnnl::memory::desc& diff_md, + const dnnl::lrn_forward::primitive_desc& lrnFwd_desc) { + dnnl::engine& engine = CpuEngine::Get()->get_engine(); + const dnnl::algorithm alg = GetDNNLLRNAlgo(param); + const float alpha = param.alpha; + const float beta = param.beta; + const int nsize = param.nsize; + const float k = param.knorm; + + dnnl::lrn_backward::desc lrnBwd_desc(alg, data_in_md, diff_md, nsize, alpha, beta, k); + return dnnl::lrn_backward::primitive_desc(lrnBwd_desc, engine, lrnFwd_desc); +} + +typedef ParamOpSign DNNLLRNSignature; + +// LRN Forward Class +class DNNLLRNFwd { + public: + DNNLLRNFwd(const LRNParam& param, bool is_train, const NDArray& in_data) { + _Init(param, is_train, in_data); + } + + ~DNNLLRNFwd() {} + + void Execute(const OpContext& ctx, + const NDArray& in_data, + const OpReqType req, + const NDArray& out_data); + + dnnl::lrn_forward& GetFwd(); + const dnnl::memory* GetWs(); + dnnl::lrn_forward::primitive_desc& GetFwdPd(); + + private: + std::shared_ptr fwd; + dnnl::lrn_forward::primitive_desc fwd_pd; + + private: + void _Init(const LRNParam& param, bool is_train, const NDArray& in_data); +}; // End of LRN Forword Class + +void DNNLLRNFwd::_Init(const LRNParam& param, bool is_train, const NDArray& in_data) { + dnnl::memory::desc in_data_md = in_data.GetDNNLData()->get_desc(); + this->fwd_pd = GetLRNFwdDesc(param, is_train, in_data_md); + + this->fwd = std::shared_ptr(new dnnl::lrn_forward(this->fwd_pd)); +} + +void DNNLLRNFwd::Execute(const OpContext& ctx, + const NDArray& in_data, + const OpReqType req, + const NDArray& out_data) { + auto output_mem_t = CreateDNNLMem(out_data, (this->fwd_pd).dst_desc(), req); + + dnnl_args_map_t args = { + {DNNL_ARG_SRC, *in_data.GetDNNLData()}, + {DNNL_ARG_DST, *output_mem_t.second}, + }; + std::shared_ptr workspace; + if (ctx.is_train) { + auto engine = CpuEngine::Get()->get_engine(); + workspace = std::make_shared((this->fwd_pd).workspace_desc(), engine); + args[DNNL_ARG_WORKSPACE] = *(workspace); + } + 
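The is_train branch above illustrates the oneDNN workspace convention: a primitive created with prop_kind::forward_training exposes workspace_desc() on its primitive_desc, the caller allocates that memory and passes it as DNNL_ARG_WORKSPACE, and the backward primitive later consumes the same kind of buffer (as DNNLLRNBwd::Execute does further down). A small self-contained sketch of that handshake follows, using the same dnnl calls as this header; the tensor shape, LRN hyper-parameters and engine/stream setup are illustrative assumptions only.

// Hypothetical standalone example: LRN forward in training mode with a caller-owned workspace.
#include <oneapi/dnnl/dnnl.hpp>
#include <vector>

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  memory::desc src_md({1, 8, 4, 4}, memory::data_type::f32, memory::format_tag::nchw);
  std::vector<float> src_buf(1 * 8 * 4 * 4, 1.f);
  memory src(src_md, eng, src_buf.data());

  // forward_training makes the primitive report a (possibly non-empty) workspace requirement.
  lrn_forward::desc fwd_d(prop_kind::forward_training, algorithm::lrn_across_channels,
                          src_md, /*local_size=*/5, /*alpha=*/1e-4f, /*beta=*/0.75f, /*k=*/2.f);
  lrn_forward::primitive_desc fwd_pd(fwd_d, eng);

  memory dst(fwd_pd.dst_desc(), eng);          // library-allocated output
  memory ws(fwd_pd.workspace_desc(), eng);     // caller-owned workspace, reused by backward

  lrn_forward(fwd_pd).execute(strm, {{DNNL_ARG_SRC, src},
                                     {DNNL_ARG_DST, dst},
                                     {DNNL_ARG_WORKSPACE, ws}});
  strm.wait();
  return 0;
}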
DNNLStream::Get()->RegisterPrimArgs(*(this->fwd), args); + CommitOutput(out_data, output_mem_t); + DNNLStream::Get()->Submit(); +} + +dnnl::lrn_forward& DNNLLRNFwd::GetFwd() { + return *this->fwd; +} +dnnl::lrn_forward::primitive_desc& DNNLLRNFwd::GetFwdPd() { + return this->fwd_pd; +} + +// End of LRN Class and its functions + +static DNNLLRNFwd& GetLRNFwd(const LRNParam& param, const OpContext& ctx, const NDArray& in_data) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map lrn_fwds; +#else + static MX_THREAD_LOCAL std::unordered_map lrn_fwds; +#endif + auto kind_ = ctx.is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring; + + DNNLLRNSignature key(param); + key.AddSign(static_cast(kind_)); + key.AddSign(in_data); + + auto it = lrn_fwds.find(key); + if (it == lrn_fwds.end()) { + DNNLLRNFwd fwd(param, ctx.is_train, in_data); + it = AddToCache(&lrn_fwds, key, fwd); + } + return it->second; +} + +void DNNLLRNForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType req, + const NDArray& out_data) { + const LRNParam& param = nnvm::get(attrs.parsed); + auto in_buffer = in_data; + if (in_buffer.IsView() && in_buffer.IsDNNLData()) + in_buffer = in_buffer.Reorder2Default(); + DNNLLRNFwd fwd = GetLRNFwd(param, ctx, in_buffer); + fwd.Execute(ctx, in_buffer, req, out_data); +} + +// LRN Backward Class +class DNNLLRNBwd { + std::shared_ptr bwd; + + public: + const dnnl::lrn_forward::primitive_desc fwd_pd; + const dnnl::lrn_backward::primitive_desc bwd_pd; + + ~DNNLLRNBwd() {} + + DNNLLRNBwd(const LRNParam& param, + const dnnl::memory::desc in_data_md, + const dnnl::memory::desc diff_md) + : fwd_pd(GetLRNFwdDesc(param, true, in_data_md)), + bwd_pd(GetLRNBwdDesc(param, in_data_md, diff_md, this->fwd_pd)) { + bwd = std::make_shared(bwd_pd); + } + + const dnnl::lrn_backward& GetBwd() const { + return *bwd; + } + + void Execute(const NDArray& out_grad, + const NDArray& in_data, + const NDArray& in_grad, + const dnnl_output_t& diff_src_mem) { + auto engine = CpuEngine::Get()->get_engine(); + auto workspace = std::make_shared((this->fwd_pd).workspace_desc(), engine); + dnnl_args_map_t args = {{DNNL_ARG_SRC, *in_data.GetDNNLData()}, + {DNNL_ARG_DIFF_DST, *out_grad.GetDNNLData()}, + {DNNL_ARG_WORKSPACE, *workspace}, + {DNNL_ARG_DIFF_SRC, *diff_src_mem.second}}; + DNNLStream::Get()->RegisterPrimArgs(*(this->bwd), args); + CommitOutput(in_grad, diff_src_mem); + DNNLStream::Get()->Submit(); + } +}; // End of LRN Class + +static DNNLLRNBwd& GetLRNBwd(const LRNParam& param, + const NDArray& in_data, + const NDArray& in_grad, + const NDArray& out_grad) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map lrn_bwds; +#else + static MX_THREAD_LOCAL std::unordered_map lrn_bwds; +#endif + DNNLLRNSignature key(param); + key.AddSign(in_data); + key.AddSign(in_grad); + key.AddSign(out_grad); + + auto it = lrn_bwds.find(key); + if (it == lrn_bwds.end()) { + const dnnl::memory::desc in_data_md = in_data.GetDNNLData()->get_desc(); + const dnnl::memory::desc diff_md = out_grad.GetDNNLData()->get_desc(); + DNNLLRNBwd bwd(param, in_data_md, diff_md); + it = AddToCache(&lrn_bwds, key, bwd); + } + return it->second; +} + +void DNNLLRNBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) { + return; + } + const LRNParam& param = nnvm::get(attrs.parsed); + const NDArray& out_grad = inputs[0]; + const 
NDArray& in_data = inputs[1]; + const NDArray& in_grad = outputs[0]; + // TODO(alex): (MXNET-846) figure out why in_grad output incorrect when in_data is nchw8c + const auto in_buffer = in_data.Reorder2Default(); + DNNLLRNBwd& bwd = GetLRNBwd(param, in_buffer, in_grad, out_grad); + dnnl_output_t diff_src_mem = CreateDNNLMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]); + + bwd.Execute(out_grad, in_buffer, in_grad, diff_src_mem); +} +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_ONEDNN == 1 +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_LRN_INL_H__ diff --git a/src/operator/nn/dnnl/dnnl_ops-inl.h b/src/operator/nn/dnnl/dnnl_ops-inl.h new file mode 100644 index 000000000000..8816c3c1f659 --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_ops-inl.h @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dnnl_ops-inl.h + * \brief + * \author Da Zheng + */ + +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_OPS_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_OPS_INL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#if MXNET_USE_ONEDNN == 1 +#include + +namespace mxnet { +namespace op { + +/* For fully connected. */ +void DNNLFCForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); +void DNNLFCBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For convolution. 
*/ +void DNNLConvolutionForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); +void DNNLConvolutionBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For deconvolution */ +void DNNLDeconvolutionForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); +void DNNLDeconvolutionBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For activation */ +void DNNLActivationForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data); +void DNNLActivationBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +void DNNLLeakyReluForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data); +void DNNLLeakyReluBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For softmax */ +void DNNLSoftmaxForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data); +void DNNLSoftmaxBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); + +/* For log_softmax */ +void DNNLLogSoftmaxForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data); +void DNNLLogSoftmaxBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); + +/* For softmax_output */ +void DNNLSoftmaxOutputForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); + +/* For sum */ +void DNNLSumForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For copy */ +void DNNLCopy(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data); + +/* For concat */ +void DNNLConcatForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); +void DNNLConcatBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For batch dot */ +void DNNLBatchDotForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For layer normalization */ +void DNNLLayerNormForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +void DNNLLayerNormBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + 
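Most of the kernels declared in this header follow the same per-thread primitive-cache pattern visible in the implementation files above: a signature built from the op parameters and tensors keys a thread_local map, and the oneDNN primitive is constructed only on a cache miss. A self-contained sketch of that pattern is given below; the plain int key and dummy struct are hypothetical stand-ins for ParamOpSign and the concrete DNNL*Fwd classes.

// Hypothetical sketch of the per-thread primitive cache used by the DNNL kernels.
#include <cstdio>
#include <unordered_map>

struct CachedPrimitive {
  int built_for_axis;   // pretend this wraps a dnnl primitive and its primitive_desc
};

CachedPrimitive& GetCached(int axis) {
  // One map per thread: no locking on the hot path, one entry per unique signature.
  static thread_local std::unordered_map<int, CachedPrimitive> cache;
  auto it = cache.find(axis);
  if (it == cache.end())
    it = cache.emplace(axis, CachedPrimitive{axis}).first;   // build only on a miss
  return it->second;
}

int main() {
  GetCached(1);                          // first call creates the entry
  CachedPrimitive& p = GetCached(1);     // second call reuses it
  std::printf("cached axis: %d\n", p.built_for_axis);
  return 0;
}

Keeping the cache thread_local trades memory (one cached primitive per signature per worker thread) for lock-free lookups, which matters because these helpers sit on the per-iteration hot path.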
+void DNNLSum(const dnnl::memory& arr1, const dnnl::memory& arr2, const dnnl::memory& out); + +void DNNLTransposeForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& data, + const OpReqType& req, + const NDArray& output); + +void DNNLReshapeForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& input, + const OpReqType& req, + const NDArray& output); +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_ONEDNN == 1 +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_OPS_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/dnnl/dnnl_pooling-inl.h similarity index 54% rename from src/operator/nn/mkldnn/mkldnn_pooling-inl.h rename to src/operator/nn/dnnl/dnnl_pooling-inl.h index be2c9f2aacc3..83d27e5e6469 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/dnnl/dnnl_pooling-inl.h @@ -18,41 +18,39 @@ */ /*! - * \file mkldnn_pooling-inl.h + * \file dnnl_pooling-inl.h * \brief */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_POOLING_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_POOLING_INL_H_ #if MXNET_USE_ONEDNN == 1 -#include - +#include #include -#include "./mkldnn_base-inl.h" - #include "../pooling-inl.h" +#include "./dnnl_base-inl.h" namespace mxnet { namespace op { -class MKLDNNPoolingFwd { +class DNNLPoolingFwd { public: - MKLDNNPoolingFwd(const mxnet::NDArray& input, - const mxnet::NDArray& output, - const mkldnn::memory::dims& kernel, - const mkldnn::memory::dims& strides, - const mkldnn::memory::dims& pad_l, - const mkldnn::memory::dims& pad_r, - const mkldnn::algorithm alg_kind, - const bool with_workspace, - const bool is_train) + DNNLPoolingFwd(const mxnet::NDArray& input, + const mxnet::NDArray& output, + const dnnl::memory::dims& kernel, + const dnnl::memory::dims& strides, + const dnnl::memory::dims& pad_l, + const dnnl::memory::dims& pad_r, + const dnnl::algorithm alg_kind, + const bool with_workspace, + const bool is_train) : with_workspace_(with_workspace), fwd_(nullptr) { Init(input, output, kernel, strides, pad_l, pad_r, is_train, alg_kind); } - ~MKLDNNPoolingFwd() {} + ~DNNLPoolingFwd() {} void Execute(const NDArray& in_data, const OpReqType req, const NDArray& out_data, @@ -61,32 +59,32 @@ class MKLDNNPoolingFwd { private: bool with_workspace_; - std::shared_ptr fwd_pd_; - std::shared_ptr fwd_; + std::shared_ptr fwd_pd_; + std::shared_ptr fwd_; private: void Init(const mxnet::NDArray& input, const mxnet::NDArray& output, - const mkldnn::memory::dims& kernel, - const mkldnn::memory::dims& strides, - const mkldnn::memory::dims& pad_l, - const mkldnn::memory::dims& pad_r, + const dnnl::memory::dims& kernel, + const dnnl::memory::dims& strides, + const dnnl::memory::dims& pad_l, + const dnnl::memory::dims& pad_r, const bool is_train, - const mkldnn::algorithm alg_kind); + const dnnl::algorithm alg_kind); }; -class MKLDNNPoolingBwd { - std::shared_ptr bwd; +class DNNLPoolingBwd { + std::shared_ptr bwd; bool with_workspace; public: - const mkldnn::pooling_backward::primitive_desc pd; + const dnnl::pooling_backward::primitive_desc pd; - MKLDNNPoolingBwd(const mkldnn::pooling_backward::primitive_desc& pdesc, bool with_ws); + DNNLPoolingBwd(const dnnl::pooling_backward::primitive_desc& pdesc, bool with_ws); - ~MKLDNNPoolingBwd() {} - const mkldnn::pooling_backward& GetBwd(); - const mkldnn::pooling_backward::primitive_desc& GetPd(); + ~DNNLPoolingBwd() {} + const 
dnnl::pooling_backward& GetBwd(); + const dnnl::pooling_backward::primitive_desc& GetPd(); }; inline int GetPaddingSizeFull(dim_t x, int padl, int padr, int k, int s) { @@ -97,7 +95,7 @@ inline int GetPaddingSizeFull(dim_t x, int padl, int padr, int k, int s) { } } -inline bool SupportMKLDNNPooling(const PoolingParam& param) { +inline bool SupportDNNLPooling(const PoolingParam& param) { return (param.kernel.ndim() == 1 || param.kernel.ndim() == 2 || param.kernel.ndim() == 3) && (param.pool_type == pool_enum::kMaxPooling || param.pool_type == pool_enum::kAvgPooling) && (!param.layout.has_value() || @@ -105,23 +103,23 @@ inline bool SupportMKLDNNPooling(const PoolingParam& param) { param.layout.value() == mshadow::kNCDHW)); } -inline bool SupportMKLDNNPooling(const PoolingParam& param, const NDArray& input) { +inline bool SupportDNNLPooling(const PoolingParam& param, const NDArray& input) { const auto dshape = input.shape(); const auto ndim = dshape.ndim(); const auto dtype = input.dtype(); - if (!(SupportStorageMKLDNN(input.storage_type()) && (ndim == 3 || ndim == 4 || ndim == 5) && + if (!(SupportStorageDNNL(input.storage_type()) && (ndim == 3 || ndim == 4 || ndim == 5) && (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16))) return false; - if (!SupportMKLDNNPooling(param)) + if (!SupportDNNLPooling(param)) return false; if (param.pooling_convention == pool_enum::kValid) { return true; } else { if (param.pool_type == pool_enum::kAvgPooling) { - // mkldnn works differently when padding is asymmetric, so let's skip this case. + // dnnl works differently when padding is asymmetric, so let's skip this case. bool is_symmetric = true; switch (ndim) { case 5: @@ -149,30 +147,30 @@ inline bool SupportMKLDNNPooling(const PoolingParam& param, const NDArray& input } } -inline bool MKLDNNRequireWorkspace(const PoolingParam& param) { +inline bool DNNLRequireWorkspace(const PoolingParam& param) { return param.pool_type != pool_enum::kAvgPooling; } -typedef ParamOpSign MKLDNNPoolingSignature; -void MKLDNNPoolingCompute(const OpContext& ctx, - const PoolingParam& param, - const NDArray& in_data, - const OpReqType req, - const NDArray& out_data, - const NDArray* workspace); - -void MKLDNNPoolingGradCompute(const OpContext& ctx, - const PoolingParam& param, - const NDArray& out_grad, - const NDArray& in_data, - const NDArray* workspace, - const OpReqType req, - const NDArray& in_grad); -MKLDNNPoolingFwd& GetPoolingFwd(const PoolingParam& param, - const bool is_train, - const NDArray& data, - const NDArray& output); +typedef ParamOpSign DNNLPoolingSignature; +void DNNLPoolingCompute(const OpContext& ctx, + const PoolingParam& param, + const NDArray& in_data, + const OpReqType req, + const NDArray& out_data, + const NDArray* workspace); + +void DNNLPoolingGradCompute(const OpContext& ctx, + const PoolingParam& param, + const NDArray& out_grad, + const NDArray& in_data, + const NDArray* workspace, + const OpReqType req, + const NDArray& in_grad); +DNNLPoolingFwd& GetPoolingFwd(const PoolingParam& param, + const bool is_train, + const NDArray& data, + const NDArray& output); } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_POOLING_INL_H_ diff --git a/src/operator/nn/dnnl/dnnl_pooling.cc b/src/operator/nn/dnnl/dnnl_pooling.cc new file mode 100644 index 000000000000..252bf05a1025 --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_pooling.cc @@ -0,0 +1,401 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dnnl_pooling.cc + * \brief + * \author Tao Lv + */ + +#if MXNET_USE_ONEDNN == 1 + +#include "./dnnl_pooling-inl.h" + +namespace mxnet { +namespace op { + +static inline dnnl::memory::data_type get_data_type(const dnnl::memory::desc& md) { + return static_cast(md.data_type()); +} + +void DNNLPoolingFwd::Init(const mxnet::NDArray& input, + const mxnet::NDArray& output, + const dnnl::memory::dims& kernel, + const dnnl::memory::dims& strides, + const dnnl::memory::dims& pad_l, + const dnnl::memory::dims& pad_r, + const bool is_train, + const dnnl::algorithm alg_kind) { + const auto src_md = input.GetDNNLData()->get_desc(); + const auto dst_md = GetMemDesc(output); + const dnnl::engine engine = CpuEngine::Get()->get_engine(); + if (alg_kind != dnnl::algorithm::pooling_max && alg_kind != dnnl::algorithm::pooling_avg && + alg_kind != dnnl::algorithm::pooling_avg_include_padding && + alg_kind != dnnl::algorithm::pooling_avg_exclude_padding) { + LOG(FATAL) << "DNNL Pooling: algorithm is not supported"; + } + + dnnl::prop_kind prop = dnnl::prop_kind::forward_scoring; + if (is_train && alg_kind != dnnl::algorithm::pooling_avg) { + prop = dnnl::prop_kind::forward_training; + } + if (is_train && prop == dnnl::prop_kind::forward_scoring) { + LOG(INFO) << "DNNL Pooling: training with prop_kind is forward_scoring"; + } + + const auto fwd_desc = + dnnl::pooling_forward::desc(prop, alg_kind, src_md, dst_md, strides, kernel, pad_l, pad_r); + this->fwd_pd_.reset(new dnnl::pooling_forward::primitive_desc(fwd_desc, engine)); + this->fwd_.reset(new dnnl::pooling_forward(*(this->fwd_pd_))); + + return; +} + +void DNNLPoolingFwd::Execute(const NDArray& in_data, + const OpReqType req, + const NDArray& out_data, + const NDArray* workspace) { + NDArray in_buffer = in_data; + if (in_data.IsView() && in_data.IsDNNLData()) + in_buffer = in_data.Reorder2Default(); + + auto input_mem = in_buffer.GetDNNLData(); + auto output_mem_t_ = CreateDNNLMem(out_data, this->fwd_pd_->dst_desc(), req); + + dnnl_args_map_t args = { + {DNNL_ARG_SRC, *input_mem}, + {DNNL_ARG_DST, *(output_mem_t_.second)}, + }; + + if (this->with_workspace_) { + auto engine = CpuEngine::Get()->get_engine(); + + if (workspace == nullptr) { + LOG(FATAL) << "DNNL Pooling: incorrect workspace input"; + } + + auto ws = std::make_shared( + (*(this->fwd_pd_)).workspace_desc(), engine, workspace->GetDNNLData()->get_data_handle()); + args[DNNL_ARG_WORKSPACE] = *ws; + } + if (this->fwd_) { + DNNLStream::Get()->RegisterPrimArgs(*(this->fwd_), args); + CommitOutput(out_data, output_mem_t_); + DNNLStream::Get()->Submit(); + } else { + LOG(FATAL) << "DNNL Pooling: forward primitive is nullptr"; + } +} + +dnnl::algorithm GetDNNLPoolAlgo(const PoolingParam& param) { + switch 
(param.pool_type) { + case pool_enum::kMaxPooling: + return dnnl::algorithm::pooling_max; + break; + case pool_enum::kAvgPooling: + if (param.count_include_pad.has_value() && !param.count_include_pad.value()) { + return dnnl::algorithm::pooling_avg_exclude_padding; + } else { + return dnnl::algorithm::pooling_avg_include_padding; + } + break; + default: + LOG(FATAL) << "DNNL Pooling: Unknown pooling method."; + return dnnl::algorithm::pooling_max; + } +} + +void InitPoolingPrimitiveParams(const PoolingParam& param, + const dnnl::memory::desc& data_md, + const dnnl::memory::dims& new_kernel, + const dnnl::memory::dims& new_strides, + const dnnl::memory::dims& new_pad_l, + const dnnl::memory::dims& new_pad_r) { + const int kernel_ndims = param.kernel.ndim(); + dnnl::memory::dims& kernel = const_cast(new_kernel); + dnnl::memory::dims& strides = const_cast(new_strides); + dnnl::memory::dims& pad_l = const_cast(new_pad_l); + dnnl::memory::dims& pad_r = const_cast(new_pad_r); + if (kernel_ndims == 1) { + CHECK_GE(param.pad.ndim(), 1); + CHECK_GE(param.stride.ndim(), 1); + kernel[0] = param.kernel[0]; + pad_l[0] = param.pad[0]; + pad_r[0] = param.pad[0]; + strides[0] = param.stride[0]; + + if (param.pooling_convention == pool_enum::kFull) { + pad_r[0] = + GetPaddingSizeFull(data_md.data.dims[2], pad_l[0], pad_r[0], kernel[0], strides[0]); + } + + if (param.global_pool) { + kernel[0] = data_md.data.dims[2]; + strides[0] = 1; + pad_l[0] = pad_r[0] = 0; + } + + CHECK_GT(kernel[0], 0) << "Filter dimensions cannot be zero."; + } else if (kernel_ndims == 2) { + CHECK_GE(param.pad.ndim(), 2); + CHECK_GE(param.stride.ndim(), 2); + kernel[0] = param.kernel[0]; + kernel[1] = param.kernel[1]; + pad_l[0] = param.pad[0]; + pad_l[1] = param.pad[1]; + pad_r[0] = param.pad[0]; + pad_r[1] = param.pad[1]; + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + + if (param.pooling_convention == pool_enum::kFull) { + pad_r[0] = + GetPaddingSizeFull(data_md.data.dims[2], pad_l[0], pad_r[0], kernel[0], strides[0]); + pad_r[1] = + GetPaddingSizeFull(data_md.data.dims[3], pad_l[1], pad_r[1], kernel[1], strides[1]); + } + + if (param.global_pool) { + kernel[0] = data_md.data.dims[2]; + kernel[1] = data_md.data.dims[3]; + strides[0] = strides[1] = 1; + pad_l[0] = pad_l[1] = pad_r[0] = pad_r[1] = 0; + } + + CHECK_GT(kernel[0], 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel[1], 0) << "Filter dimensions cannot be zero."; + } else { + CHECK_GE(param.pad.ndim(), 3); + CHECK_GE(param.stride.ndim(), 3); + kernel[0] = param.kernel[0]; + kernel[1] = param.kernel[1]; + kernel[2] = param.kernel[2]; + pad_l[0] = param.pad[0]; + pad_l[1] = param.pad[1]; + pad_l[2] = param.pad[2]; + pad_r[0] = param.pad[0]; + pad_r[1] = param.pad[1]; + pad_r[2] = param.pad[2]; + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + strides[2] = param.stride[2]; + + if (param.pooling_convention == pool_enum::kFull) { + pad_r[0] = + GetPaddingSizeFull(data_md.data.dims[2], pad_l[0], pad_r[0], kernel[0], strides[0]); + pad_r[1] = + GetPaddingSizeFull(data_md.data.dims[3], pad_l[1], pad_r[1], kernel[1], strides[1]); + pad_r[2] = + GetPaddingSizeFull(data_md.data.dims[4], pad_l[2], pad_r[2], kernel[2], strides[2]); + } + + if (param.global_pool) { + kernel[0] = data_md.data.dims[2]; + kernel[1] = data_md.data.dims[3]; + kernel[2] = data_md.data.dims[4]; + strides[0] = strides[1] = strides[2] = 1; + pad_l[0] = pad_l[1] = pad_l[2] = pad_r[0] = pad_r[1] = pad_r[2] = 0; + } + + CHECK_GT(kernel[0], 0) << "Filter dimensions 
cannot be zero."; + CHECK_GT(kernel[1], 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel[2], 0) << "Filter dimensions cannot be zero."; + } + + if (pad_l[0] != 0 || (kernel_ndims == 2 && pad_l[1] != 0) || + (kernel_ndims == 3 && pad_l[2] != 0)) { + CHECK(param.pool_type == pool_enum::kAvgPooling || param.pool_type == pool_enum::kMaxPooling) + << "Padding implemented only for average and max pooling."; + CHECK_LT(pad_l[0], kernel[0]); + if (kernel_ndims > 1) + CHECK_LT(pad_l[1], kernel[1]); + if (kernel_ndims > 2) + CHECK_LT(pad_l[2], kernel[2]); + } +} + +dnnl::pooling_forward::primitive_desc GetPoolingFwdPdesc(const PoolingParam& param, + const bool is_train, + const dnnl::memory::desc& data_md, + const dnnl::memory::desc& out_md) { + CHECK(param.kernel.ndim() == 1 || param.kernel.ndim() == 2 || param.kernel.ndim() == 3) + << "Not Implemented"; + + const int kernel_ndims = param.kernel.ndim(); + dnnl::memory::dims kernel(kernel_ndims); + dnnl::memory::dims strides(kernel_ndims); + dnnl::memory::dims pad_l(kernel_ndims); + dnnl::memory::dims pad_r(kernel_ndims); + + InitPoolingPrimitiveParams(param, data_md, kernel, strides, pad_l, pad_r); + + const dnnl::algorithm alg = GetDNNLPoolAlgo(param); + dnnl::prop_kind kind = dnnl::prop_kind::forward_scoring; + if (is_train && alg != dnnl::algorithm::pooling_avg) { + kind = dnnl::prop_kind::forward_training; + } + + const dnnl::pooling_forward::desc poolingFwd_desc( + kind, alg, data_md, out_md, strides, kernel, pad_l, pad_r); + return dnnl::pooling_forward::primitive_desc(poolingFwd_desc, CpuEngine::Get()->get_engine()); +} + +DNNLPoolingFwd& GetPoolingFwd(const PoolingParam& param, + const bool is_train, + const NDArray& data, + const NDArray& output) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map pooling_fwds; +#else + static MX_THREAD_LOCAL std::unordered_map + pooling_fwds; +#endif + + bool with_workspace = is_train && DNNLRequireWorkspace(param); + DNNLPoolingSignature key(param); + key.AddSign(is_train); + key.AddSign(with_workspace); + key.AddSign(data); + key.AddSign(output); + + auto it = pooling_fwds.find(key); + if (it == pooling_fwds.end()) { + CHECK(param.kernel.ndim() == 1 || param.kernel.ndim() == 2 || param.kernel.ndim() == 3) + << "Not Implemented"; + auto data_md = data.GetDNNLData()->get_desc(); + + const auto kernel_ndims = param.kernel.ndim(); + dnnl::memory::dims kernel(kernel_ndims); + dnnl::memory::dims strides(kernel_ndims); + dnnl::memory::dims pad_l(kernel_ndims); + dnnl::memory::dims pad_r(kernel_ndims); + InitPoolingPrimitiveParams(param, data_md, kernel, strides, pad_l, pad_r); + + const dnnl::algorithm alg = GetDNNLPoolAlgo(param); + DNNLPoolingFwd fwd(data, output, kernel, strides, pad_l, pad_r, alg, with_workspace, is_train); + it = AddToCache(&pooling_fwds, key, fwd); + } + return it->second; +} + +void DNNLPoolingCompute(const OpContext& ctx, + const PoolingParam& param, + const NDArray& in_data, + const OpReqType req, + const NDArray& out_data, + const NDArray* workspace) { + auto& fwd = GetPoolingFwd(param, ctx.is_train, in_data, out_data); + fwd.Execute(in_data, req, out_data, workspace); +} + +DNNLPoolingBwd::DNNLPoolingBwd(const dnnl::pooling_backward::primitive_desc& pdesc, bool with_ws) + : with_workspace(with_ws), pd(pdesc) { + bwd = std::make_shared(pd); +} + +const dnnl::pooling_backward& DNNLPoolingBwd::GetBwd() { + return *this->bwd; +} + +DNNLPoolingBwd& GetPoolingBwd(const PoolingParam& param, + const NDArray& in_data, + const NDArray& in_grad, + const NDArray& 
out_grad) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map pooling_bwds; +#else + static MX_THREAD_LOCAL std::unordered_map + pooling_bwds; +#endif + + bool with_workspace = DNNLRequireWorkspace(param); + DNNLPoolingSignature key(param); + key.AddSign(in_data); + key.AddSign(in_grad); + key.AddSign(out_grad); + + auto it = pooling_bwds.find(key); + if (it == pooling_bwds.end()) { + auto input_mem = in_data.GetDNNLData(); + auto data_md = input_mem->get_desc(); + + auto dst_dims = dnnl::memory::dims(out_grad.shape().begin(), out_grad.shape().end()); + auto any = dnnl::memory::format_tag::any; + auto dst_md = dnnl::memory::desc(dst_dims, get_data_type(data_md), any); + + // fwd hint + auto fwd_pd = GetPoolingFwdPdesc(param, true, data_md, dst_md); + + // creat bwd desc + auto diff_src_dims = dnnl::memory::dims(in_grad.shape().begin(), in_grad.shape().end()); + auto diff_src_md = dnnl::memory::desc(diff_src_dims, get_data_type(data_md), any); + auto cpu_engine = CpuEngine::Get()->get_engine(); + auto alg = GetDNNLPoolAlgo(param); + + const int kernel_ndims = param.kernel.ndim(); + dnnl::memory::dims kernel(kernel_ndims); + dnnl::memory::dims strides(kernel_ndims); + dnnl::memory::dims pad_l(kernel_ndims); + dnnl::memory::dims pad_r(kernel_ndims); + + InitPoolingPrimitiveParams(param, data_md, kernel, strides, pad_l, pad_r); + + // use dst_md as diff_dst_md with any format + auto bwd_desc = + dnnl::pooling_backward::desc(alg, diff_src_md, dst_md, strides, kernel, pad_l, pad_r); + auto pdesc = dnnl::pooling_backward::primitive_desc(bwd_desc, cpu_engine, fwd_pd); + + DNNLPoolingBwd bwd(pdesc, with_workspace); + it = AddToCache(&pooling_bwds, key, bwd); + } + return it->second; +} + +void DNNLPoolingGradCompute(const OpContext& ctx, + const PoolingParam& param, + const NDArray& out_grad, + const NDArray& in_data, + const NDArray* workspace, + const OpReqType req, + const NDArray& in_grad) { + if (req == kNullOp) { + return; + } + + TmpMemMgr::Get()->Init(ctx.requested[0]); + + auto& bwd = GetPoolingBwd(param, in_data, in_grad, out_grad); + auto diff_dst_mem = out_grad.GetDNNLDataReorder(bwd.pd.diff_dst_desc()); + auto diff_src_mem = CreateDNNLMem(in_grad, bwd.pd.diff_src_desc(), req); + dnnl_args_map_t args = { + {DNNL_ARG_DIFF_DST, *diff_dst_mem}, + {DNNL_ARG_DIFF_SRC, *diff_src_mem.second}, + }; + if (DNNLRequireWorkspace(param) && workspace != nullptr) { + args[DNNL_ARG_WORKSPACE] = *(workspace->GetDNNLData()); + } + + DNNLStream::Get()->RegisterPrimArgs(bwd.GetBwd(), args); + CommitOutput(in_grad, diff_src_mem); + DNNLStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_ONEDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_reshape-inl.h b/src/operator/nn/dnnl/dnnl_reshape-inl.h similarity index 62% rename from src/operator/nn/mkldnn/mkldnn_reshape-inl.h rename to src/operator/nn/dnnl/dnnl_reshape-inl.h index cab6ec18117a..a814c1d958d0 100644 --- a/src/operator/nn/mkldnn/mkldnn_reshape-inl.h +++ b/src/operator/nn/dnnl/dnnl_reshape-inl.h @@ -18,31 +18,30 @@ */ /*! 
- * \file mkldnn_reshape-inl.h - * \brief Function definition of mkldnn reshape operator + * \file dnnl_reshape-inl.h + * \brief Function definition of dnnl reshape operator */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_RESHAPE_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_RESHAPE_INL_H_ #if MXNET_USE_ONEDNN == 1 #include #include "../../tensor/matrix_op-inl.h" - -#include "mkldnn_base-inl.h" +#include "dnnl_base-inl.h" namespace mxnet { namespace op { -class MKLDNNReshapeFwd { +class DNNLReshapeFwd { protected: - std::shared_ptr out_; - std::shared_ptr temp_; - std::vector prims_; + std::shared_ptr out_; + std::shared_ptr temp_; + std::vector prims_; public: - MKLDNNReshapeFwd(const OpReqType& req, const NDArray& input, const NDArray& output); + DNNLReshapeFwd(const OpReqType& req, const NDArray& input, const NDArray& output); int GetWorkspaceSize(); void Execute(const NDArray& input, const NDArray& output, @@ -50,12 +49,12 @@ class MKLDNNReshapeFwd { void* workspace = nullptr); }; -typedef OpSignature MKLDNNReshapeSignature; -MKLDNNReshapeFwd& GetReshapeForward(const OpReqType& req, - const NDArray& input, - const NDArray& output); +typedef OpSignature DNNLReshapeSignature; +DNNLReshapeFwd& GetReshapeForward(const OpReqType& req, + const NDArray& input, + const NDArray& output); } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_RESHAPE_INL_H_ diff --git a/src/operator/nn/dnnl/dnnl_reshape.cc b/src/operator/nn/dnnl/dnnl_reshape.cc new file mode 100644 index 000000000000..5d2591916271 --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_reshape.cc @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file dnnl_reshape.cc + * \brief Implement reshape operator via DNNL reorder primitive + * \author Tao Lv + */ + +#if MXNET_USE_ONEDNN == 1 +#include "../../tensor/elemwise_unary_op.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" +#include "./dnnl_reshape-inl.h" + +namespace mxnet { +namespace op { + +bool SupportDNNLReshape(const NDArray& input, const NDArray& output) { + const int input_ndims = input.shape().ndim(); + const int output_ndims = output.shape().ndim(); + return input.shape().Size() > 0 && input_ndims >= 1 && input_ndims <= 6 && output_ndims >= 1 && + output_ndims <= 6 && IsDNNLType(input.dtype()); +} + +DNNLReshapeFwd::DNNLReshapeFwd(const OpReqType& req, const NDArray& input, const NDArray& output) { + const auto engine = CpuEngine::Get()->get_engine(); + auto in_mem = input.GetDNNLData(); + + // Create temp memory + auto temp_dims = dnnl::memory::dims(input.shape().begin(), input.shape().end()); + auto temp_type = static_cast(get_dnnl_type(input.dtype())); + auto temp_fmt = static_cast(GetDefaultFormat(input.shape().ndim())); + auto temp_desc = dnnl::memory::desc(temp_dims, temp_type, temp_fmt); + + out_ = std::make_shared(temp_desc, engine, nullptr); + if (req == kWriteInplace) { + // If the input has DNNL internal layout, we need reorder it to a temporal buffer with + // default layout and copy from the temporal buffer back to output buffer which has the same + // address with input buffer. + // If the input has default layout, then nothing need to do. + if (input.IsDNNLData()) { + temp_ = std::make_shared(temp_desc, engine, nullptr); + prims_.push_back(dnnl::reorder(*in_mem, *temp_)); // reorder to default + prims_.push_back(dnnl::reorder(*temp_, *out_)); // copy back + } + } else if (req == kWriteTo) { + prims_.push_back(dnnl::reorder(*in_mem, *out_)); + } else { + LOG(FATAL) << "not supported req type: " << req; + } +} + +int DNNLReshapeFwd::GetWorkspaceSize() { + return temp_ ? 
temp_->get_desc().get_size() : 0; +} + +void DNNLReshapeFwd::Execute(const NDArray& input, + const NDArray& output, + const OpReqType& req, + void* workspace) { + auto stream = DNNLStream::Get(); + auto in_mem = input.GetDNNLData(); + // register primitives and arguments + std::vector args_map; + size_t prims_size = prims_.size(); + if (prims_size == 1) { + args_map.push_back({{DNNL_ARG_FROM, *in_mem}, {DNNL_ARG_TO, *output.GetDNNLData()}}); + } else if (prims_size == 2) { + if (workspace) { + temp_->set_data_handle(workspace); + } + args_map.push_back({{DNNL_ARG_FROM, *in_mem}, {DNNL_ARG_TO, *temp_}}); + args_map.push_back({{DNNL_ARG_FROM, *temp_}, {DNNL_ARG_TO, *output.GetDNNLData()}}); + } else { + CHECK(prims_size == 0 && req != kWriteTo) << "kWriteTo should never reach here."; + } + + for (size_t i = 0; i < prims_size; i++) { + stream->RegisterPrimArgs(prims_[i], args_map[i]); + } + stream->Submit(); + // invalidate dnnl memory in output + const_cast(output).InvalidateDNNLData(); +} + +DNNLReshapeFwd& GetReshapeForward(const OpReqType& req, + const NDArray& input, + const NDArray& output) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map fwds; +#else + static MX_THREAD_LOCAL std::unordered_map fwds; +#endif + DNNLReshapeSignature key; + key.AddSign(req); + key.AddSign(input); + + auto it = fwds.find(key); + if (it == fwds.end()) { + DNNLReshapeFwd fwd(req, input, output); + it = AddToCache(&fwds, key, fwd); + } + return it->second; +} + +void DNNLReshapeForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& input, + const OpReqType& req, + const NDArray& output) { + if (req == kNullOp) + return; + CHECK_NE(req, kAddTo) << "kAddTo is not supported yet"; + auto fwd = GetReshapeForward(req, input, output); + auto ws_size = fwd.GetWorkspaceSize(); + void* ws_ptr = nullptr; + if (ws_size) { + mshadow::Stream* s = ctx.get_stream(); + mshadow::Tensor ws = + ctx.requested[0].get_space_typed(mshadow::Shape1(ws_size), s); + ws_ptr = static_cast(ws.dptr_); + } + fwd.Execute(input, output, req, ws_ptr); +} + +} // namespace op +} // namespace mxnet +#endif diff --git a/src/operator/nn/mkldnn/mkldnn_rnn-inl.h b/src/operator/nn/dnnl/dnnl_rnn-inl.h similarity index 61% rename from src/operator/nn/mkldnn/mkldnn_rnn-inl.h rename to src/operator/nn/dnnl/dnnl_rnn-inl.h index dee8213ee9a8..bd2a63f7a908 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn-inl.h +++ b/src/operator/nn/dnnl/dnnl_rnn-inl.h @@ -18,28 +18,27 @@ */ /*! 
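Aside: dnnl_reshape.cc above implements reshape as layout normalization — once the data sits in the default (plain) layout, the reshape itself is only a reinterpretation of the same buffer, and the kWriteInplace path has to bounce through a temporary buffer because source and destination share storage. A self-contained sketch of that double reorder, assuming the oneDNN 2.x API and an illustrative nChw8c blocked layout rather than whatever layout the real input carries:

// Sketch: in-place layout normalization via a temporary plain-layout buffer.
// oneDNN 2.x assumed; the nChw8c tag and shapes are illustrative, not from the PR.
#include <dnnl.hpp>
#include <vector>

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  memory::dims shape = {1, 8, 4, 4};
  std::vector<float> buf(1 * 8 * 4 * 4, 0.f);  // storage shared by "input" and "output"

  auto blocked_md = memory::desc(shape, memory::data_type::f32, memory::format_tag::nChw8c);
  auto plain_md   = memory::desc(shape, memory::data_type::f32, memory::format_tag::nchw);

  auto src = memory(blocked_md, eng, buf.data());  // input view: oneDNN internal layout
  auto tmp = memory(plain_md, eng);                // temporary with its own storage
  auto dst = memory(plain_md, eng, buf.data());    // output view: same address, plain layout

  reorder(src, tmp).execute(strm, src, tmp);  // blocked -> plain, into the temporary
  reorder(tmp, dst).execute(strm, tmp, dst);  // copy back over the shared storage
  strm.wait();
  return 0;
}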
- * \file mkldnn_rnn-inl.h - * \brief Common functions used by MKLDNN RNN operator + * \file dnnl_rnn-inl.h + * \brief Common functions used by DNNL RNN operator * \author Zixuan Wei */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RNN_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RNN_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_RNN_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_RNN_INL_H_ #if MXNET_USE_ONEDNN == 1 #include -#include "./mkldnn_base-inl.h" - #include "../../rnn-inl.h" +#include "./dnnl_base-inl.h" namespace mxnet { namespace op { -struct MKLDNNRnnLayerParam { - using memory = mkldnn::memory; - using dims = mkldnn::memory::dims; +struct DNNLRnnLayerParam { + using memory = dnnl::memory; + using dims = dnnl::memory::dims; int mode; bool bidirectional; @@ -60,21 +59,21 @@ struct MKLDNNRnnLayerParam { dims state_dims; // Dimensions of the state cell in format_tag::ldnc dims cell_dims; // Dimensions of LSTM cell state in format_tag::ldnc - size_t workspace_size; // used for the cached mkl-dnn memory in Forward inference + size_t workspace_size; // used for the cached DNNL memory in Forward inference size_t reserve_size; // used for the reserved cached memory in Backward size_t single_w_size; // weights size of a single cell - size_t single_b_size; // bias size of a single cell from mkl-dnn + size_t single_b_size; // bias size of a single cell from DNNL size_t native_single_b_size; // bias size of a single cell from framework size_t single_state_size; // state size of a single cell, hy, cy - MKLDNNRnnLayerParam(int num_layer, - index_t batch_size, - index_t seq_len, - index_t input_size, - int state_size, - int proj_size, - int mode, - bool bidirectional = true) + DNNLRnnLayerParam(int num_layer, + index_t batch_size, + index_t seq_len, + index_t input_size, + int state_size, + int proj_size, + int mode, + bool bidirectional = true) : mode(mode), bidirectional(bidirectional), state_outputs(true), @@ -88,33 +87,33 @@ struct MKLDNNRnnLayerParam { void SetDims(); }; -typedef std::vector LayerParamVector; -struct MKLDNNRnnFullParam { +typedef std::vector LayerParamVector; +struct DNNLRnnFullParam { RNNParam default_param; LayerParamVector layer_params; }; -MKLDNNRnnFullParam MKLDNNRnnFullParamParser(const RNNParam& rnn_param, - const index_t seq_len, - const index_t batch_size, - const index_t input_size); +DNNLRnnFullParam DNNLRnnFullParamParser(const RNNParam& rnn_param, + const index_t seq_len, + const index_t batch_size, + const index_t input_size); /* - * Use this to allocate memory from MKLDNNRnnOp temporary space. + * Use this to allocate memory from DNNLRnnOp temporary space. */ -class MKLDNNRnnMemMgr { +class DNNLRnnMemMgr { // The memory buffer in NDArray life-cycle NDArray workspace_; // This points to the memory buffer from a NDArray char* curr_mem; - // The total bytes of the workspace of a MKLDNNRnnOp + // The total bytes of the workspace of a DNNLRnnOp size_t mem_size = 0; // The current available memory bytes - size_t curr_size = 0; - const size_t alignment = kMKLDNNAlign; - const mkldnn::engine& cpu_engine = CpuEngine::Get()->get_engine(); + size_t curr_size = 0; + const size_t alignment = kDNNLAlign; + const dnnl::engine& cpu_engine = CpuEngine::Get()->get_engine(); // Here we hold all memory related to the stateful RNN operators - std::vector > mem_holder; + std::vector > mem_holder; public: /*! 
@@ -129,11 +128,11 @@ class MKLDNNRnnMemMgr { return mem_size; } - void RegisterMem(std::shared_ptr mem) { + void RegisterMem(std::shared_ptr mem) { mem_holder.push_back(mem); } - mkldnn::memory* Alloc(const mkldnn::memory::desc& md); + dnnl::memory* Alloc(const dnnl::memory::desc& md); }; /* @@ -159,7 +158,7 @@ class RnnPrimitive { rnn_fwd_prim.weights_proj_desc_ = fwd_pd->weights_projection_desc(); rnn_fwd_prim.workspace_desc_ = fwd_pd->workspace_desc(); - rnn_fwd_prim.primitive_ = std::shared_ptr(new rnn_fwd(*fwd_pd)); + rnn_fwd_prim.primitive_ = std::shared_ptr(new rnn_fwd(*fwd_pd)); return rnn_fwd_prim; } @@ -167,10 +166,10 @@ class RnnPrimitive { RnnPrimitive() { this->fwd_pd_ = nullptr; this->primitive_ = nullptr; - this->weights_layer_desc_ = mkldnn::memory::desc(); - this->weights_iter_desc_ = mkldnn::memory::desc(); - this->weights_proj_desc_ = mkldnn::memory::desc(); - this->workspace_desc_ = mkldnn::memory::desc(); + this->weights_layer_desc_ = dnnl::memory::desc(); + this->weights_iter_desc_ = dnnl::memory::desc(); + this->weights_proj_desc_ = dnnl::memory::desc(); + this->workspace_desc_ = dnnl::memory::desc(); } RnnPrimitive(const RnnPrimitive& rnn_fwd_prim) { @@ -198,50 +197,50 @@ class RnnPrimitive { const void* GetPrimDesc() const { return fwd_pd_.get(); } - const mkldnn::primitive& GetPrim() const { + const dnnl::primitive& GetPrim() const { return *primitive_; } - const mkldnn::memory::desc& GetLayerDesc() const { + const dnnl::memory::desc& GetLayerDesc() const { return weights_layer_desc_; } - const mkldnn::memory::desc& GetIterDesc() const { + const dnnl::memory::desc& GetIterDesc() const { return weights_iter_desc_; } - const mkldnn::memory::desc& GetProjDesc() const { + const dnnl::memory::desc& GetProjDesc() const { return weights_proj_desc_; } - const mkldnn::memory::desc& GetWorkspaceDesc() const { + const dnnl::memory::desc& GetWorkspaceDesc() const { return workspace_desc_; } private: std::shared_ptr fwd_pd_; - std::shared_ptr primitive_; - mkldnn::memory::desc weights_layer_desc_; - mkldnn::memory::desc weights_iter_desc_; - mkldnn::memory::desc weights_proj_desc_; - mkldnn::memory::desc workspace_desc_; + std::shared_ptr primitive_; + dnnl::memory::desc weights_layer_desc_; + dnnl::memory::desc weights_iter_desc_; + dnnl::memory::desc weights_proj_desc_; + dnnl::memory::desc workspace_desc_; }; -RnnPrimitive GetRnnFwdPrim(const MKLDNNRnnLayerParam& layer_param, +RnnPrimitive GetRnnFwdPrim(const DNNLRnnLayerParam& layer_param, const bool is_train, const NDArray& data, const NDArray& params); /* - * Use this to manage memory and primitive of MKL-DNN RNN forward inference. + * Use this to manage memory and primitive of DNNL RNN forward inference. 
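Aside: DNNLRnnMemMgr::Alloc, declared above, hands out aligned slices of one workspace buffer that is allocated up front, and the implementation later in this patch falls back to a regular oneDNN allocation when the workspace is exhausted. A simplified sketch of that bump-allocation arithmetic, using a plain std::vector<char> instead of the NDArray-backed workspace and an illustrative 64-byte alignment in place of kDNNLAlign:

// Sketch: bump allocator that returns aligned slices of one pre-allocated workspace.
// A plain byte vector stands in for the NDArray workspace; 64 is a stand-in for kDNNLAlign.
#include <cstddef>
#include <cstdint>
#include <vector>

class BumpAllocator {
 public:
  explicit BumpAllocator(size_t bytes) : buf_(bytes), curr_(buf_.data()), left_(bytes) {}

  // Returns an aligned pointer inside the workspace, or nullptr when the request
  // does not fit (the real manager then lets oneDNN allocate its own buffer).
  void* Alloc(size_t bytes, size_t alignment = 64) {
    uintptr_t addr = reinterpret_cast<uintptr_t>(curr_);
    size_t padding = (alignment - addr % alignment) % alignment;
    if (bytes + padding > left_) return nullptr;
    curr_ += padding;
    void* ret = curr_;
    curr_ += bytes;
    left_ -= bytes + padding;
    return ret;
  }

 private:
  std::vector<char> buf_;
  char* curr_;
  size_t left_;
};

int main() {
  BumpAllocator mgr(1 << 20);
  void* w = mgr.Alloc(4096);
  void* b = mgr.Alloc(256);
  return (w && b) ? 0 : 1;
}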
*/ -class MKLDNNRnnForward { +class DNNLRnnForward { public: - MKLDNNRnnForward(const Context ctx, - const MKLDNNRnnLayerParam& layer_param, - const bool is_train, - const NDArray& data, - const NDArray& params) + DNNLRnnForward(const Context ctx, + const DNNLRnnLayerParam& layer_param, + const bool is_train, + const NDArray& data, + const NDArray& params) : ctx_(ctx), initialized_(false), param_(layer_param), @@ -260,7 +259,7 @@ class MKLDNNRnnForward { const int dtype = mshadow::kFloat32); void ReorderWeights(); - const mkldnn::primitive& GetFwd() const { + const dnnl::primitive& GetFwd() const { return fwd_inf_.GetPrim(); } @@ -270,11 +269,11 @@ class MKLDNNRnnForward { return size; } - const MKLDNNRnnLayerParam& GetParam() const { + const DNNLRnnLayerParam& GetParam() const { return param_; } - const mkldnn_args_map_t& GetArgsMap() const { + const dnnl_args_map_t& GetArgsMap() const { return net_args_; } @@ -288,79 +287,79 @@ class MKLDNNRnnForward { private: Context ctx_; bool initialized_; - MKLDNNRnnLayerParam param_; + DNNLRnnLayerParam param_; RnnPrimitive fwd_inf_; // forward inference primitive - MKLDNNRnnMemMgr mem_mgr_; - mkldnn::memory* weights_layer_ = nullptr; - mkldnn::memory* weights_iter_ = nullptr; - mkldnn::memory* weights_proj_ = nullptr; - mkldnn::memory* bias_ = nullptr; + DNNLRnnMemMgr mem_mgr_; + dnnl::memory* weights_layer_ = nullptr; + dnnl::memory* weights_iter_ = nullptr; + dnnl::memory* weights_proj_ = nullptr; + dnnl::memory* bias_ = nullptr; - mkldnn::memory* weights_layer_r_ = nullptr; - mkldnn::memory* weights_iter_r_ = nullptr; - mkldnn::memory* weights_proj_r_ = nullptr; + dnnl::memory* weights_layer_r_ = nullptr; + dnnl::memory* weights_iter_r_ = nullptr; + dnnl::memory* weights_proj_r_ = nullptr; /* * net_args must contain some keys as below: - * MKLDNN_ARG_SRC - * MKLDNN_ARG_SRC_ITER - * MKLDNN_WEIGHTS_LAYER - * MKLDNN_WEIGHTS_ITER - * MKLDNN_BIAS - * MKLDNN_ARG_DST - * MKLDNN_ARG_DST_ITER + * DNNL_ARG_SRC + * DNNL_ARG_SRC_ITER + * DNNL_WEIGHTS_LAYER + * DNNL_WEIGHTS_ITER + * DNNL_BIAS + * DNNL_ARG_DST + * DNNL_ARG_DST_ITER * if mode == Lstm, it also needs two additional key: - * MKLDNN_ARG_SRC_ITER_C - * MKLDNN_ARG_DST_ITER_C + * DNNL_ARG_SRC_ITER_C + * DNNL_ARG_DST_ITER_C */ - mkldnn_args_map_t net_args_; + dnnl_args_map_t net_args_; - friend class MKLDNNRnnForwardTraining; + friend class DNNLRnnForwardTraining; }; -typedef std::shared_ptr mkldnn_shared_mem_t; +typedef std::shared_ptr dnnl_shared_mem_t; /* - * Use this to manage memory and primitive of MKL-DNN RNN forward training. + * Use this to manage memory and primitive of DNNL RNN forward training. 
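Aside: net_args_ above is the standard oneDNN execution-argument map — integer argument tags (DNNL_ARG_SRC, DNNL_ARG_WEIGHTS_LAYER, ...) paired with memory objects, filled once and handed to the primitive on every run. The sketch below shows the same map-then-execute pattern with an eltwise ReLU primitive, chosen only because it needs two arguments; the RNN primitives in this file are driven the same way (oneDNN 2.x API assumed).

// Sketch: assemble a tag -> memory argument map and execute a primitive with it.
// An eltwise ReLU stands in for the lstm/gru/vanilla_rnn primitives used in the PR.
#include <dnnl.hpp>
#include <unordered_map>
#include <vector>

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  memory::dims dims = {2, 8};
  std::vector<float> data(2 * 8, -1.0f);
  auto md  = memory::desc(dims, memory::data_type::f32, memory::format_tag::nc);
  auto src = memory(md, eng, data.data());
  auto dst = memory(md, eng);

  auto d  = eltwise_forward::desc(prop_kind::forward_inference, algorithm::eltwise_relu, md, 0.f);
  auto pd = eltwise_forward::primitive_desc(d, eng);

  // Same idea as net_args_: keys are DNNL_ARG_* tags, values are memory objects.
  std::unordered_map<int, memory> args = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};
  eltwise_forward(pd).execute(strm, args);
  strm.wait();
  return 0;
}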
*/ -class MKLDNNRnnForwardTraining { +class DNNLRnnForwardTraining { public: - MKLDNNRnnForwardTraining(const MKLDNNRnnLayerParam& layer_param, - const bool is_train, - const NDArray& data, - const NDArray& params) + DNNLRnnForwardTraining(const DNNLRnnLayerParam& layer_param, + const bool is_train, + const NDArray& data, + const NDArray& params) : fwd_trn_(GetRnnFwdPrim(layer_param, is_train, data, params)), param_(&layer_param) {} - void SetTrnMem(const MKLDNNRnnForward& fwd); - void FetchData(const MKLDNNRnnForward& fwd); + void SetTrnMem(const DNNLRnnForward& fwd); + void FetchData(const DNNLRnnForward& fwd); - const MKLDNNRnnLayerParam& GetParam() const { + const DNNLRnnLayerParam& GetParam() const { return *param_; } const void* GetPrimDesc() const { return fwd_trn_.GetPrimDesc(); } - const mkldnn::primitive& GetFwd() const { + const dnnl::primitive& GetFwd() const { return fwd_trn_.GetPrim(); } - const mkldnn_args_map_t& GetArgsMap() const { + const dnnl_args_map_t& GetArgsMap() const { return net_args_; } private: RnnPrimitive fwd_trn_; - const MKLDNNRnnLayerParam* param_; + const DNNLRnnLayerParam* param_; - mkldnn_shared_mem_t weights_layer_ = nullptr; - mkldnn_shared_mem_t weights_iter_ = nullptr; - mkldnn::memory* bias_ = nullptr; + dnnl_shared_mem_t weights_layer_ = nullptr; + dnnl_shared_mem_t weights_iter_ = nullptr; + dnnl::memory* bias_ = nullptr; - mkldnn_shared_mem_t workspace_ = nullptr; + dnnl_shared_mem_t workspace_ = nullptr; - // Key MKLDNN_ARGS_WORKSPACE must be included in forward training - mkldnn_args_map_t net_args_; + // Key DNNL_ARGS_WORKSPACE must be included in forward training + dnnl_args_map_t net_args_; - friend class MKLDNNRnnBackward; + friend class DNNLRnnBackward; }; /* @@ -379,18 +378,18 @@ class RnnBwdPrimitive { bwd_prim.diff_weights_iter_desc_ = bwd_pd.diff_weights_iter_desc(); bwd_prim.diff_bias_desc_ = bwd_pd.diff_bias_desc(); - bwd_prim.primitive_ = std::shared_ptr(new rnn_bwd(bwd_pd)); + bwd_prim.primitive_ = std::shared_ptr(new rnn_bwd(bwd_pd)); return bwd_prim; } RnnBwdPrimitive() { this->primitive_ = nullptr; - this->weights_layer_desc_ = mkldnn::memory::desc(); - this->weights_iter_desc_ = mkldnn::memory::desc(); - this->diff_weights_layer_desc_ = mkldnn::memory::desc(); - this->diff_weights_iter_desc_ = mkldnn::memory::desc(); - this->diff_bias_desc_ = mkldnn::memory::desc(); + this->weights_layer_desc_ = dnnl::memory::desc(); + this->weights_iter_desc_ = dnnl::memory::desc(); + this->diff_weights_layer_desc_ = dnnl::memory::desc(); + this->diff_weights_iter_desc_ = dnnl::memory::desc(); + this->diff_bias_desc_ = dnnl::memory::desc(); } RnnBwdPrimitive(const RnnBwdPrimitive& bwd) { @@ -416,27 +415,27 @@ class RnnBwdPrimitive { } private: - std::shared_ptr primitive_; - mkldnn::memory::desc weights_layer_desc_; - mkldnn::memory::desc weights_iter_desc_; - mkldnn::memory::desc diff_weights_layer_desc_; - mkldnn::memory::desc diff_weights_iter_desc_; - mkldnn::memory::desc diff_bias_desc_; - friend class MKLDNNRnnBackward; + std::shared_ptr primitive_; + dnnl::memory::desc weights_layer_desc_; + dnnl::memory::desc weights_iter_desc_; + dnnl::memory::desc diff_weights_layer_desc_; + dnnl::memory::desc diff_weights_iter_desc_; + dnnl::memory::desc diff_bias_desc_; + friend class DNNLRnnBackward; }; -RnnBwdPrimitive GetRnnBwdPrim(const MKLDNNRnnForwardTraining& fwd, +RnnBwdPrimitive GetRnnBwdPrim(const DNNLRnnForwardTraining& fwd, const NDArray& data, const NDArray& params); /* - * Use this to manage memory and primitive of MKL-DNN RNN 
backward. + * Use this to manage memory and primitive of DNNL RNN backward. */ -class MKLDNNRnnBackward { +class DNNLRnnBackward { public: - MKLDNNRnnBackward(const MKLDNNRnnForwardTraining& fwd, const NDArray& data, const NDArray& params) + DNNLRnnBackward(const DNNLRnnForwardTraining& fwd, const NDArray& data, const NDArray& params) : bwd_(GetRnnBwdPrim(fwd, data, params)), fwd_ptr_(&fwd) {} - void FetchDataWeightsMem(const MKLDNNRnnForwardTraining& fwd); + void FetchDataWeightsMem(const DNNLRnnForwardTraining& fwd); void SetWeightsGradsMem(); void SetDataGradsMem(void* diff_src, void* diff_state, @@ -451,45 +450,45 @@ class MKLDNNRnnBackward { const OpReqType req, const int dtype = mshadow::kFloat32); - const mkldnn::primitive& GetBwd() const { + const dnnl::primitive& GetBwd() const { return *bwd_.primitive_; } - const mkldnn_args_map_t& GetArgsMap() const { + const dnnl_args_map_t& GetArgsMap() const { return net_args_; } private: RnnBwdPrimitive bwd_; - const MKLDNNRnnForwardTraining* fwd_ptr_; + const DNNLRnnForwardTraining* fwd_ptr_; - mkldnn_shared_mem_t weights_layer_; - mkldnn_shared_mem_t weights_iter_; + dnnl_shared_mem_t weights_layer_; + dnnl_shared_mem_t weights_iter_; - mkldnn_shared_mem_t diff_weights_layer_; - mkldnn_shared_mem_t diff_weights_iter_; - mkldnn_shared_mem_t diff_weights_layer_r_; - mkldnn_shared_mem_t diff_weights_iter_r_; - mkldnn_shared_mem_t diff_bias_; + dnnl_shared_mem_t diff_weights_layer_; + dnnl_shared_mem_t diff_weights_iter_; + dnnl_shared_mem_t diff_weights_layer_r_; + dnnl_shared_mem_t diff_weights_iter_r_; + dnnl_shared_mem_t diff_bias_; - mkldnn_args_map_t net_args_; + dnnl_args_map_t net_args_; }; /* - * Use MKLDNNRnnOp to manage fused or unfused RNN layers. A MKLDNNRnnOp contains + * Use DNNLRnnOp to manage fused or unfused RNN layers. A DNNLRnnOp contains * the parameter(s) and primitive(s) of RNN layer(s). According to the direction, * input size, and state size, multple layers could be inplemented by unfused and - * fused layers - MKLDNNRnnForward, which holds the memory and forward primitive - * of MKL-DNN. + * fused layers - DNNLRnnForward, which holds the memory and forward primitive + * of DNNL. 
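Aside: the DNNLRnnOp comment above describes the fusion strategy — a stacked RNN is split into one or more DNNLRnnForward layers because, in the usual stacked-RNN layout, the first layer reads the raw input width while every later layer reads the previous layer's hidden output scaled by the direction count. A small, generic sketch of that decomposition; plain structs and sizes only, not the PR's DNNLRnnFullParamParser logic:

// Sketch: decompose a stacked RNN into per-layer shapes (generic, illustrative only).
#include <cstdio>
#include <vector>

struct LayerShape {
  int input_size;
  int state_size;
};

std::vector<LayerShape> SplitLayers(int num_layers, int input_size, int state_size,
                                    bool bidirectional) {
  const int directions = bidirectional ? 2 : 1;
  std::vector<LayerShape> layers;
  layers.push_back({input_size, state_size});  // first layer sees the raw input
  for (int i = 1; i < num_layers; ++i)
    layers.push_back({directions * state_size, state_size});  // later layers see hidden output
  return layers;
}

int main() {
  for (const auto& l : SplitLayers(3, 512, 256, /*bidirectional=*/true))
    std::printf("in=%d state=%d\n", l.input_size, l.state_size);
  return 0;
}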
*/ -class MKLDNNRnnOp { +class DNNLRnnOp { public: - explicit MKLDNNRnnOp(const RNNParam& param, - const int seq_len, - const int batch_size, - const int input_size) + explicit DNNLRnnOp(const RNNParam& param, + const int seq_len, + const int batch_size, + const int input_size) : initialized_(false), weights_version_(0), - full_param_(MKLDNNRnnFullParamParser(param, seq_len, batch_size, input_size)) {} + full_param_(DNNLRnnFullParamParser(param, seq_len, batch_size, input_size)) {} void Forward(const OpContext& ctx, const std::vector& inputs, @@ -508,17 +507,17 @@ class MKLDNNRnnOp { private: bool initialized_; size_t weights_version_; - MKLDNNRnnFullParam full_param_; - MKLDNNRnnMemMgr mgr_; - std::vector fwd_inf_vec_; // forward inference layers - std::vector fwd_trn_vec_; // forward training layers - std::vector bwd_vec_; // backward layers + DNNLRnnFullParam full_param_; + DNNLRnnMemMgr mgr_; + std::vector fwd_inf_vec_; // forward inference layers + std::vector fwd_trn_vec_; // forward training layers + std::vector bwd_vec_; // backward layers // Used to store the intermediate results of multi-layer - std::vector dst_; + std::vector dst_; // Used to store the intermediate diff_src of multi_layer - mkldnn_shared_mem_t diff_src; + dnnl_shared_mem_t diff_src; void Init(const OpContext& ctx, const std::vector& inputs, @@ -526,21 +525,21 @@ class MKLDNNRnnOp { const std::vector& outputs); }; -inline bool SupportMKLDNNRnn(const int input_dtype) { +inline bool SupportDNNLRnn(const int input_dtype) { if (input_dtype == mshadow::kFloat32 && dmlc::GetEnv("MXNET_USE_ONEDNN_RNN", 1)) { return true; } return false; } -inline bool SupportMKLDNNRnn(const RNNParam& param, const int input_dtype) { +inline bool SupportDNNLRnn(const RNNParam& param, const int input_dtype) { if (param.use_sequence_length) return false; - return SupportMKLDNNRnn(input_dtype); + return SupportDNNLRnn(input_dtype); } } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RNN_INL_H_ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_RNN_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_rnn.cc b/src/operator/nn/dnnl/dnnl_rnn.cc similarity index 76% rename from src/operator/nn/mkldnn/mkldnn_rnn.cc rename to src/operator/nn/dnnl/dnnl_rnn.cc index b113969ee84c..844bad99c845 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn.cc +++ b/src/operator/nn/dnnl/dnnl_rnn.cc @@ -18,8 +18,8 @@ */ /*! - * \file mkldnn_rnn.cc - * \brief Common functions used by MKLDNN RNN operator + * \file dnnl_rnn.cc + * \brief Common functions used by DNNL RNN operator * \author Zixuan Wei */ @@ -27,7 +27,7 @@ #include -#include "./mkldnn_rnn-inl.h" +#include "./dnnl_rnn-inl.h" namespace mxnet { namespace op { @@ -47,7 +47,7 @@ inline int GetRnnGatesNum(int mode) { } } -void MKLDNNRnnLayerParam::SetDims() { +void DNNLRnnLayerParam::SetDims() { const int ngates = GetRnnGatesNum(mode); //* NOTES: LBR-GRU's new gate formula needs two bias. So it has one more bias with LBR-GRU const int nbias = mode == rnn_enum::kGru ? 
(ngates + 1) : ngates; @@ -87,11 +87,11 @@ void MKLDNNRnnLayerParam::SetDims() { reserve_size = 0; } -MKLDNNRnnFullParam MKLDNNRnnFullParamParser(const RNNParam& rnn_param, - const index_t seq_len, - const index_t batch_size, - const index_t input_size) { - MKLDNNRnnFullParam full_param; +DNNLRnnFullParam DNNLRnnFullParamParser(const RNNParam& rnn_param, + const index_t seq_len, + const index_t batch_size, + const index_t input_size) { + DNNLRnnFullParam full_param; full_param.default_param = rnn_param; const int state_size = rnn_param.state_size; const int proj_size = @@ -103,7 +103,7 @@ MKLDNNRnnFullParam MKLDNNRnnFullParamParser(const RNNParam& rnn_param, full_param.default_param.seq_length_ = seq_len; full_param.default_param.batch_size_ = batch_size; full_param.default_param.input_size_ = input_size; - // Set basic size by constructing MKLDNNRnnLayerParam instance(s) + // Set basic size by constructing DNNLRnnLayerParam instance(s) if (rnn_param.bidirectional) { // unfused bidirectional multi-layer RNN layer_params.emplace_back( 1, batch_size, seq_len, input_size, state_size, proj_size, rnn_param.mode); @@ -142,21 +142,21 @@ MKLDNNRnnFullParam MKLDNNRnnFullParamParser(const RNNParam& rnn_param, return full_param; } -void MKLDNNRnnMemMgr::Init(dim_t size, const Context& ctx) { +void DNNLRnnMemMgr::Init(dim_t size, const Context& ctx) { workspace_ = NDArray(TShape({size}), ctx, false, mshadow::kUint8); if (workspace_.data().dptr_ == nullptr) - LOG(FATAL) << "MKLDNN RNN operator memory allocation error."; + LOG(FATAL) << "DNNL RNN operator memory allocation error."; curr_mem = static_cast(workspace_.data().dptr_); mem_size = size; curr_size = size; } -mkldnn::memory* MKLDNNRnnMemMgr::Alloc(const mkldnn::memory::desc& md) { +dnnl::memory* DNNLRnnMemMgr::Alloc(const dnnl::memory::desc& md) { if (curr_mem == nullptr) { curr_mem = static_cast(workspace_.data().dptr_); } - mkldnn_mem_ptr ret(new mkldnn::memory()); + dnnl_mem_ptr ret(new dnnl::memory()); size_t addr = reinterpret_cast(curr_mem); size_t last_chunk = addr % alignment; size_t padding = alignment - last_chunk; @@ -165,28 +165,28 @@ mkldnn::memory* MKLDNNRnnMemMgr::Alloc(const mkldnn::memory::desc& md) { curr_size -= (md.get_size() + padding); if (curr_size < 0) { - ret.reset(new mkldnn::memory(md, cpu_engine)); + ret.reset(new dnnl::memory(md, cpu_engine)); } else { curr_mem += (md.get_size() + padding); - ret.reset(new mkldnn::memory(md, cpu_engine, reinterpret_cast(addr))); + ret.reset(new dnnl::memory(md, cpu_engine, reinterpret_cast(addr))); } RegisterMem(ret); return ret.get(); } -RnnPrimitive GetRnnFwdPrim(const MKLDNNRnnLayerParam& layer_param, +RnnPrimitive GetRnnFwdPrim(const DNNLRnnLayerParam& layer_param, const bool is_train, const NDArray& data, const NDArray& params) { - using namespace mkldnn; - using tag = mkldnn::memory::format_tag; + using namespace dnnl; + using tag = dnnl::memory::format_tag; const int mode = layer_param.mode; - memory::data_type data_type = get_mkldnn_type(data.dtype()); - memory::data_type weight_type = get_mkldnn_type(params.dtype()); + memory::data_type data_type = get_dnnl_type(data.dtype()); + memory::data_type weight_type = get_dnnl_type(params.dtype()); const prop_kind prop = is_train ? prop_kind::forward_training : prop_kind::forward_inference; - const rnn_direction mkldnn_rnn_direction = layer_param.bidirectional - ? rnn_direction::bidirectional_concat - : rnn_direction::unidirectional; + const rnn_direction dnnl_rnn_direction = layer_param.bidirectional + ? 
rnn_direction::bidirectional_concat + : rnn_direction::unidirectional; auto src_layer_desc = memory::desc(layer_param.src_dims, data_type, tag::tnc); auto weight_layer_desc = memory::desc(layer_param.weight_layer_dims, weight_type, tag::any); @@ -197,20 +197,20 @@ RnnPrimitive GetRnnFwdPrim(const MKLDNNRnnLayerParam& layer_param, auto src_cell_desc = memory::desc(layer_param.cell_dims, data_type, tag::ldnc); auto weight_peep_desc = memory::desc(); auto weight_proj_desc = layer_param.proj_size > 0 - ? memory::desc(layer_param.weight_proj_dims, weight_type, tag::any) - : memory::desc(); - auto dst_state_desc = layer_param.state_outputs - ? memory::desc(layer_param.state_dims, data_type, tag::ldnc) - : memory::desc(); - auto dst_cell_desc = layer_param.state_outputs - ? memory::desc(layer_param.cell_dims, data_type, tag::ldnc) - : memory::desc(); + ? memory::desc(layer_param.weight_proj_dims, weight_type, tag::any) + : memory::desc(); + auto dst_state_desc = layer_param.state_outputs + ? memory::desc(layer_param.state_dims, data_type, tag::ldnc) + : memory::desc(); + auto dst_cell_desc = layer_param.state_outputs + ? memory::desc(layer_param.cell_dims, data_type, tag::ldnc) + : memory::desc(); auto fwd = RnnPrimitive(); switch (mode) { case rnn_enum::kLstm: fwd = RnnPrimitive::Create(prop, - mkldnn_rnn_direction, + dnnl_rnn_direction, src_layer_desc, src_state_desc, src_cell_desc, @@ -225,7 +225,7 @@ RnnPrimitive GetRnnFwdPrim(const MKLDNNRnnLayerParam& layer_param, break; case rnn_enum::kGru: fwd = RnnPrimitive::Create(prop, - mkldnn_rnn_direction, + dnnl_rnn_direction, src_layer_desc, src_state_desc, weight_layer_desc, @@ -239,7 +239,7 @@ RnnPrimitive GetRnnFwdPrim(const MKLDNNRnnLayerParam& layer_param, fwd = RnnPrimitive::Create( prop, mode == rnn_enum::kRnnTanh ? algorithm::eltwise_tanh : algorithm::eltwise_relu, - mkldnn_rnn_direction, + dnnl_rnn_direction, src_layer_desc, src_state_desc, weight_layer_desc, @@ -255,19 +255,18 @@ RnnPrimitive GetRnnFwdPrim(const MKLDNNRnnLayerParam& layer_param, return fwd; } -RnnBwdPrimitive GetRnnBwdPrim(const MKLDNNRnnForwardTraining& fwd, +RnnBwdPrimitive GetRnnBwdPrim(const DNNLRnnForwardTraining& fwd, const NDArray& data, const NDArray& params) { - using namespace mkldnn; - using tag = mkldnn::memory::format_tag; - const MKLDNNRnnLayerParam& layer_param = fwd.GetParam(); - const int mode = layer_param.mode; - memory::data_type data_type = get_mkldnn_type(data.dtype()); - memory::data_type weight_type = get_mkldnn_type(params.dtype()); - const prop_kind prop = prop_kind::backward; - rnn_direction mkldnn_rnn_direction = layer_param.bidirectional - ? rnn_direction::bidirectional_concat - : rnn_direction::unidirectional; + using namespace dnnl; + using tag = dnnl::memory::format_tag; + const DNNLRnnLayerParam& layer_param = fwd.GetParam(); + const int mode = layer_param.mode; + memory::data_type data_type = get_dnnl_type(data.dtype()); + memory::data_type weight_type = get_dnnl_type(params.dtype()); + const prop_kind prop = prop_kind::backward; + rnn_direction dnnl_rnn_direction = layer_param.bidirectional ? 
rnn_direction::bidirectional_concat + : rnn_direction::unidirectional; auto src_layer_desc = memory::desc(layer_param.src_dims, data_type, tag::tnc); auto weight_layer_desc = memory::desc(layer_param.weight_layer_dims, weight_type, tag::any); @@ -276,8 +275,8 @@ RnnBwdPrimitive GetRnnBwdPrim(const MKLDNNRnnForwardTraining& fwd, auto dst_layer_desc = memory::desc(layer_param.dst_dims, data_type, tag::tnc); auto src_state_desc = memory::desc(layer_param.state_dims, data_type, tag::ldnc); auto dst_state_desc = layer_param.state_outputs - ? memory::desc(layer_param.state_dims, data_type, tag::ldnc) - : memory::desc(); + ? memory::desc(layer_param.state_dims, data_type, tag::ldnc) + : memory::desc(); const void* fwd_pd = fwd.GetPrimDesc(); auto bwd = RnnBwdPrimitive(); @@ -287,7 +286,7 @@ RnnBwdPrimitive GetRnnBwdPrim(const MKLDNNRnnForwardTraining& fwd, reinterpret_cast(fwd_pd); bwd = RnnBwdPrimitive::Create(*pd, prop, - mkldnn_rnn_direction, + dnnl_rnn_direction, // data desc src_layer_desc, src_state_desc, @@ -314,7 +313,7 @@ RnnBwdPrimitive GetRnnBwdPrim(const MKLDNNRnnForwardTraining& fwd, reinterpret_cast(fwd_pd); bwd = RnnBwdPrimitive::Create(*pd, prop, - mkldnn_rnn_direction, + dnnl_rnn_direction, // data desc src_layer_desc, src_state_desc, @@ -340,7 +339,7 @@ RnnBwdPrimitive GetRnnBwdPrim(const MKLDNNRnnForwardTraining& fwd, *pd, prop, mode == rnn_enum::kRnnTanh ? algorithm::eltwise_tanh : algorithm::eltwise_relu, - mkldnn_rnn_direction, + dnnl_rnn_direction, // data desc src_layer_desc, src_state_desc, @@ -378,13 +377,13 @@ RnnBwdPrimitive GetRnnBwdPrim(const MKLDNNRnnForwardTraining& fwd, * * All the memory blocks are in goi format. */ -static void ConcatWeights(const mkldnn::memory& dst, +static void ConcatWeights(const dnnl::memory& dst, const int concat_dimension, const std::vector& src_ptrs, - const mkldnn::memory::format_tag src_format) { - using memory = mkldnn::memory; + const dnnl::memory::format_tag src_format) { + using memory = dnnl::memory; auto cpu_engine = dst.get_engine(); - mkldnn::stream s(cpu_engine); + dnnl::stream s(cpu_engine); const memory::desc& dst_desc = dst.get_desc(); // Use dst memory dims to initialize src memory dims, then set the concat // dim to 1. And Rnn weights are 5-dimension tensor. @@ -396,57 +395,57 @@ static void ConcatWeights(const mkldnn::memory& dst, for (size_t i = 0; i < src_ptrs.size(); ++i) { src_descs.emplace_back( src_dims, static_cast(dst_desc.data.data_type), src_format); - concat_args.emplace(MKLDNN_ARG_MULTIPLE_SRC + i, + concat_args.emplace(DNNL_ARG_MULTIPLE_SRC + i, memory(src_descs.back(), cpu_engine, src_ptrs.at(i))); } - concat_args.emplace(MKLDNN_ARG_DST, dst); + concat_args.emplace(DNNL_ARG_DST, dst); auto concat_pd = - mkldnn::concat::primitive_desc(dst.get_desc(), concat_dimension, src_descs, cpu_engine); - mkldnn::concat(concat_pd).execute(s, concat_args); + dnnl::concat::primitive_desc(dst.get_desc(), concat_dimension, src_descs, cpu_engine); + dnnl::concat(concat_pd).execute(s, concat_args); } #define RNN_HANDLE_FUNC_NAME set_handle -#define RNN_HANDLE_FUNC(RNN_FUNC_NAME) \ - auto RNN_FUNC_NAME = [&cpu_engine, &args](int arg_name, const desc& md, void* handle) { \ - if (args.find(arg_name) != args.end()) { \ - if (handle != nullptr) \ - args.at(arg_name).set_data_handle(handle); \ - } else { \ - args[arg_name] = \ - handle ? 
mkldnn::memory(md, cpu_engine, handle) : mkldnn::memory(md, cpu_engine); \ - } \ +#define RNN_HANDLE_FUNC(RNN_FUNC_NAME) \ + auto RNN_FUNC_NAME = [&cpu_engine, &args](int arg_name, const desc& md, void* handle) { \ + if (args.find(arg_name) != args.end()) { \ + if (handle != nullptr) \ + args.at(arg_name).set_data_handle(handle); \ + } else { \ + args[arg_name] = \ + handle ? dnnl::memory(md, cpu_engine, handle) : dnnl::memory(md, cpu_engine); \ + } \ } #define RNN_FWD_SET(NAME, DIMS, TAG, HANDLE, DTYPE) \ RNN_FWD_SET_(RNN_HANDLE_FUNC_NAME, NAME, DIMS, TAG, HANDLE, DTYPE) #define RNN_FWD_SET_(FUNC, NAME, DIMS, TAG, HANDLE, DTYPE) \ - FUNC(MKLDNN_ARG_##NAME, {DIMS, get_mkldnn_type(DTYPE), TAG}, HANDLE) + FUNC(DNNL_ARG_##NAME, {DIMS, get_dnnl_type(DTYPE), TAG}, HANDLE) #define RNN_BWD_SET(NAME, ARGS, HANDLE) RNN_BWD_SET_(RNN_HANDLE_FUNC_NAME, NAME, ARGS, HANDLE) #define RNN_BWD_SET_(FUNC, NAME, ARGS, HANDLE) \ - FUNC(MKLDNN_ARG_DIFF_##NAME, ARGS.at(MKLDNN_ARG_##NAME).get_desc(), HANDLE) + FUNC(DNNL_ARG_DIFF_##NAME, ARGS.at(DNNL_ARG_##NAME).get_desc(), HANDLE) /* * Set new src data handler to Forward memory. The memory primitives are * not initialized until SetNewDataMem is first invoked. Src data handler * must not be nullptr, except for cx with LSTM mode. If either hy, cy is * nullptr, it may run with non-state_ouput or non-LSTM mode. Thus, the - * corresponding memory should be a empty mkldnn::memory(). + * corresponding memory should be a empty dnnl::memory(). */ -void MKLDNNRnnForward::SetNewDataMem(void* x, - void* hx, - void* cx, - void* y, - void* hy, - void* cy, - const int dtype) { - using desc = mkldnn::memory::desc; - using format_tag = mkldnn::memory::format_tag; - auto& cpu_engine = CpuEngine::Get()->get_engine(); - mkldnn_args_map_t& args = net_args_; +void DNNLRnnForward::SetNewDataMem(void* x, + void* hx, + void* cx, + void* y, + void* hy, + void* cy, + const int dtype) { + using desc = dnnl::memory::desc; + using format_tag = dnnl::memory::format_tag; + auto& cpu_engine = CpuEngine::Get()->get_engine(); + dnnl_args_map_t& args = net_args_; RNN_HANDLE_FUNC(RNN_HANDLE_FUNC_NAME); @@ -467,11 +466,11 @@ void MKLDNNRnnForward::SetNewDataMem(void* x, } } -inline void MKLDNNMemoryReorder(const mkldnn::memory& src, const mkldnn::memory& dst) { +inline void DNNLMemoryReorder(const dnnl::memory& src, const dnnl::memory& dst) { #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map reorderPrimitives; + static thread_local std::unordered_map reorderPrimitives; #else - static MX_THREAD_LOCAL std::unordered_map reorderPrimitives; + static MX_THREAD_LOCAL std::unordered_map reorderPrimitives; #endif OpSignature key{}; key.AddSign(src); @@ -479,25 +478,25 @@ inline void MKLDNNMemoryReorder(const mkldnn::memory& src, const mkldnn::memory& auto it = reorderPrimitives.find(key); if (it == reorderPrimitives.end()) { - auto reorder = mkldnn::reorder(src, dst); + auto reorder = dnnl::reorder(src, dst); it = AddToCache(&reorderPrimitives, key, reorder); } - mkldnn_args_map_t net_args; - net_args.emplace(MKLDNN_ARG_SRC, src); - net_args.emplace(MKLDNN_ARG_DST, dst); - MKLDNNStream::Get()->RegisterPrimArgs(it->second, net_args); + dnnl_args_map_t net_args; + net_args.emplace(DNNL_ARG_SRC, src); + net_args.emplace(DNNL_ARG_DST, dst); + DNNLStream::Get()->RegisterPrimArgs(it->second, net_args); } /* * Reorder the concatenated weights memory to a efficient memory block * with primitive-prefered format. 
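Aside: ReorderWeights, whose comment ends just above, follows the usual oneDNN weight-preparation recipe — describe the weights with format_tag::any when creating the primitive descriptor, then reorder the user-layout weights into whatever layout the primitive reports back. A standalone sketch of that recipe, assuming the oneDNN 2.x API and using an inner_product primitive with made-up sizes purely as a stand-in for the RNN descriptors:

// Sketch: let a primitive pick its preferred weights layout via format_tag::any,
// then reorder user-layout weights into it only when the layouts differ.
// oneDNN 2.x assumed; inner_product and the sizes below are illustrative stand-ins.
#include <dnnl.hpp>
#include <vector>

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  const memory::dim N = 4, IC = 32, OC = 64;
  auto src_md  = memory::desc({N, IC}, memory::data_type::f32, memory::format_tag::nc);
  auto wei_any = memory::desc({OC, IC}, memory::data_type::f32, memory::format_tag::any);
  auto dst_md  = memory::desc({N, OC}, memory::data_type::f32, memory::format_tag::nc);

  auto d  = inner_product_forward::desc(prop_kind::forward_inference, src_md, wei_any, dst_md);
  auto pd = inner_product_forward::primitive_desc(d, eng);

  // User weights in a plain framework layout.
  std::vector<float> w_user(OC * IC, 1.f);
  auto user_md  = memory::desc({OC, IC}, memory::data_type::f32, memory::format_tag::oi);
  auto user_mem = memory(user_md, eng, w_user.data());

  // Reorder only if the primitive actually prefers something different.
  memory wei_mem = user_mem;
  if (pd.weights_desc() != user_md) {
    wei_mem = memory(pd.weights_desc(), eng);
    reorder(user_mem, wei_mem).execute(strm, user_mem, wei_mem);
    strm.wait();
  }
  return 0;
}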
*/ -void MKLDNNRnnForward::ReorderWeights() { - MKLDNNMemoryReorder(*weights_layer_r_, *weights_layer_); - MKLDNNMemoryReorder(*weights_iter_r_, *weights_iter_); +void DNNLRnnForward::ReorderWeights() { + DNNLMemoryReorder(*weights_layer_r_, *weights_layer_); + DNNLMemoryReorder(*weights_iter_r_, *weights_iter_); if (param_.proj_size > 0) - MKLDNNMemoryReorder(*weights_proj_r_, *weights_proj_); + DNNLMemoryReorder(*weights_proj_r_, *weights_proj_); } void AdjustGruGateOrder(char* weight, @@ -505,7 +504,7 @@ void AdjustGruGateOrder(char* weight, const size_t hidden_size, const int dtype) { // mxnet gru gate order is reset, update and new gates - // mkldnn gru gate order is update, reset and new gates + // dnnl gru gate order is update, reset and new gates size_t single_weight_bytes = input_size * hidden_size * mshadow::mshadow_sizeof(dtype); char* weight_reset = weight; char* weight_update = weight + single_weight_bytes; @@ -528,7 +527,7 @@ void FuseBias(DType* fuse_bias, DType* native_bias, const int mode, const size_t DType* bh = native_bias + state_size * ngates; if (mode == rnn_enum::kGru) { // While mxnet gru gate order is reset, update and new gates, -// mkldnn gru gate order is update, reset and new gates. So +// dnnl gru gate order is update, reset and new gates. So // we need to swap the order of reset and update from mxnet. #pragma omp parallel for num_threads(omp_threads) for (int j = 0; j < state_size_; j++) { @@ -549,9 +548,7 @@ void FuseBias(DType* fuse_bias, DType* native_bias, const int mode, const size_t } } -inline void EmplaceNetArgs(mkldnn_args_map_t* net_args, - const int arg_name, - const mkldnn::memory* mem) { +inline void EmplaceNetArgs(dnnl_args_map_t* net_args, const int arg_name, const dnnl::memory* mem) { if (net_args->find(arg_name) != net_args->end()) { if (net_args->at(arg_name).get_data_handle() == mem->get_data_handle()) { return; @@ -564,29 +561,26 @@ inline void EmplaceNetArgs(mkldnn_args_map_t* net_args, } /* - * Copy native memory to mkldnn-format memory. It will initialize the memory + * Copy native memory to dnnl-format memory. It will initialize the memory * when first invoked. Then, the native weight_layer and weight_iter are * concatenated to xxx_xx_r memory. Per the different gates order of GRU, * it will swap the memory blocks of gates among concatenated memory * inplace. From then on, the xxx_xx_r memory is reordered to target - * memory with preferred format_tag. Finally, native bias is fused to MKLDNN + * memory with preferred format_tag. Finally, native bias is fused to DNNL * bias memory. */ -void MKLDNNRnnForward::SetWeightsMem(void* w_ptr, - void* b_ptr, - const bool is_train, - const int dtype) { - using format_tag = mkldnn::memory::format_tag; - auto mkldnn_dtype = get_mkldnn_type(dtype); +void DNNLRnnForward::SetWeightsMem(void* w_ptr, void* b_ptr, const bool is_train, const int dtype) { + using format_tag = dnnl::memory::format_tag; + auto dnnl_dtype = get_dnnl_type(dtype); const size_t dtype_bytes = mshadow::mshadow_sizeof(dtype); const size_t buffer_bytes = this->GetSize() // byte number of the buffer + (param_.workspace_size + param_.reserve_size) * dtype_bytes + - kMKLDNNAlign * 7; // Add margin for alignment of seven times allocation for the - // dnnl memory handlers, i.e. weights_layer_, weights_iter_, - // weights_proj_, bias_, weights_layer_r_, weights_iter_r_, - // and weights_proj_r_. + kDNNLAlign * 7; // Add margin for alignment of seven times allocation for the + // dnnl memory handlers, i.e. 
weights_layer_, weights_iter_, + // weights_proj_, bias_, weights_layer_r_, weights_iter_r_, + // and weights_proj_r_. if (mem_mgr_.Size() < buffer_bytes) mem_mgr_.Init(buffer_bytes, this->ctx_); @@ -602,18 +596,18 @@ void MKLDNNRnnForward::SetWeightsMem(void* w_ptr, weights_proj_ = mem_mgr_.Alloc(fwd_inf_.GetProjDesc()); } if (bias_ == nullptr) { - bias_ = mem_mgr_.Alloc({param_.bias_dims, mkldnn_dtype, format_tag::ldgo}); + bias_ = mem_mgr_.Alloc({param_.bias_dims, dnnl_dtype, format_tag::ldgo}); } // Get the intermediate memory for weights concat & reorder if (weights_layer_r_ == nullptr) { - weights_layer_r_ = mem_mgr_.Alloc({param_.weight_layer_dims, mkldnn_dtype, format_tag::ldgoi}); + weights_layer_r_ = mem_mgr_.Alloc({param_.weight_layer_dims, dnnl_dtype, format_tag::ldgoi}); } if (weights_iter_r_ == nullptr) { - weights_iter_r_ = mem_mgr_.Alloc({param_.weight_iter_dims, mkldnn_dtype, format_tag::ldgoi}); + weights_iter_r_ = mem_mgr_.Alloc({param_.weight_iter_dims, dnnl_dtype, format_tag::ldgoi}); } if (use_proj && weights_proj_r_ == nullptr) { - weights_proj_r_ = mem_mgr_.Alloc({param_.weight_proj_dims, mkldnn_dtype, format_tag::ldoi}); + weights_proj_r_ = mem_mgr_.Alloc({param_.weight_proj_dims, dnnl_dtype, format_tag::ldoi}); } // convert void* to char* for arithmetic operations @@ -695,9 +689,9 @@ void MKLDNNRnnForward::SetWeightsMem(void* w_ptr, }); // insert weights into net_args - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_WEIGHTS_LAYER, this->weights_layer_); - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_WEIGHTS_ITER, this->weights_iter_); - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_BIAS, this->bias_); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_WEIGHTS_LAYER, this->weights_layer_); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_WEIGHTS_ITER, this->weights_iter_); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_BIAS, this->bias_); if (use_proj) EmplaceNetArgs(&this->net_args_, DNNL_ARG_WEIGHTS_PROJECTION, this->weights_proj_); @@ -712,48 +706,48 @@ void MKLDNNRnnForward::SetWeightsMem(void* w_ptr, } } -void MKLDNNRnnForwardTraining::SetTrnMem(const MKLDNNRnnForward& fwd) { - using memory = mkldnn::memory; +void DNNLRnnForwardTraining::SetTrnMem(const DNNLRnnForward& fwd) { + using memory = dnnl::memory; const auto& cpu_engine = CpuEngine::Get()->get_engine(); - auto s = mkldnn::stream(cpu_engine); - // Prepare mkldnn::memorys for weights_layer, weight_iter, and workspace + auto s = dnnl::stream(cpu_engine); + // Prepare dnnl::memorys for weights_layer, weight_iter, and workspace if (workspace_ == nullptr) - workspace_ = mkldnn_shared_mem_t(new memory(fwd_trn_.GetWorkspaceDesc(), cpu_engine)); + workspace_ = dnnl_shared_mem_t(new memory(fwd_trn_.GetWorkspaceDesc(), cpu_engine)); if (weights_layer_ == nullptr) - weights_layer_ = mkldnn_shared_mem_t(new memory(fwd_trn_.GetLayerDesc(), cpu_engine)); + weights_layer_ = dnnl_shared_mem_t(new memory(fwd_trn_.GetLayerDesc(), cpu_engine)); if (weights_iter_ == nullptr) - weights_iter_ = mkldnn_shared_mem_t(new memory(fwd_trn_.GetIterDesc(), cpu_engine)); + weights_iter_ = dnnl_shared_mem_t(new memory(fwd_trn_.GetIterDesc(), cpu_engine)); // fill weights memory using the reordered weights of fwd_inference primitive if (fwd.weights_layer_r_->get_desc() == fwd_trn_.GetLayerDesc()) { weights_layer_->set_data_handle(fwd.weights_layer_r_->get_data_handle()); } else { - MKLDNNMemoryReorder(*fwd.weights_layer_r_, *weights_layer_); + DNNLMemoryReorder(*fwd.weights_layer_r_, *weights_layer_); } if (fwd.weights_iter_r_->get_desc() == 
fwd_trn_.GetIterDesc()) { weights_iter_->set_data_handle(fwd.weights_iter_r_->get_data_handle()); } else { - MKLDNNMemoryReorder(*fwd.weights_iter_r_, *weights_iter_); + DNNLMemoryReorder(*fwd.weights_iter_r_, *weights_iter_); } // bias are always in format_tag::ldgo this->bias_ = fwd.bias_; // insert weights into net_args - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_WEIGHTS_LAYER, this->weights_layer_.get()); - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_WEIGHTS_ITER, this->weights_iter_.get()); - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_BIAS, this->bias_); - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_WORKSPACE, this->workspace_.get()); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_WEIGHTS_LAYER, this->weights_layer_.get()); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_WEIGHTS_ITER, this->weights_iter_.get()); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_BIAS, this->bias_); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_WORKSPACE, this->workspace_.get()); } -void MKLDNNRnnForwardTraining::FetchData(const MKLDNNRnnForward& fwd) { +void DNNLRnnForwardTraining::FetchData(const DNNLRnnForward& fwd) { for (auto& kv : fwd.net_args_) { switch (kv.first) { - case MKLDNN_ARG_WEIGHTS_LAYER: - case MKLDNN_ARG_WEIGHTS_ITER: - case MKLDNN_ARG_BIAS: - case MKLDNN_ARG_WORKSPACE: + case DNNL_ARG_WEIGHTS_LAYER: + case DNNL_ARG_WEIGHTS_ITER: + case DNNL_ARG_BIAS: + case DNNL_ARG_WORKSPACE: continue; default: @@ -762,11 +756,11 @@ void MKLDNNRnnForwardTraining::FetchData(const MKLDNNRnnForward& fwd) { } } -void MKLDNNRnnOp::Init(const OpContext& op_ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using format_tag = mkldnn::memory::format_tag; +void DNNLRnnOp::Init(const OpContext& op_ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using format_tag = dnnl::memory::format_tag; // In the `autograd.record()` context, RNNOp is required to run into // `forward_training` mode. @@ -825,49 +819,48 @@ void MKLDNNRnnOp::Init(const OpContext& op_ctx, if (dst_.size() < num_fusion - 1) { int data_dtype = outputs[rnn_enum::kOut].dtype(); const size_t data_dbytes = mshadow::mshadow_sizeof(data_dtype); - mgr_.Init( - (outputs[rnn_enum::kOut].data().Size() * data_dbytes + kMKLDNNAlign) * (num_fusion - 1), - op_ctx.run_ctx.ctx); + mgr_.Init((outputs[rnn_enum::kOut].data().Size() * data_dbytes + kDNNLAlign) * (num_fusion - 1), + op_ctx.run_ctx.ctx); // Here we need `fwd_inf_vec_.size() - 1` spaces for the intermediate results of the multiple // fused layers. And for the result of the last fused layer, `outputs[rnn_enum::kOut]` could // provide the space. Hence, `forward_inf_vec_.back()` is excluded when allocates the spaces // for intermediate results. 
- for (std::vector::const_iterator fwd = fwd_inf_vec_.begin(); + for (std::vector::const_iterator fwd = fwd_inf_vec_.begin(); fwd != fwd_inf_vec_.end() - 1; ++fwd) dst_.push_back( - mgr_.Alloc({fwd->GetParam().dst_dims, get_mkldnn_type(data_dtype), format_tag::tnc})); + mgr_.Alloc({fwd->GetParam().dst_dims, get_dnnl_type(data_dtype), format_tag::tnc})); } if (!is_training) initialized_ = true; } -void MKLDNNRnnBackward::FetchDataWeightsMem(const MKLDNNRnnForwardTraining& fwd) { - using memory = mkldnn::memory; +void DNNLRnnBackward::FetchDataWeightsMem(const DNNLRnnForwardTraining& fwd) { + using memory = dnnl::memory; auto& cpu_engine = CpuEngine::Get()->get_engine(); if (this->weights_layer_ == nullptr || this->weights_iter_ == nullptr) { - this->weights_layer_ = mkldnn_shared_mem_t(new memory(bwd_.weights_layer_desc_, cpu_engine)); - this->weights_iter_ = mkldnn_shared_mem_t(new memory(bwd_.weights_iter_desc_, cpu_engine)); + this->weights_layer_ = dnnl_shared_mem_t(new memory(bwd_.weights_layer_desc_, cpu_engine)); + this->weights_iter_ = dnnl_shared_mem_t(new memory(bwd_.weights_iter_desc_, cpu_engine)); } for (auto& kv : fwd.net_args_) { - const mkldnn::memory* valid_mem; + const dnnl::memory* valid_mem; switch (kv.first) { - case MKLDNN_ARG_WEIGHTS_LAYER: { + case DNNL_ARG_WEIGHTS_LAYER: { if (bwd_.weights_layer_desc_ == fwd.fwd_trn_.GetLayerDesc()) { this->weights_layer_->set_data_handle(kv.second.get_data_handle()); } else { - MKLDNNMemoryReorder(*fwd.weights_layer_, *this->weights_layer_); + DNNLMemoryReorder(*fwd.weights_layer_, *this->weights_layer_); } valid_mem = this->weights_layer_.get(); } break; - case MKLDNN_ARG_WEIGHTS_ITER: { + case DNNL_ARG_WEIGHTS_ITER: { if (bwd_.weights_iter_desc_ == fwd.fwd_trn_.GetIterDesc()) { this->weights_iter_->set_data_handle(kv.second.get_data_handle()); } else { - MKLDNNMemoryReorder(*fwd.weights_iter_, *this->weights_iter_); + DNNLMemoryReorder(*fwd.weights_iter_, *this->weights_iter_); } valid_mem = this->weights_iter_.get(); } break; @@ -879,58 +872,58 @@ void MKLDNNRnnBackward::FetchDataWeightsMem(const MKLDNNRnnForwardTraining& fwd) } } -void MKLDNNRnnBackward::SetWeightsGradsMem() { - using tag = mkldnn::memory::format_tag; +void DNNLRnnBackward::SetWeightsGradsMem() { + using tag = dnnl::memory::format_tag; if (this->diff_weights_layer_ == nullptr || this->diff_weights_iter_ == nullptr || this->diff_bias_ == nullptr) { - const auto& cpu_engine = CpuEngine::Get()->get_engine(); - const MKLDNNRnnLayerParam& param = fwd_ptr_->GetParam(); - const auto mkldnn_type = - static_cast(bwd_.diff_weights_layer_desc_.data.data_type); + const auto& cpu_engine = CpuEngine::Get()->get_engine(); + const DNNLRnnLayerParam& param = fwd_ptr_->GetParam(); + const auto dnnl_type = + static_cast(bwd_.diff_weights_layer_desc_.data.data_type); - auto native_layer_desc = mkldnn::memory::desc(param.weight_layer_dims, mkldnn_type, tag::ldgoi); - auto native_iter_desc = mkldnn::memory::desc(param.weight_iter_dims, mkldnn_type, tag::ldgoi); + auto native_layer_desc = dnnl::memory::desc(param.weight_layer_dims, dnnl_type, tag::ldgoi); + auto native_iter_desc = dnnl::memory::desc(param.weight_iter_dims, dnnl_type, tag::ldgoi); - this->diff_weights_layer_r_ = std::make_shared(native_layer_desc, cpu_engine); - this->diff_weights_iter_r_ = std::make_shared(native_iter_desc, cpu_engine); + this->diff_weights_layer_r_ = std::make_shared(native_layer_desc, cpu_engine); + this->diff_weights_iter_r_ = std::make_shared(native_iter_desc, cpu_engine); if (native_layer_desc 
== bwd_.diff_weights_layer_desc_) { - this->diff_weights_layer_ = std::make_shared( + this->diff_weights_layer_ = std::make_shared( bwd_.diff_weights_layer_desc_, cpu_engine, diff_weights_layer_r_->get_data_handle()); } else { this->diff_weights_layer_ = - std::make_shared(bwd_.diff_weights_layer_desc_, cpu_engine); + std::make_shared(bwd_.diff_weights_layer_desc_, cpu_engine); } if (native_iter_desc == bwd_.diff_weights_iter_desc_) { - this->diff_weights_iter_ = std::make_shared( + this->diff_weights_iter_ = std::make_shared( bwd_.diff_weights_iter_desc_, cpu_engine, diff_weights_iter_r_->get_data_handle()); } else { this->diff_weights_iter_ = - std::make_shared(bwd_.diff_weights_iter_desc_, cpu_engine); + std::make_shared(bwd_.diff_weights_iter_desc_, cpu_engine); } - this->diff_bias_ = std::make_shared(bwd_.diff_bias_desc_, cpu_engine); + this->diff_bias_ = std::make_shared(bwd_.diff_bias_desc_, cpu_engine); } std::memset( this->diff_weights_layer_->get_data_handle(), 0, bwd_.diff_weights_layer_desc_.get_size()); std::memset( this->diff_weights_iter_->get_data_handle(), 0, bwd_.diff_weights_iter_desc_.get_size()); std::memset(this->diff_bias_->get_data_handle(), 0, bwd_.diff_bias_desc_.get_size()); - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_DIFF_WEIGHTS_LAYER, this->diff_weights_layer_.get()); - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_DIFF_WEIGHTS_ITER, this->diff_weights_iter_.get()); - EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_DIFF_BIAS, this->diff_bias_.get()); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_DIFF_WEIGHTS_LAYER, this->diff_weights_layer_.get()); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_DIFF_WEIGHTS_ITER, this->diff_weights_iter_.get()); + EmplaceNetArgs(&this->net_args_, DNNL_ARG_DIFF_BIAS, this->diff_bias_.get()); } -void MKLDNNRnnBackward::SetDataGradsMem(void* diff_src, - void* diff_state, - void* diff_statecell, - void* diff_dst, - void* diff_state_out, - void* diff_statecell_out, - const int dtype) { - using desc = mkldnn::memory::desc; - auto& cpu_engine = CpuEngine::Get()->get_engine(); - mkldnn_args_map_t& args = this->net_args_; +void DNNLRnnBackward::SetDataGradsMem(void* diff_src, + void* diff_state, + void* diff_statecell, + void* diff_dst, + void* diff_state_out, + void* diff_statecell_out, + const int dtype) { + using desc = dnnl::memory::desc; + auto& cpu_engine = CpuEngine::Get()->get_engine(); + dnnl_args_map_t& args = this->net_args_; RNN_HANDLE_FUNC(RNN_HANDLE_FUNC_NAME); @@ -951,12 +944,12 @@ void MKLDNNRnnBackward::SetDataGradsMem(void* diff_src, } } -void MKLDNNRnnBackward::SetNativeWeightsGrads() const { +void DNNLRnnBackward::SetNativeWeightsGrads() const { if (this->diff_weights_layer_->get_desc() != this->diff_weights_layer_r_->get_desc()) { - MKLDNNMemoryReorder(*this->diff_weights_layer_, *this->diff_weights_layer_r_); + DNNLMemoryReorder(*this->diff_weights_layer_, *this->diff_weights_layer_r_); } if (this->diff_weights_iter_->get_desc() != this->diff_weights_iter_r_->get_desc()) { - MKLDNNMemoryReorder(*this->diff_weights_iter_, *this->diff_weights_iter_r_); + DNNLMemoryReorder(*this->diff_weights_iter_, *this->diff_weights_iter_r_); } } @@ -968,11 +961,11 @@ void MKLDNNRnnBackward::SetNativeWeightsGrads() const { FWrapper = common::ParallelAdd; \ { __VA_ARGS__ } -void MKLDNNRnnBackward::CommitWeightsGrads(void* diff_weights, - void* diff_bias, - const OpReqType req, - const int dtype) { - const MKLDNNRnnLayerParam& param = fwd_ptr_->GetParam(); +void DNNLRnnBackward::CommitWeightsGrads(void* diff_weights, + void* diff_bias, + 
const OpReqType req, + const int dtype) { + const DNNLRnnLayerParam& param = fwd_ptr_->GetParam(); void* diff_weights_layer_ptr = this->diff_weights_layer_->get_data_handle(); void* diff_weights_iter_ptr = this->diff_weights_iter_->get_data_handle(); @@ -1078,21 +1071,21 @@ void MKLDNNRnnBackward::CommitWeightsGrads(void* diff_weights, }); } -template -inline void RegisterMKLDNNRnn(MKLDNNRnnX const& rnn) { - MKLDNNStream::Get()->RegisterPrimArgs(rnn.GetFwd(), rnn.GetArgsMap()); +template +inline void RegisterDNNLRnn(DNNLRnnX const& rnn) { + DNNLStream::Get()->RegisterPrimArgs(rnn.GetFwd(), rnn.GetArgsMap()); } template <> -inline void RegisterMKLDNNRnn(MKLDNNRnnBackward const& rnn) { - MKLDNNStream::Get()->RegisterPrimArgs(rnn.GetBwd(), rnn.GetArgsMap()); +inline void RegisterDNNLRnn(DNNLRnnBackward const& rnn) { + DNNLStream::Get()->RegisterPrimArgs(rnn.GetBwd(), rnn.GetArgsMap()); rnn.SetNativeWeightsGrads(); } -void MKLDNNRnnOp::Forward(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void DNNLRnnOp::Forward(const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { TmpMemMgr::Get()->Init(ctx.requested[1]); // In the `autograd.record()` context, RNNOp is required to run into // forward_training mode. @@ -1133,21 +1126,21 @@ void MKLDNNRnnOp::Forward(const OpContext& ctx, const int batch_size = default_param.batch_size_; const int state_size = default_param.state_size; const int iter_size = default_param.projection_size.has_value() - ? default_param.projection_size.value() - : default_param.state_size; + ? default_param.projection_size.value() + : default_param.state_size; const int directions = default_param.bidirectional ? 2 : 1; - mkldnn::memory::desc dst_desc({seq_length, batch_size, directions * iter_size}, - get_mkldnn_type(data_dtype), - mkldnn::memory::format_tag::tnc); - mkldnn::memory::desc state_desc({num_layers, directions, batch_size, iter_size}, - get_mkldnn_type(data_dtype), - mkldnn::memory::format_tag::ldnc); - mkldnn::memory::desc cell_desc({num_layers, directions, batch_size, state_size}, - get_mkldnn_type(data_dtype), - mkldnn::memory::format_tag::ldnc); - auto out_mem = CreateMKLDNNMem(outputs[rnn_enum::kOut], dst_desc, req[rnn_enum::kOut]); - mkldnn_output_t stateout_mem; - mkldnn_output_t statecellout_mem; + dnnl::memory::desc dst_desc({seq_length, batch_size, directions * iter_size}, + get_dnnl_type(data_dtype), + dnnl::memory::format_tag::tnc); + dnnl::memory::desc state_desc({num_layers, directions, batch_size, iter_size}, + get_dnnl_type(data_dtype), + dnnl::memory::format_tag::ldnc); + dnnl::memory::desc cell_desc({num_layers, directions, batch_size, state_size}, + get_dnnl_type(data_dtype), + dnnl::memory::format_tag::ldnc); + auto out_mem = CreateDNNLMem(outputs[rnn_enum::kOut], dst_desc, req[rnn_enum::kOut]); + dnnl_output_t stateout_mem; + dnnl_output_t statecellout_mem; // Get input & output NDArray char* src = static_cast(inputs[rnn_enum::kData].data().dptr_); @@ -1159,15 +1152,15 @@ void MKLDNNRnnOp::Forward(const OpContext& ctx, if (default_param.state_outputs && req[rnn_enum::kStateOut] != kNullOp) { stateout_mem = - CreateMKLDNNMem(outputs[rnn_enum::kStateOut], state_desc, req[rnn_enum::kStateOut]); + CreateDNNLMem(outputs[rnn_enum::kStateOut], state_desc, req[rnn_enum::kStateOut]); dst_state = static_cast(stateout_mem.second->get_data_handle()); } if (default_param.mode == rnn_enum::kLstm) { src_state_cell = 
static_cast(inputs[rnn_enum::kStateCell].data().dptr_); if (default_param.state_outputs && req[rnn_enum::kStateCellOut] != kNullOp) { - statecellout_mem = CreateMKLDNNMem( - outputs[rnn_enum::kStateCellOut], cell_desc, req[rnn_enum::kStateCellOut]); + statecellout_mem = + CreateDNNLMem(outputs[rnn_enum::kStateCellOut], cell_desc, req[rnn_enum::kStateCellOut]); dst_state_cell = static_cast(statecellout_mem.second->get_data_handle()); } } @@ -1238,10 +1231,10 @@ void MKLDNNRnnOp::Forward(const OpContext& ctx, } if (is_training) { for (auto& trn_lyr : fwd_trn_vec_) - RegisterMKLDNNRnn(trn_lyr); + RegisterDNNLRnn(trn_lyr); } else { for (auto& inf_lyr : fwd_inf_vec_) - RegisterMKLDNNRnn(inf_lyr); + RegisterDNNLRnn(inf_lyr); } CommitOutput(outputs[rnn_enum::kOut], out_mem); if (default_param.state_outputs) { @@ -1249,14 +1242,14 @@ void MKLDNNRnnOp::Forward(const OpContext& ctx, if (default_param.mode == rnn_enum::kLstm) CommitOutput(outputs[rnn_enum::kStateCellOut], statecellout_mem); } - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } -void MKLDNNRnnOp::Backward(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using tag = mkldnn::memory::format_tag; +void DNNLRnnOp::Backward(const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using tag = dnnl::memory::format_tag; TmpMemMgr::Get()->Init(ctx.requested[1]); const RNNParam& default_param = full_param_.default_param; const int data_dtype = inputs[rnn_enum::kData].dtype(); @@ -1271,7 +1264,7 @@ void MKLDNNRnnOp::Backward(const OpContext& ctx, } // Fetch weights, src and dst from Forward layer if (bwd_vec_.size() != fwd_trn_vec_.size()) - LOG(FATAL) << "MKL-DNN RNN fusion error."; + LOG(FATAL) << "DNNL RNN fusion error."; for (size_t lyr = 0; lyr < bwd_vec_.size(); ++lyr) { bwd_vec_.at(lyr).FetchDataWeightsMem(fwd_trn_vec_.at(lyr)); bwd_vec_.at(lyr).SetWeightsGradsMem(); @@ -1285,13 +1278,13 @@ void MKLDNNRnnOp::Backward(const OpContext& ctx, const int input_size = default_param.input_size_; const int state_size = default_param.state_size; const int directions = default_param.bidirectional ? 
2 : 1; - mkldnn::memory::desc src_desc( - {seq_length, batch_size, input_size}, get_mkldnn_type(data_dtype), tag::tnc); - mkldnn::memory::desc state_desc( - {num_layers, directions, batch_size, state_size}, get_mkldnn_type(data_dtype), tag::ldnc); - auto diff_input_mem = CreateMKLDNNMem(outputs[rnn_enum::kData], src_desc, req[rnn_enum::kData]); - mkldnn_output_t diff_state_mem; - mkldnn_output_t diff_statecell_mem; + dnnl::memory::desc src_desc( + {seq_length, batch_size, input_size}, get_dnnl_type(data_dtype), tag::tnc); + dnnl::memory::desc state_desc( + {num_layers, directions, batch_size, state_size}, get_dnnl_type(data_dtype), tag::ldnc); + auto diff_input_mem = CreateDNNLMem(outputs[rnn_enum::kData], src_desc, req[rnn_enum::kData]); + dnnl_output_t diff_state_mem; + dnnl_output_t diff_statecell_mem; // index description of outputs NDArray // 0 1 2 3 // | dx | dw | dhx | dcx| @@ -1303,12 +1296,12 @@ void MKLDNNRnnOp::Backward(const OpContext& ctx, default_param.bidirectional + 1, default_param.mode)) * w_bytes; - diff_state_mem = CreateMKLDNNMem(outputs[rnn_enum::kState], state_desc, req[rnn_enum::kState]); + diff_state_mem = CreateDNNLMem(outputs[rnn_enum::kState], state_desc, req[rnn_enum::kState]); char* dhx = static_cast(diff_state_mem.second->get_data_handle()); char* dcx = nullptr; if (full_param_.default_param.mode == rnn_enum::kLstm && req[rnn_enum::kStateCell] != kNullOp) { diff_statecell_mem = - CreateMKLDNNMem(outputs[rnn_enum::kStateCell], state_desc, req[rnn_enum::kStateCell]); + CreateDNNLMem(outputs[rnn_enum::kStateCell], state_desc, req[rnn_enum::kStateCell]); dcx = static_cast(diff_statecell_mem.second->get_data_handle()); } @@ -1326,14 +1319,14 @@ void MKLDNNRnnOp::Backward(const OpContext& ctx, if (bwd_vec_.size() == 1) { bwd_vec_.back().SetDataGradsMem(dx, dhx, dcx, dy, dhy, dcy, data_dtype); - RegisterMKLDNNRnn(bwd_vec_.back()); + RegisterDNNLRnn(bwd_vec_.back()); } else { const size_t state_bytes = (default_param.bidirectional + 1) * default_param.batch_size_ * default_param.state_size * mshadow::mshadow_sizeof(data_dtype); if (diff_src == nullptr) { - auto desc = mkldnn::memory::desc( - full_param_.layer_params.back().src_dims, get_mkldnn_type(data_dtype), tag::tnc); - diff_src = std::make_shared(desc, CpuEngine::Get()->get_engine()); + auto desc = dnnl::memory::desc( + full_param_.layer_params.back().src_dims, get_dnnl_type(data_dtype), tag::tnc); + diff_src = std::make_shared(desc, CpuEngine::Get()->get_engine()); } // Sets primitives from bottom to top, then submits them in reversed order. 
bwd_vec_.front().SetDataGradsMem( @@ -1361,17 +1354,17 @@ void MKLDNNRnnOp::Backward(const OpContext& ctx, bwd_vec_.back().SetDataGradsMem( diff_src->get_data_handle(), dhx, dcx, dy, dhy, dcy, data_dtype); - for (std::vector::const_reverse_iterator bwd = bwd_vec_.rbegin(); + for (std::vector::const_reverse_iterator bwd = bwd_vec_.rbegin(); bwd != bwd_vec_.rend(); ++bwd) { - RegisterMKLDNNRnn(*bwd); + RegisterDNNLRnn(*bwd); } } CommitOutput(outputs[rnn_enum::kData], diff_input_mem); CommitOutput(outputs[rnn_enum::kState], diff_state_mem); if (full_param_.default_param.mode == rnn_enum::kLstm) CommitOutput(outputs[rnn_enum::kStateCell], diff_statecell_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); // Commit weights diff if (req[rnn_enum::kParams] != kNullOp) { diff --git a/src/operator/nn/mkldnn/mkldnn_slice-inl.h b/src/operator/nn/dnnl/dnnl_slice-inl.h similarity index 54% rename from src/operator/nn/mkldnn/mkldnn_slice-inl.h rename to src/operator/nn/dnnl/dnnl_slice-inl.h index c9207630209e..64ef19a5ea38 100644 --- a/src/operator/nn/mkldnn/mkldnn_slice-inl.h +++ b/src/operator/nn/dnnl/dnnl_slice-inl.h @@ -18,13 +18,13 @@ */ /*! - * \file mkldnn_slice-inl.h + * \file dnnl_slice-inl.h * \brief * \author Zhiyuan Huang */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_SLICE_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_SLICE_INL_H_ +#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_SLICE_INL_H_ +#define MXNET_OPERATOR_NN_DNNL_DNNL_SLICE_INL_H_ #if MXNET_USE_ONEDNN == 1 @@ -34,39 +34,38 @@ #include -#include "./mkldnn_base-inl.h" - #include "../../operator_common.h" #include "../../tensor/slice-inl.h" +#include "./dnnl_base-inl.h" namespace mxnet { namespace op { -class MKLDNNSliceFwd { +class DNNLSliceFwd { public: - MKLDNNSliceFwd(const SliceParam& param, const NDArray& in, const NDArray& out); - void SetNewMem(const mkldnn::memory& input, const mkldnn::memory& output); + DNNLSliceFwd(const SliceParam& param, const NDArray& in, const NDArray& out); + void SetNewMem(const dnnl::memory& input, const dnnl::memory& output); void Register(); private: - std::shared_ptr data_; - std::shared_ptr out_; - std::shared_ptr fwd_; + std::shared_ptr data_; + std::shared_ptr out_; + std::shared_ptr fwd_; }; -typedef ParamOpSign MKLDNNSliceSignature; -MKLDNNSliceFwd& GetSliceForward(const SliceParam& param, - const bool is_train, - const NDArray& in_data, - const NDArray& out_data); +typedef ParamOpSign DNNLSliceSignature; +DNNLSliceFwd& GetSliceForward(const SliceParam& param, + const bool is_train, + const NDArray& in_data, + const NDArray& out_data); -void MKLDNNSlice(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in, - OpReqType req, - const NDArray& out); +void DNNLSlice(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in, + OpReqType req, + const NDArray& out); } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_SLICE_INL_H_ +#endif // MXNET_OPERATOR_NN_DNNL_DNNL_SLICE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_slice.cc b/src/operator/nn/dnnl/dnnl_slice.cc similarity index 52% rename from src/operator/nn/mkldnn/mkldnn_slice.cc rename to src/operator/nn/dnnl/dnnl_slice.cc index 00019ace3b50..32bef008d796 100644 --- a/src/operator/nn/mkldnn/mkldnn_slice.cc +++ b/src/operator/nn/dnnl/dnnl_slice.cc @@ -18,26 +18,26 @@ */ /*! 
- * \file mkldnn_slice.cc + * \file dnnl_slice.cc * \brief * \author Zhiyuan Huang */ #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" -#include "./mkldnn_slice-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" +#include "./dnnl_slice-inl.h" namespace mxnet { namespace op { -MKLDNNSliceFwd::MKLDNNSliceFwd(const SliceParam& param, const NDArray& in, const NDArray& out) { +DNNLSliceFwd::DNNLSliceFwd(const SliceParam& param, const NDArray& in, const NDArray& out) { const mxnet::TShape ishape = in.shape(); const mxnet::TShape oshape = out.shape(); const int N = ishape.ndim(); - mkldnn::memory::dims dims(N); - mkldnn::memory::dims offsets(N); + dnnl::memory::dims dims(N); + dnnl::memory::dims offsets(N); for (int i = 0; i < N; ++i) { dim_t s = 0; if (i < param.begin.ndim() && param.begin[i]) { @@ -49,62 +49,62 @@ MKLDNNSliceFwd::MKLDNNSliceFwd(const SliceParam& param, const NDArray& in, const offsets[i] = s; } - auto in_md = in.GetMKLDNNData()->get_desc(); - auto out_md = out.GetMKLDNNData()->get_desc(); + auto in_md = in.GetDNNLData()->get_desc(); + auto out_md = out.GetDNNLData()->get_desc(); auto sub_md = in_md.submemory_desc(dims, offsets); auto engine = CpuEngine::Get()->get_engine(); - this->data_ = std::make_shared(sub_md, engine, nullptr); - this->out_ = std::make_shared(out_md, engine, nullptr); - this->fwd_ = std::make_shared(*this->data_, *this->out_); + this->data_ = std::make_shared(sub_md, engine, nullptr); + this->out_ = std::make_shared(out_md, engine, nullptr); + this->fwd_ = std::make_shared(*this->data_, *this->out_); } -void MKLDNNSliceFwd::SetNewMem(const mkldnn::memory& input, const mkldnn::memory& output) { +void DNNLSliceFwd::SetNewMem(const dnnl::memory& input, const dnnl::memory& output) { this->data_->set_data_handle(input.get_data_handle()); this->out_->set_data_handle(output.get_data_handle()); } -void MKLDNNSliceFwd::Register() { - MKLDNNStream::Get()->RegisterPrimArgs( - *fwd_, {{MKLDNN_ARG_FROM, *(this->data_)}, {MKLDNN_ARG_TO, *(this->out_)}}); +void DNNLSliceFwd::Register() { + DNNLStream::Get()->RegisterPrimArgs( + *fwd_, {{DNNL_ARG_FROM, *(this->data_)}, {DNNL_ARG_TO, *(this->out_)}}); } -MKLDNNSliceFwd& GetSliceForward(const SliceParam& param, - const bool is_train, - const NDArray& in_data, - const NDArray& out_data) { +DNNLSliceFwd& GetSliceForward(const SliceParam& param, + const bool is_train, + const NDArray& in_data, + const NDArray& out_data) { #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map fwds; + static thread_local std::unordered_map fwds; #else - static MX_THREAD_LOCAL std::unordered_map fwds; + static MX_THREAD_LOCAL std::unordered_map fwds; #endif - MKLDNNSliceSignature key(param); + DNNLSliceSignature key(param); key.AddSign(is_train); key.AddSign(in_data); key.AddSign(out_data); auto it = fwds.find(key); if (it == fwds.end()) { - MKLDNNSliceFwd fwd(param, in_data, out_data); + DNNLSliceFwd fwd(param, in_data, out_data); it = AddToCache(&fwds, key, fwd); } return it->second; } -void MKLDNNSlice(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in, - OpReqType req, - const NDArray& out) { +void DNNLSlice(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in, + OpReqType req, + const NDArray& out) { const SliceParam& param = nnvm::get(attrs.parsed); - MKLDNNSliceFwd& fwd = GetSliceForward(param, ctx.is_train, in, out); - auto in_mem = in.GetMKLDNNData(); - auto out_md = out.GetMKLDNNData()->get_desc(); - auto out_mem = 
CreateMKLDNNMem(out, out_md, req); + DNNLSliceFwd& fwd = GetSliceForward(param, ctx.is_train, in, out); + auto in_mem = in.GetDNNLData(); + auto out_md = out.GetDNNLData()->get_desc(); + auto out_mem = CreateDNNLMem(out, out_md, req); fwd.SetNewMem(*in_mem, *out_mem.second); fwd.Register(); CommitOutput(out, out_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } } // namespace op diff --git a/src/operator/nn/dnnl/dnnl_softmax.cc b/src/operator/nn/dnnl/dnnl_softmax.cc new file mode 100644 index 000000000000..f5e5f3e3681d --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_softmax.cc @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dnnl_softmax.cc + * \brief + * \author Da Zheng + */ + +#include "../softmax-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" + +#if MXNET_USE_ONEDNN == 1 +namespace mxnet { +namespace op { + +static dnnl::softmax_forward::primitive_desc GetSoftmaxFwdPd(bool is_train, + const int axis, + const dnnl::memory& input_mem) { + dnnl::memory::desc data_md = input_mem.get_desc(); + auto cpu_engine = CpuEngine::Get()->get_engine(); + auto prop = is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring; + auto desc = dnnl::softmax_forward::desc(prop, data_md, axis); + return dnnl::softmax_forward::primitive_desc(desc, cpu_engine); +} + +static dnnl::softmax_backward::primitive_desc GetSoftmaxBwdPd( + const dnnl::memory& diff_mem, + const dnnl::memory& data_mem, + const int axis, + const dnnl::softmax_forward::primitive_desc& hint_fwd_pd) { + dnnl::memory::desc diff_md = diff_mem.get_desc(); + dnnl::memory::desc data_md = data_mem.get_desc(); + auto cpu_engine = CpuEngine::Get()->get_engine(); + auto desc = dnnl::softmax_backward::desc(diff_md, data_md, axis); + return dnnl::softmax_backward::primitive_desc(desc, cpu_engine, hint_fwd_pd); +} + +bool SupportDNNLSoftmax(const SoftmaxParam& param, const NDArray& data, const NDArray& output) { + // DNNL does not support temperature argument in their softmax function + // now. Need update this once they start to support it. + const int ndim = data.shape().ndim(); + const int in_dtype = data.dtype(); + const int out_dtype = output.dtype(); + const int axis = CheckAxis(param.axis, ndim); + // DNNL does not support temperature argument in their softmax function + // now. Need update this once they start to support it. 
+ // Currently, DNNL shows bad performance when softmax is not performed on the last dimension + if (param.temperature.has_value() || in_dtype != mshadow::kFloat32 || in_dtype != out_dtype || + axis != (ndim - 1)) { + return false; + } + + // only supports ndim = 1, 2, 3, 4 for now + return (ndim >= 1 && ndim <= 4); +} + +class DNNLSoftmaxFwd { + public: + dnnl::softmax_forward::primitive_desc pd; + + DNNLSoftmaxFwd(const bool is_train, const int axis, const dnnl::memory& input) + : pd(GetSoftmaxFwdPd(is_train, axis, input)) { + fwd_ = std::make_shared(pd); + } + + const dnnl::softmax_forward& GetFwd() const { + return *fwd_; + } + + private: + std::shared_ptr fwd_; +}; + +typedef ParamOpSign DNNLSoftmaxSignature; + +static DNNLSoftmaxFwd& GetSoftmaxFwd(const SoftmaxParam& param, + const int real_axis, + const bool is_train, + const NDArray& data, + const NDArray& output) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map fwds; +#else + static MX_THREAD_LOCAL std::unordered_map fwds; +#endif + + DNNLSoftmaxSignature key(param); + key.AddSign(real_axis); + key.AddSign(is_train); + key.AddSign(data); + key.AddSign(output); + + auto it = fwds.find(key); + if (it == fwds.end()) { + DNNLSoftmaxFwd fwd(is_train, real_axis, *(data.GetDNNLData())); + it = AddToCache(&fwds, key, fwd); + } + return it->second; +} + +void DNNLSoftmaxForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& in_data, + const OpReqType& req, + const NDArray& out_data) { + if (req == kNullOp) + return; + // same as the FCompute path, softmax only supports kWriteTo and kWriteInplace for now. + CHECK_NE(req, kAddTo); + + const SoftmaxParam& param = nnvm::get(attrs.parsed); + int axis = CheckAxis(param.axis, in_data.shape().ndim()); + auto fwd = GetSoftmaxFwd(param, axis, ctx.is_train, in_data, out_data); + + auto in_mem = in_data.GetDNNLData(); + auto out_mem = out_data.GetDNNLData(fwd.pd.dst_desc()); + DNNLStream* stream = DNNLStream::Get(); + stream->RegisterPrimArgs(fwd.GetFwd(), {{DNNL_ARG_SRC, *in_mem}, {DNNL_ARG_DST, *out_mem}}); + stream->Submit(); +} + +class DNNLSoftmaxBwd { + public: + dnnl::softmax_backward::primitive_desc pd; + + DNNLSoftmaxBwd(const dnnl::memory& diff_mem, + const dnnl::memory& data_mem, + const int axis, + const dnnl::softmax_forward::primitive_desc& hint_fwd_pd) + : pd(GetSoftmaxBwdPd(diff_mem, data_mem, axis, hint_fwd_pd)) { + bwd_ = std::make_shared(pd); + } + + const dnnl::softmax_backward& GetBwd() const { + return *bwd_; + } + + private: + std::shared_ptr bwd_; +}; + +static DNNLSoftmaxBwd& GetSoftmaxBwd(const SoftmaxParam& param, + const int real_axis, + const std::vector& data, + const std::vector& output) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map bwds; +#else + static MX_THREAD_LOCAL std::unordered_map bwds; +#endif + + DNNLSoftmaxSignature key(param); + key.AddSign(real_axis); + key.AddSign(data); + key.AddSign(output); + + auto it = bwds.find(key); + if (it == bwds.end()) { + auto diff_mem = data[0].GetDNNLData(); + auto data_mem = data[1].GetDNNLData(); + auto fwd_pd = GetSoftmaxFwdPd(true, real_axis, *data_mem); + DNNLSoftmaxBwd bwd(*diff_mem, *data_mem, real_axis, fwd_pd); + it = AddToCache(&bwds, key, bwd); + } + return it->second; +} + +void DNNLSoftmaxBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + if (req[0] == kNullOp) + return; + CHECK_EQ(in_data.size(), 2U); + const SoftmaxParam& param = 
nnvm::get(attrs.parsed); + int axis = CheckAxis(param.axis, in_data[1].shape().ndim()); + auto diff_mem = in_data[0].GetDNNLData(); + auto data_mem = in_data[1].GetDNNLData(); + auto bwd = GetSoftmaxBwd(param, axis, in_data, out_data); + + auto out_mem = CreateDNNLMem(out_data[0], bwd.pd.diff_src_desc(), req[0]); + DNNLStream* stream = DNNLStream::Get(); + dnnl_args_map_t args = {{DNNL_ARG_DST, *data_mem}, + {DNNL_ARG_DIFF_DST, *diff_mem}, + {DNNL_ARG_DIFF_SRC, *out_mem.second}}; + + stream->RegisterPrimArgs(bwd.GetBwd(), args); + CommitOutput(out_data[0], out_mem); + stream->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif diff --git a/src/operator/nn/dnnl/dnnl_softmax_output.cc b/src/operator/nn/dnnl/dnnl_softmax_output.cc new file mode 100644 index 000000000000..7a7d3991dc6c --- /dev/null +++ b/src/operator/nn/dnnl/dnnl_softmax_output.cc @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dnnl_softmax_output.cc + * \brief integrate dnnl softmax to softmax_output forward + * \author Zhang Rong A + */ + +#if MXNET_USE_ONEDNN == 1 +#include "../../softmax_output-inl.h" +#include "./dnnl_base-inl.h" +#include "./dnnl_ops-inl.h" +namespace mxnet { +namespace op { + +static dnnl::softmax_forward::primitive_desc GetSoftmaxOutputFwdDescImpl( + const SoftmaxOutputParam& param, + bool is_train, + const int axis, + const dnnl::memory& input_mem) { + dnnl::memory::desc data_md = input_mem.get_desc(); + auto cpu_engine = CpuEngine::Get()->get_engine(); + auto prop = is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring; + auto desc = dnnl::softmax_forward::desc(prop, data_md, axis); + return dnnl::softmax_forward::primitive_desc(desc, cpu_engine); +} + +typedef ParamOpSign DNNLSoftmaxOuputSignature; + +class DNNLSoftmaxOutputFwd { + std::shared_ptr fwd_; + + public: + const dnnl::softmax_forward::primitive_desc fwd_pd; + + DNNLSoftmaxOutputFwd(const SoftmaxOutputParam& param, + bool is_train, + const int axis, + const dnnl::memory& mem) + : fwd_pd(GetSoftmaxOutputFwdDescImpl(param, is_train, axis, mem)) { + fwd_ = std::make_shared(fwd_pd); + } + + const inline dnnl::softmax_forward& GetFwd() const { + return *fwd_; + } +}; + +static DNNLSoftmaxOutputFwd& GetSoftmaxOutputForward(const SoftmaxOutputParam& param, + const OpContext& ctx, + const NDArray& in_data) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map + fwds; +#else + static MX_THREAD_LOCAL std::unordered_map + fwds; +#endif + DNNLSoftmaxOuputSignature key(param); + key.AddSign(ctx.is_train); + key.AddSign(in_data); + + // softmax_output has no axis parameter, so use it as it original implement. 
+  int axis = in_data.shape().ndim() - 1;
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    auto in_mem = *(in_data.GetDNNLData());
+    DNNLSoftmaxOutputFwd fwd(param, ctx.is_train, axis, in_mem);
+    it = AddToCache(&fwds, key, fwd);
+  }
+  return it->second;
+}
+
+// This is only used for forward. For backward, need to double-check compatibility.
+bool SupportDNNLSoftmaxOutput(const SoftmaxOutputParam& param) {
+  return param.multi_output ? false : true;
+}
+
+void DNNLSoftmaxOutputForward(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx,
+                              const std::vector<NDArray>& in_data,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<NDArray>& out_data) {
+  const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
+
+  NDArray idata = in_data[softmaxout_enum::kData];
+  NDArray odata = out_data[softmaxout_enum::kOut];
+  if (in_data[softmaxout_enum::kData].IsView() && in_data[softmaxout_enum::kData].IsDNNLData()) {
+    idata = in_data[softmaxout_enum::kData].Reorder2Default();
+  }
+
+  auto input_mem = idata.GetDNNLData();
+  auto out_mem = CreateDNNLMem(
+      out_data[softmaxout_enum::kOut], input_mem->get_desc(), req[softmaxout_enum::kOut]);
+
+  DNNLSoftmaxOutputFwd& fwd = GetSoftmaxOutputForward(param, ctx, idata);
+
+  DNNLStream* stream = DNNLStream::Get();
+  stream->RegisterPrimArgs(fwd.GetFwd(),
+                           {{DNNL_ARG_SRC, *input_mem}, {DNNL_ARG_DST, *out_mem.second}});
+  CommitOutput(out_data[softmaxout_enum::kOut], out_mem);
+  stream->Submit();
+}
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/nn/dnnl/dnnl_sum.cc b/src/operator/nn/dnnl/dnnl_sum.cc
new file mode 100644
index 000000000000..14b7deafd66e
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_sum.cc
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_sum.cc
+ * \brief
+ * \author Da Zheng
+ */
+#include <iostream>
+
+#include "../../operator_common.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
+
+namespace mxnet {
+namespace op {
+
+#if MXNET_USE_ONEDNN == 1
+void DNNLSum(const dnnl::memory& arr1, const dnnl::memory& arr2, const dnnl::memory& out) {
+  std::vector<dnnl::memory::desc> input_pds(2);
+  std::vector<float> scales(2, 1);
+  input_pds[0] = arr1.get_desc();
+  input_pds[1] = arr2.get_desc();
+  CHECK(input_pds[0] == input_pds[1]);
+  const dnnl::memory* in_mem1 = &arr1;
+  const dnnl::memory* in_mem2 = &arr2;
+  auto output_pd = out.get_desc();
+  if (input_pds[0] != output_pd) {
+    auto tmp_memory1 = TmpMemMgr::Get()->Alloc(output_pd);
+    auto tmp_memory2 = TmpMemMgr::Get()->Alloc(output_pd);
+    DNNLMemoryCopy(arr1, tmp_memory1);
+    DNNLMemoryCopy(arr2, tmp_memory2);
+    input_pds[0] = tmp_memory1->get_desc();
+    input_pds[1] = tmp_memory2->get_desc();
+    in_mem1 = tmp_memory1;
+    in_mem2 = tmp_memory2;
+  }
+  dnnl::sum::primitive_desc sum_pd(output_pd, scales, input_pds, CpuEngine::Get()->get_engine());
+  dnnl_args_map_t args = {
+      {DNNL_ARG_MULTIPLE_SRC, *in_mem1},
+      {DNNL_ARG_MULTIPLE_SRC + 1, *in_mem2},
+      {DNNL_ARG_DST, out},
+  };
+  DNNLStream::Get()->RegisterPrimArgs(dnnl::sum(sum_pd), args);
+}
+
+class DNNLSumFwd {
+ public:
+  dnnl::sum::primitive_desc fwd_pd;
+
+  DNNLSumFwd(const std::vector<float>& scales, const std::vector<dnnl::memory::desc>& data_md)
+      : fwd_pd(scales, data_md, CpuEngine::Get()->get_engine()) {
+    fwd_ = std::make_shared<dnnl::sum>(fwd_pd);
+  }
+
+  const dnnl::sum& GetFwd() const {
+    return *fwd_;
+  }
+
+ private:
+  std::shared_ptr<dnnl::sum> fwd_;
+};
+
+static DNNLSumFwd& GetSumForward(const std::vector<float>& scales,
+                                 const std::vector<NDArray>& in_data,
+                                 const std::vector<dnnl::memory::desc>& data_md) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<OpSignature, DNNLSumFwd, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<OpSignature, DNNLSumFwd, OpHash> fwds;
+#endif
+  OpSignature key;
+  key.AddSign(in_data);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    DNNLSumFwd fwd(scales, data_md);
+    it = AddToCache(&fwds, key, fwd);
+  }
+  return it->second;
+}
+
+void DNNLSumForward(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const std::vector<NDArray>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<NDArray>& outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
+  const int num_inputs = inputs.size();
+  const NDArray& out_data = outputs[0];
+  std::vector<dnnl::memory::desc> data_md;
+  std::vector<const dnnl::memory*> data_mem;
+  std::vector<float> scales(num_inputs, 1);
+
+  data_md.reserve(num_inputs);
+  data_mem.reserve(num_inputs);
+
+  for (int i = 0; i < num_inputs; ++i) {
+    const dnnl::memory* in_mem = inputs[i].GetDNNLData();
+    dnnl::memory::desc tmp_md = in_mem->get_desc();
+    data_md.push_back(tmp_md);
+    data_mem.push_back(in_mem);
+  }
+
+  DNNLSumFwd& fwd = GetSumForward(scales, inputs, data_md);
+  mxnet::dnnl_output_t out_mem = CreateDNNLMem(out_data, fwd.fwd_pd.dst_desc(), req[0], &inputs[0]);
+  dnnl_args_map_t net_args;
+  net_args.insert({DNNL_ARG_DST, *out_mem.second});
+  for (int i = 0; i < num_inputs; ++i) {
+    net_args.insert({DNNL_ARG_MULTIPLE_SRC + i, *data_mem[i]});
+  }
+  DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
+  CommitOutput(out_data, out_mem);
+  DNNLStream::Get()->Submit();
+}
+#endif
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/nn/mkldnn/mkldnn_transpose.cc b/src/operator/nn/dnnl/dnnl_transpose.cc
similarity index 56%
rename from src/operator/nn/mkldnn/mkldnn_transpose.cc
rename to src/operator/nn/dnnl/dnnl_transpose.cc
index 27dc1adbcf0d..7a4f6941fc7e 100644
--- a/src/operator/nn/mkldnn/mkldnn_transpose.cc
+++ b/src/operator/nn/dnnl/dnnl_transpose.cc @@ -18,21 +18,21 @@ */ /*! - * \file mkldnn_transpose.cc - * \brief Implement transpose operator via MKL-DNN reorder primitive + * \file dnnl_transpose.cc + * \brief Implement transpose operator via DNNL reorder primitive * \author Tao Lv */ #if MXNET_USE_ONEDNN == 1 -#include +#include #include "../../tensor/matrix_op-inl.h" namespace mxnet { namespace op { -bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray& data) { +bool SupportDNNLTranspose(const TransposeParam& param, const NDArray& data) { auto data_ndim = data.shape().ndim(); if (data_ndim > 4 || data_ndim == 0 || data.shape().Size() == 0 || @@ -42,17 +42,17 @@ bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray& data) { return true; } -typedef ParamOpSign MKLDNNTransposeSignature; +typedef ParamOpSign DNNLTransposeSignature; -class MKLDNNTransposeForward { +class DNNLTransposeForward { public: - std::shared_ptr data_; - std::shared_ptr out_; - std::shared_ptr dst_md_; - std::shared_ptr transpose_; + std::shared_ptr data_; + std::shared_ptr out_; + std::shared_ptr dst_md_; + std::shared_ptr transpose_; public: - MKLDNNTransposeForward(const TransposeParam& param, const NDArray& data) { + DNNLTransposeForward(const TransposeParam& param, const NDArray& data) { auto shape = data.shape(); auto data_ndim = shape.ndim(); auto axes_ndim = param.axes.ndim(); @@ -66,12 +66,12 @@ class MKLDNNTransposeForward { } auto engine = CpuEngine::Get()->get_engine(); - auto in_mem = data.GetMKLDNNData(); + auto in_mem = data.GetDNNLData(); auto src_md = in_mem->get_desc(); - data_ = std::make_shared(src_md, engine, nullptr); + data_ = std::make_shared(src_md, engine, nullptr); - mkldnn_dims_t strides; - mkldnn_dims_t sh; + dnnl_dims_t strides; + dnnl_dims_t sh; dim_t total_stride = 1; for (int i = data_ndim - 1; i >= 0; i--) { sh[i] = shape[i]; @@ -79,67 +79,64 @@ class MKLDNNTransposeForward { total_stride *= shape[axes[i]]; } - mkldnn_memory_desc_t dst_fmt; - mkldnn_memory_desc_init_by_strides(&dst_fmt, data_ndim, sh, mkldnn_f32, strides); + dnnl_memory_desc_t dst_fmt; + dnnl_memory_desc_init_by_strides(&dst_fmt, data_ndim, sh, dnnl_f32, strides); - dst_md_ = std::make_shared(dst_fmt); - out_ = std::make_shared(*dst_md_, engine, nullptr); + dst_md_ = std::make_shared(dst_fmt); + out_ = std::make_shared(*dst_md_, engine, nullptr); - transpose_ = std::make_shared(*data_, *out_); + transpose_ = std::make_shared(*data_, *out_); } void SetNewMem(const NDArray& data, const NDArray& output) { - if (data.IsMKLDNNData()) { - this->data_->set_data_handle(data.GetMKLDNNData()->get_data_handle()); + if (data.IsDNNLData()) { + this->data_->set_data_handle(data.GetDNNLData()->get_data_handle()); } else { MSHADOW_TYPE_SWITCH( data.dtype(), DTYPE, { this->data_->set_data_handle(data.data().dptr()); }); } - CHECK(!output.IsMKLDNNData()); + CHECK(!output.IsDNNLData()); MSHADOW_TYPE_SWITCH( output.dtype(), DTYPE, { this->out_->set_data_handle(output.data().dptr()); }); } - const mkldnn::reorder& GetFwd() const { + const dnnl::reorder& GetFwd() const { return *transpose_; } void Execute() const { - auto stream = MKLDNNStream::Get(); - mkldnn_args_map_t net_args; - net_args.insert({{MKLDNN_ARG_FROM, *(data_)}, {MKLDNN_ARG_TO, *(out_)}}); + auto stream = DNNLStream::Get(); + dnnl_args_map_t net_args; + net_args.insert({{DNNL_ARG_FROM, *(data_)}, {DNNL_ARG_TO, *(out_)}}); stream->RegisterPrimArgs(*transpose_, net_args); stream->Submit(); } }; -static MKLDNNTransposeForward& 
GetTransposeForward(const TransposeParam& param, - const NDArray& data) { +static DNNLTransposeForward& GetTransposeForward(const TransposeParam& param, const NDArray& data) { #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map - fwds; + static thread_local std::unordered_map fwds; #else - static MX_THREAD_LOCAL - std::unordered_map - fwds; + static MX_THREAD_LOCAL std::unordered_map + fwds; #endif - MKLDNNTransposeSignature key(param); + DNNLTransposeSignature key(param); key.AddSign(data); auto it = fwds.find(key); if (it == fwds.end()) { - MKLDNNTransposeForward fwd(param, data); + DNNLTransposeForward fwd(param, data); it = AddToCache(&fwds, key, fwd); } return it->second; } -void MKLDNNTransposeForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& data, - const OpReqType& req, - const NDArray& output) { +void DNNLTransposeForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const NDArray& data, + const OpReqType& req, + const NDArray& output) { const TransposeParam& param = nnvm::get(attrs.parsed); auto fwd = GetTransposeForward(param, data); diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index de9351aabdc5..f5d6c2c96634 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -21,14 +21,14 @@ * \file fully_connected.cc * \brief fully connect operator */ +#include "./dnnl/dnnl_base-inl.h" +#include "./dnnl/dnnl_ops-inl.h" #include "./fully_connected-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" -#include "./mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { -bool SupportMKLDNNFC(const NDArray& input) { +bool SupportDNNLFC(const NDArray& input) { int ndim = input.shape().ndim(); return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && (ndim >= 1 && ndim <= 4) && input.storage_type() == kDefaultStorage; @@ -97,10 +97,10 @@ void FullyConnectedComputeExCPU(const nnvm::NodeAttrs& attrs, #if MXNET_USE_ONEDNN == 1 if (common::ContainsOnlyStorage(inputs, kDefaultStorage) && common::ContainsOnlyStorage(outputs, kDefaultStorage)) { - if (SupportMKLDNNFC(inputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNFCForward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(FullyConnectedCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLFC(inputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLFCForward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(FullyConnectedCompute, attrs, ctx, inputs, req, outputs); } else { FallBackCompute(FullyConnectedCompute, attrs, ctx, inputs, req, outputs); } @@ -110,7 +110,7 @@ void FullyConnectedComputeExCPU(const nnvm::NodeAttrs& attrs, std::vector temp_ndarrays; std::vector in_blobs; for (const NDArray& in : inputs) { - // if ndarray is in default storage and MKLDNN is available, + // if ndarray is in default storage and DNNL is available, // need to make sure cpu layout data is used, instead of MKL layout if (in.storage_type() == kDefaultStorage) { temp_ndarrays.push_back(in.Reorder2Default()); @@ -145,10 +145,10 @@ void FullyConnectedGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (SupportMKLDNNFC(inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNFCBackward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(FullyConnectedGradCompute, attrs, ctx, inputs, req, outputs); + if 
(SupportDNNLFC(inputs[0])) { + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLRun(DNNLFCBackward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(FullyConnectedGradCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(FullyConnectedGradCompute, attrs, ctx, inputs, req, outputs); @@ -208,7 +208,7 @@ static bool FCStorageType(const nnvm::NodeAttrs& attrs, out_attrs, mxnet::kDefaultStorage, dispatch_mode, DispatchMode::kFComputeEx); } #if MXNET_USE_ONEDNN == 1 - if (!MKLDNNEnvSet()) + if (!DNNLEnvSet()) *dispatch_mode = DispatchMode::kFComputeFallback; #endif @@ -240,7 +240,7 @@ static bool BackwardFCStorageType(const nnvm::NodeAttrs& attrs, out_attrs, mxnet::kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); } #if MXNET_USE_ONEDNN == 1 - if (!MKLDNNEnvSet()) + if (!DNNLEnvSet()) *dispatch_mode = DispatchMode::kFComputeFallback; #endif return dispatched; @@ -304,7 +304,7 @@ If ``no_bias`` is set to be true, then the ``bias`` term is ignored. return std::vector{"output"}; }) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; @@ -341,7 +341,7 @@ NNVM_REGISTER_OP(_backward_FullyConnected) .set_attr("FInferStorageType", BackwardFCStorageType) .set_attr_parser(ParamParser) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", FullyConnectedGradComputeExCPU) #endif .set_attr("FCompute", FullyConnectedGradCompute); diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc index 89d1c2e3cbf5..98222821a43c 100644 --- a/src/operator/nn/layer_norm.cc +++ b/src/operator/nn/layer_norm.cc @@ -57,8 +57,8 @@ #include #include "../elemwise_op_common.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_base-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" +#include "./dnnl/dnnl_base-inl.h" +#include "./dnnl/dnnl_ops-inl.h" #endif // MXNET_USE_ONEDNN #if MSHADOW_USE_MKL == 1 @@ -423,7 +423,7 @@ static bool LayerNormInferStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK(!in_attrs->empty()); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } static void LayerNormComputeExCPU(const nnvm::NodeAttrs& attrs, @@ -432,10 +432,10 @@ static void LayerNormComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { const LayerNormParam& param = nnvm::get(attrs.parsed); - if (SupportMKLDNNLayerNorm(param, inputs)) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNLayerNormForward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(LayerNormCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLLayerNorm(param, inputs)) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLLayerNormForward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(LayerNormCompute, attrs, ctx, inputs, req, outputs); return; } else { FallBackCompute(LayerNormCompute, attrs, ctx, inputs, req, outputs); @@ -448,10 +448,10 @@ static void LayerNormGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { const LayerNormParam& param = nnvm::get(attrs.parsed); - if (SupportMKLDNNLayerNorm(param, inputs)) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNLayerNormBackward, attrs, 
ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(LayerNormGradCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNLLayerNorm(param, inputs)) { + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLRun(DNNLLayerNormBackward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(LayerNormGradCompute, attrs, ctx, inputs, req, outputs); return; } else { FallBackCompute(LayerNormGradCompute, attrs, ctx, inputs, req, outputs); @@ -508,7 +508,7 @@ axis to be the last item in the input shape. .set_attr("FInferType", ElemwiseType<3, 3>) .set_attr("FCompute", LayerNormCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FInferStorageType", LayerNormInferStorageType) .set_attr("FComputeEx", LayerNormComputeExCPU) #endif @@ -523,8 +523,8 @@ axis to be the last item in the input shape. heads.emplace_back(n, 2, 0); // std #if MXNET_USE_ONEDNN == 1 heads.push_back( - n->inputs[2]); // beta - needed for MKLDNN backward propagation; - // added at the end in case of fallback to non MKLDNN version + n->inputs[2]); // beta - needed for DNNL backward propagation; + // added at the end in case of fallback to non DNNL version #endif return MakeGradNode("_backward_LayerNorm", n, heads, n->attrs.dict); }) @@ -554,7 +554,7 @@ NNVM_REGISTER_OP(_backward_LayerNorm) .set_attr("FCompute", LayerNormGradCompute) #if MXNET_USE_ONEDNN == 1 .set_attr("FInferStorageType", LayerNormInferStorageType) - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", LayerNormGradComputeExCPU) #endif .set_attr("FResourceRequest", [](const NodeAttrs& n) { diff --git a/src/operator/nn/log_softmax.cc b/src/operator/nn/log_softmax.cc index de0cb8dae7f0..197f8922d797 100644 --- a/src/operator/nn/log_softmax.cc +++ b/src/operator/nn/log_softmax.cc @@ -26,8 +26,8 @@ #include "../tensor/elemwise_binary_op.h" #include "../operator_common.h" #if MXNET_USE_ONEDNN == 1 -#include "mkldnn/mkldnn_base-inl.h" -#include "mkldnn/mkldnn_ops-inl.h" +#include "dnnl/dnnl_base-inl.h" +#include "dnnl/dnnl_ops-inl.h" #endif namespace mxnet { @@ -42,11 +42,11 @@ static void LogSoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs, if (inputs[0].shape().Size() == 0U) return; const SoftmaxParam& param = nnvm::get(attrs.parsed); - if (SupportMKLDNNLogSoftmax(param, inputs[0], outputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNLogSoftmaxForward, attrs, ctx, inputs[0], req[0], outputs[0]); + if (SupportDNNLLogSoftmax(param, inputs[0], outputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLLogSoftmaxForward, attrs, ctx, inputs[0], req[0], outputs[0]); auto fn = SoftmaxCompute; - MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(SoftmaxCompute, attrs, ctx, inputs, req, outputs); @@ -60,11 +60,11 @@ static void LogSoftmaxGradComputeExCPU(const nnvm::NodeAttrs& attrs, if (inputs[0].shape().Size() == 0U) return; const SoftmaxParam& param = nnvm::get(attrs.parsed); - if (SupportMKLDNNLogSoftmax(param, inputs[1], outputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNLogSoftmaxBackward, attrs, ctx, inputs, req, outputs); + if (SupportDNNLLogSoftmax(param, inputs[1], outputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLLogSoftmaxBackward, attrs, ctx, inputs, req, outputs); auto fn = SoftmaxGradCompute; - MKLDNN_OPCHECK_RUN(fn, attrs, 
ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(SoftmaxGradCompute, @@ -83,7 +83,7 @@ inline static bool LogSoftmaxStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } inline static bool LogSoftmaxGradStorageType(const nnvm::NodeAttrs& attrs, @@ -100,7 +100,7 @@ inline static bool LogSoftmaxGradStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), num_inputs); CHECK_EQ(out_attrs->size(), 1U); - return MKLDNNStorageType(attrs, dev_mask, support, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, support, dispatch_mode, in_attrs, out_attrs); } #endif @@ -129,7 +129,7 @@ Examples:: }) .set_attr("FCompute", SoftmaxCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", LogSoftmaxComputeExCPU) .set_attr("FInferStorageType", LogSoftmaxStorageType) #endif @@ -155,7 +155,7 @@ NNVM_REGISTER_OP(_backward_log_softmax) .add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments") .set_attr_parser(ParamParser) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", LogSoftmaxGradComputeExCPU) .set_attr("FInferStorageType", LogSoftmaxGradStorageType) #endif diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 2c17e47e2b01..c121be272003 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -26,8 +26,8 @@ #include "./lrn-inl.h" #include "../operator_common.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_lrn-inl.h" -#include "./mkldnn/mkldnn_base-inl.h" +#include "./dnnl/dnnl_base-inl.h" +#include "./dnnl/dnnl_lrn-inl.h" #endif namespace mxnet { @@ -89,7 +89,7 @@ bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK(!in_attrs->empty()); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, @@ -99,7 +99,7 @@ bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK(!in_attrs->empty()); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } void LRNComputeExCPU(const nnvm::NodeAttrs& attrs, @@ -107,13 +107,13 @@ void LRNComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (SupportMKLDNN(inputs[0])) { + if (SupportDNNL(inputs[0])) { // We only need to test one output array. - MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs); - MKLDNNRun(MKLDNNLRNForward, attrs, ctx, inputs[0], req[0], outputs[0]); - MKLDNN_OPCHECK_RUN(LRNCompute, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_INIT(false, 1, inputs, outputs); + DNNLRun(DNNLLRNForward, attrs, ctx, inputs[0], req[0], outputs[0]); + DNNL_OPCHECK_RUN(LRNCompute, attrs, ctx, inputs, req, outputs); // Copy outputs[1] from opcheck reference as backward check needs it. 
- MKLDNN_OPCHECK_COPY_RESULT(outputs, std::vector{1}); + DNNL_OPCHECK_COPY_RESULT(outputs, std::vector{1}); return; } FallBackCompute(LRNCompute, attrs, ctx, inputs, req, outputs); @@ -124,10 +124,10 @@ void LRNGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (SupportMKLDNN(inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNLRNBackward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(LRNGradCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNL(inputs[0])) { + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLRun(DNNLLRNBackward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(LRNGradCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(LRNGradCompute, attrs, ctx, inputs, req, outputs); @@ -173,7 +173,7 @@ number of kernels in the layer. }) .set_attr("FCompute", LRNCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", LRNComputeExCPU) #endif .set_attr("FGradient", LRNGrad{"_backward_LRN"}) @@ -189,10 +189,10 @@ NNVM_REGISTER_OP(_backward_LRN) #endif .set_attr("TIsBackward", true) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", LRNGradComputeExCPU) - // Native compute requires norm while MKLDNN does not so cannot be compared in debug mode - .set_attr("TExcludeMKLDNNDebug", true) + // Native compute requires norm while DNNL does not so cannot be compared in debug mode + .set_attr("TExcludeDNNLDebug", true) #endif .set_attr("FCompute", LRNGradCompute); diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h deleted file mode 100644 index f14eef60b5b6..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_act-inl.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file mkldnn_act-inl.h - * \brief MKLDNN Activation operator - * /author Zhiyuan Huang - */ - -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ - -#if MXNET_USE_ONEDNN == 1 -#include -#include - -#include "../../leaky_relu-inl.h" -#include "../activation-inl.h" - -namespace mxnet { -namespace op { - -struct MKLDNNActParam { - mkldnn::algorithm alg; - float slope = 0.f; - - bool operator==(const MKLDNNActParam& other) const { - return this->alg == other.alg && this->slope == other.slope; - } -}; - -mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param); -mkldnn::algorithm GetMKLDNNActAlgo(const LeakyReLUParam& param); - -mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(const MKLDNNActParam& param, - bool is_train, - const mkldnn::memory& input_mem); - -class MKLDNNActForward { - public: - const mkldnn::eltwise_forward::primitive_desc fwd_pd; - - MKLDNNActForward(const MKLDNNActParam& param, - bool is_train, - const NDArray& data, - const mkldnn::memory& mem) - : fwd_pd(GetActFwdDescImpl(param, is_train, mem)) { - fwd_ = std::make_shared(fwd_pd); - } - const inline mkldnn::eltwise_forward& GetFwd() const; - - private: - std::shared_ptr fwd_; -}; - -typedef ParamOpSign MKLDNNActSignature; -MKLDNNActForward& GetActForward(const MKLDNNActParam& param, - const OpContext& ctx, - const NDArray& in_data, - const mkldnn::memory& in_mem); - -mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(const MKLDNNActParam& param, - const mkldnn::memory& input_mem, - const mkldnn::memory& diff_dst_memory); - -class MKLDNNActBackward { - public: - const mkldnn::eltwise_backward::primitive_desc bwd_pd; - - explicit MKLDNNActBackward(const MKLDNNActParam& param, - const NDArray& data, - const mkldnn::memory& mem, - const mkldnn::memory& diff_dst_memory) - : bwd_pd(GetActBwdDescImpl(param, mem, diff_dst_memory)) { - bwd_prim_ = std::make_shared(bwd_pd); - } - const inline mkldnn::eltwise_backward& GetBwd() const; - - private: - std::shared_ptr bwd_prim_; -}; -} // namespace op -} // namespace mxnet - -namespace std { -template <> -struct hash { - size_t operator()(const mxnet::op::MKLDNNActParam& val) { - size_t ret = 0; - ret = dmlc::HashCombine(ret, static_cast(val.alg)); - ret = dmlc::HashCombine(ret, val.slope); - return ret; - } -}; -} // namespace std - -#endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc deleted file mode 100644 index afaf5e9ced2e..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_act.cc +++ /dev/null @@ -1,325 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file mkldnn_act.cc - * \brief - * \author Da Zheng - */ - -#if MXNET_USE_ONEDNN == 1 - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "./mkldnn_base-inl.h" - -#include "../../operator_common.h" - -#include "mkldnn_act-inl.h" - -namespace mxnet { -namespace op { - -bool SupportMKLDNNAct(const ActivationParam& param) { - return param.act_type == activation::kReLU || param.act_type == activation::kSigmoid || - param.act_type == activation::kLogSigmoid || param.act_type == activation::kMish || - param.act_type == activation::kSoftReLU || param.act_type == activation::kTanh; -} - -bool SupportMKLDNNAct(const ActivationParam& param, const NDArray& input) { - // MKL-DNN Activation supports 1d, 2d, 3d, 4d and 5d data layout - if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) || - !(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16)) - return false; - return SupportMKLDNNAct(param); -} - -bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param) { - return param.act_type == leakyrelu::kLeakyReLU || param.act_type == leakyrelu::kELU || - param.act_type == leakyrelu::kGELU; -} - -bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray& input) { - // MKL-DNN Activation supports 1d, 2d, 3d, 4d and 5d data layout - if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) || - !(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16)) - return false; - return SupportMKLDNNLeakyRelu(param); -} - -bool SupportQuantizedMKLDNNAct(const ActivationParam& param) { - // TODO(zhennan): Add more activation type when mkldnn supports. - // Remove this when it's identity to SupportMKLDNNAct. - return param.act_type == activation::kReLU; -} - -mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { - switch (param.act_type) { - case activation::kReLU: - return mkldnn::algorithm::eltwise_relu; - case activation::kSigmoid: - return mkldnn::algorithm::eltwise_logistic; - case activation::kLogSigmoid: - return mkldnn::algorithm::eltwise_logsigmoid; - case activation::kMish: - return mkldnn::algorithm::eltwise_mish; - case activation::kTanh: - return mkldnn::algorithm::eltwise_tanh; - case activation::kSoftReLU: - return mkldnn::algorithm::eltwise_soft_relu; - default: - LOG(FATAL) << "unknown activation type"; - return mkldnn::algorithm::eltwise_relu; - } -} - -mkldnn::algorithm GetMKLDNNActAlgo(const LeakyReLUParam& param) { - switch (param.act_type) { - case leakyrelu::kLeakyReLU: - return mkldnn::algorithm::eltwise_relu; - case leakyrelu::kELU: - return mkldnn::algorithm::eltwise_elu; - case leakyrelu::kGELU: - return mkldnn::algorithm::eltwise_gelu_erf; - default: - LOG(FATAL) << "unknown activation type for LeakyReLU: " << param.act_type; - return mkldnn::algorithm::eltwise_relu; - } -} - -mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(const MKLDNNActParam& param, - bool is_train, - const mkldnn::memory& input_mem) { - mkldnn::memory::desc data_md = input_mem.get_desc(); - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto alg = param.alg; - - auto prop = is_train ? 
mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - auto desc = mkldnn::eltwise_forward::desc(prop, alg, data_md, param.slope); - return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine); -} - -const inline mkldnn::eltwise_forward& MKLDNNActForward::GetFwd() const { - return *fwd_; -} - -MKLDNNActForward& GetActForward(const MKLDNNActParam& param, - const OpContext& ctx, - const NDArray& in_data, - const mkldnn::memory& in_mem) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map fwds; -#else - static MX_THREAD_LOCAL std::unordered_map fwds; -#endif - MKLDNNActSignature key(param); - key.AddSign(ctx.is_train); - key.AddSign(static_cast(param.alg)); - key.AddSign(param.slope); - key.AddSign(in_data); - auto it = fwds.find(key); - if (it == fwds.end()) { - MKLDNNActForward fwd(param, ctx.is_train, in_data, in_mem); - it = AddToCache(&fwds, key, fwd); - } - return it->second; -} - -void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data) { - const ActivationParam& param = nnvm::get(attrs.parsed); - MKLDNNActParam param_; - param_.alg = GetMKLDNNActAlgo(param); - const NDArray& in_buffer = in_data; - MKLDNNStream* stream = MKLDNNStream::Get(); - auto input_mem = in_buffer.GetMKLDNNData(); - MKLDNNActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem); - auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer); - stream->RegisterPrimArgs(fwd.GetFwd(), - {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem_t.second}}); - CommitOutput(out_data, out_mem_t); - stream->Submit(); -} - -void MKLDNNLeakyReluForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data) { - const LeakyReLUParam& param = nnvm::get(attrs.parsed); - MKLDNNActParam param_; - param_.alg = GetMKLDNNActAlgo(param); - param_.slope = param.slope; - - NDArray in_buffer = in_data; - MKLDNNStream* stream = MKLDNNStream::Get(); - - if (in_data.IsView() && in_data.IsMKLDNNData()) - in_buffer = in_data.Reorder2Default(); - - auto input_mem = in_buffer.GetMKLDNNData(); - MKLDNNActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem); - auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer); - stream->RegisterPrimArgs(fwd.GetFwd(), - {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem_t.second}}); - CommitOutput(out_data, out_mem_t); - stream->Submit(); -} - -mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(const MKLDNNActParam& param, - const mkldnn::memory& input_mem, - const mkldnn::memory& diff_dst_memory) { - mkldnn::memory::desc data_md = input_mem.get_desc(); - mkldnn::memory::desc diff_md = diff_dst_memory.get_desc(); - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto alg = param.alg; - - mkldnn::eltwise_forward::desc fw_desc( - mkldnn::prop_kind::forward_training, alg, data_md, param.slope); - mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine); - mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, param.slope); - mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc); - return bw_pdesc; -} - -const inline mkldnn::eltwise_backward& MKLDNNActBackward::GetBwd() const { - return *bwd_prim_; -} - -static inline MKLDNNActBackward& GetActBackward(const MKLDNNActParam& param, - const OpContext& ctx, - const NDArray& in_data, - const NDArray& out_grad, - 
const mkldnn::memory& in_mem) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map bwds; -#else - static MX_THREAD_LOCAL std::unordered_map bwds; -#endif - MKLDNNActSignature key(param); - key.AddSign(in_data); - key.AddSign(out_grad); - - auto it = bwds.find(key); - if (it == bwds.end()) { - MKLDNNActBackward bwd(param, in_data, in_mem, *out_grad.GetMKLDNNData()); - it = AddToCache(&bwds, key, bwd); - } - return it->second; -} - -// For backward relu activation, it's okay to pass "out_data" as "in_data" to this -// function, since the computation only involes non-zeros. -void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (req[0] == kNullOp) { - return; - } - const ActivationParam& param = nnvm::get(attrs.parsed); - // XXX: for y = relu(x), y is passed as "in_data" to Backward() - const bool relu = param.act_type == activation::kReLU; - const NDArray& out_buffer = inputs[0]; - const NDArray& in_buffer = relu ? inputs[1] : inputs[2]; - const NDArray& in_grad = outputs[0]; - MKLDNNActParam param_; - param_.alg = GetMKLDNNActAlgo(param); - TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]); - auto diff_dst_memory = out_buffer.GetMKLDNNData(); - auto input_mem = in_buffer.GetMKLDNNData(); - // We need to make sure the two inputs to eltwise_backward has the same memory - // descriptor. Otherwise, the perf will suffer. - if (input_mem->get_desc() != diff_dst_memory->get_desc()) { - input_mem = in_buffer.GetMKLDNNDataReorder(diff_dst_memory->get_desc()); - } - - MKLDNNActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem); - MKLDNNStream* stream = MKLDNNStream::Get(); - mkldnn_args_map_t args = {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}}; - if (req[0] != kAddTo) { - // req[0] is kWriteTo or kWriteInplace - auto diff_src_memory = - const_cast(in_grad).CreateMKLDNNData(bwd.bwd_pd.diff_src_desc()); - args.insert({MKLDNN_ARG_DIFF_SRC, *diff_src_memory}); - stream->RegisterPrimArgs(bwd.GetBwd(), args); - stream->Submit(); - } else { - auto diff_src_memory = CreateMKLDNNMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]); - args.insert({MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second}); - stream->RegisterPrimArgs(bwd.GetBwd(), args); - CommitOutput(in_grad, diff_src_memory); - stream->Submit(); - } -} - -void MKLDNNLeakyReluBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (req[0] == kNullOp) { - return; - } - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - const NDArray& out_buffer = inputs[0]; - const NDArray& in_buffer = inputs[1]; - const NDArray& output = outputs[0]; - - const LeakyReLUParam& param = nnvm::get(attrs.parsed); - MKLDNNActParam param_; - param_.alg = GetMKLDNNActAlgo(param); - param_.slope = param.slope; - - TmpMemMgr::Get()->Init(ctx.requested[leakyrelu::kRandom]); - auto diff_dst_memory = out_buffer.GetMKLDNNData(); - auto input_mem = in_buffer.GetMKLDNNData(); - // We need to make sure the two inputs to eltwise_backward has the same memory - // descriptor. Otherwise, the perf will suffer. 
- if (input_mem->get_desc() != diff_dst_memory->get_desc()) - input_mem = in_buffer.GetMKLDNNDataReorder(diff_dst_memory->get_desc()); - MKLDNNActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem); - MKLDNNStream* stream = MKLDNNStream::Get(); - mkldnn_output_t diff_src_memory = CreateMKLDNNMem(output, bwd.bwd_pd.diff_src_desc(), req[0]); - mkldnn_args_map_t args = { - {MKLDNN_ARG_SRC, *input_mem}, - {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}, - {MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second}, - }; - stream->RegisterPrimArgs(bwd.GetBwd(), args); - CommitOutput(output, diff_src_memory); - stream->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_batch_dot.cc b/src/operator/nn/mkldnn/mkldnn_batch_dot.cc deleted file mode 100644 index 87ddb9876023..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_batch_dot.cc +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_batch_dot.cc - * \author: Bartosz Kuncer, bartosz.kuncer@intel.com - */ - -#if MXNET_USE_ONEDNN == 1 - -#include "./mkldnn_batch_dot-inl.h" - -namespace mxnet { -namespace op { - -bool SupportMKLDNNBatchDot(const std::vector& inputs, const NDArray& output) { - return inputs[0].shape().Size() != 0 && inputs[1].shape().Size() != 0 && - output.shape().Size() != 0 && - (inputs[0].dtype() == mshadow::kFloat32 || inputs[0].dtype() == mshadow::kBfloat16); -} - -void MKLDNNBatchDotForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const DotParam& param = nnvm::get(attrs.parsed); - MKLDNNBatchDotFwd& fwd = MKLDNNBatchDotFwd::GetCached(param, inputs, outputs); - fwd.Execute(inputs, req, outputs); -} - -MKLDNNBatchDotFwd& MKLDNNBatchDotFwd::GetCached(const DotParam& param, - const std::vector& inputs, - const std::vector& outputs) { - using batch_dot_fwd_map = std::unordered_map; -#if DMLC_CXX11_THREAD_LOCAL - static thread_local batch_dot_fwd_map fwds; -#else - static MX_THREAD_LOCAL batch_dot_fwd_map fwds; -#endif - - BatchDotSignature key(param); - key.AddSign(inputs[0]); - key.AddSign(inputs[1]); - key.AddSign(outputs[0]); - - auto it = fwds.find(key); - if (it == fwds.end()) { - const MKLDNNBatchDotFwd fwd(param, inputs, outputs); - it = AddToCache(&fwds, key, fwd); - } - return it->second; -} - -MKLDNNBatchDotFwd::MKLDNNBatchDotFwd(const DotParam& param, - const std::vector& inputs, - const std::vector& outputs) { - auto shape = inputs[0].shape(); - auto ndim = shape.ndim(); - auto bigDim = shape[0]; - for (size_t i = 1; i < ndim - 2; ++i) { - bigDim *= shape[i]; - } - - auto GetMemoryDesc = [&ndim, &bigDim](const NDArray& tensor, const bool transpose) { - 
auto shape = tensor.shape(); - if (transpose) { - return mkldnn::memory::desc(mkldnn::memory::dims{bigDim, shape[ndim - 1], shape[ndim - 2]}, - get_mkldnn_type(tensor.dtype()), - mkldnn::memory::format_tag::acb); - } else { - return mkldnn::memory::desc(mkldnn::memory::dims{bigDim, shape[ndim - 2], shape[ndim - 1]}, - get_mkldnn_type(tensor.dtype()), - mkldnn::memory::format_tag::any); - } - }; - - mkldnn::memory::desc data_md = GetMemoryDesc(inputs[0], param.transpose_a); - mkldnn::memory::desc weights_md = GetMemoryDesc(inputs[1], param.transpose_b); - mkldnn::memory::desc out_md({bigDim, data_md.dims()[1], weights_md.dims()[2]}, - get_mkldnn_type(outputs[0].dtype()), - mkldnn::memory::format_tag::any); - mkldnn::matmul::desc fwd_desc(data_md, weights_md, out_md); - fwd_pd = std::make_shared(fwd_desc, mxnet::CpuEngine::Get()->get_engine()); - fwd = std::make_shared(*fwd_pd); -} - -void MKLDNNBatchDotFwd::Execute(const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - auto engine = mxnet::CpuEngine::Get()->get_engine(); - auto data = - mkldnn::memory(fwd_pd->src_desc(), engine, reinterpret_cast(inputs[0].data().dptr_)); - auto weights = mkldnn::memory( - fwd_pd->weights_desc(), engine, reinterpret_cast(inputs[1].data().dptr_)); - mkldnn_output_t out_mem = CreateMKLDNNMem(outputs[0], fwd_pd->dst_desc(), req[0], &inputs[0]); - - mkldnn_args_map_t args = { - {MKLDNN_ARG_SRC, data}, - {MKLDNN_ARG_WEIGHTS, weights}, - {MKLDNN_ARG_DST, *out_mem.second}, - }; - - MKLDNNStream::Get()->RegisterPrimArgs(*fwd, args); - CommitOutput(outputs[0], out_mem); - MKLDNNStream::Get()->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_ONEDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc deleted file mode 100644 index 69dad1dd0fe4..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_concat.cc +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file mkldnn_concat.cc - * \brief - * \author - */ - -#if MXNET_USE_ONEDNN == 1 -#include "mkldnn_concat-inl.h" - -namespace mxnet { -namespace op { - -static inline bool IsUsingPadding(const mkldnn::memory::desc& dst_md) { - // make sure a blocked format is used (at least one dimension is blocked) - bool is_blocked_format = - dst_md.data.format_kind == mkldnn_blocked && dst_md.data.format_desc.blocking.inner_nblks > 0; - return is_blocked_format && - !std::equal( - dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims, dst_md.data.padded_dims); -} - -MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector& data_md) - : fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) { - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKL-DNN operators - // still reuse memory from memory planning, so here we need to select a - // format that has the expected memory size requirements (a plain format) - - // When fwd_pd uses padding, impose a plain format - const auto& dst_md = fwd_pd.dst_desc(); - if (IsUsingPadding(dst_md)) { - auto plain_dst_tag = - static_cast(GetDefaultFormat(dst_md.data.ndims)); - auto plain_dst_md = mkldnn::memory::desc(dst_md.dims(), dst_md.data_type(), plain_dst_tag); - fwd_pd = mkldnn::concat::primitive_desc( - plain_dst_md, concat_dim, data_md, CpuEngine::Get()->get_engine()); - } - fwd_ = std::make_shared(fwd_pd); -} - -void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); - const ConcatParam& param = nnvm::get(attrs.parsed); - const int num_in_data = param.num_args; - const int concat_dim = param.dim; - std::vector data_md; - std::vector data_mem; - data_md.reserve(num_in_data); - data_mem.reserve(num_in_data); - for (int i = 0; i < num_in_data; i++) { - const mkldnn::memory* tmp_mem = in_data[i].GetMKLDNNData(); - mkldnn::memory::desc tmp_md = tmp_mem->get_desc(); - data_md.push_back(tmp_md); - data_mem.push_back(tmp_mem); - } - MKLDNNConcatFwd& fwd = GetConcatForward(concat_dim, in_data, data_md); - mxnet::mkldnn_output_t out_mem = - CreateMKLDNNMem(out_data[concat_enum::kOut], fwd.fwd_pd.dst_desc(), req[concat_enum::kOut]); - std::unordered_map net_args; - net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); - for (int i = 0; i < num_in_data; i++) { - net_args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, *data_mem[i]}); - } - MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); - CommitOutput(out_data[concat_enum::kOut], out_mem); - MKLDNNStream::Get()->Submit(); -} - -void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); - const ConcatParam& param = nnvm::get(attrs.parsed); - const int num_in_data = param.num_args; - const int axis = param.dim; - const auto gradz_mem = inputs[0].GetMKLDNNData(); - /* init the offset */ - mkldnn::memory::dims offsets(outputs[0].shape().ndim()); - for (auto& v : offsets) { - v = 0; - } - - for (int i = 0; i < num_in_data; i++) { - mkldnn::memory::dims diff_src_tz(outputs[i].shape().begin(), outputs[i].shape().end()); - auto diff_src_md = outputs[i].GetMKLDNNData()->get_desc(); - auto gradi_mem = CreateMKLDNNMem(outputs[i], diff_src_md, req[i]); - - auto from_md = 
gradz_mem->get_desc().submemory_desc(diff_src_tz, offsets); - auto from_mem = - new mkldnn::memory(from_md, gradz_mem->get_engine(), gradz_mem->get_data_handle()); - offsets[axis] += diff_src_tz[axis]; - - std::unordered_map net_args( - {{MKLDNN_ARG_FROM, *gradz_mem}, {MKLDNN_ARG_TO, *gradi_mem.second}}); - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*from_mem, *gradi_mem.second), net_args); - CommitOutput(outputs[i], gradi_mem); - } - - MKLDNNStream::Get()->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_ONEDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h deleted file mode 100644 index 4197a01d3574..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_convolution-inl.h - * \brief - */ - -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_ - -#if MXNET_USE_ONEDNN == 1 - -#include -#include - -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" - -#include "../convolution-inl.h" - -namespace mxnet { -namespace op { - -struct MKLDNNConvParam : public dmlc::Parameter { - bool with_bn; - bool with_act; - bool with_sum; - bool with_postsum_act; - bool quantized; - bool dedup_sum; - - dmlc::optional min_calib_range; // min float value calculated from calibration dataset - dmlc::optional max_calib_range; // max float value calculated from calibration dataset - - DMLC_DECLARE_PARAMETER(MKLDNNConvParam) { - DMLC_DECLARE_FIELD(with_bn).set_default(false).describe("Add post batchnorm."); - DMLC_DECLARE_FIELD(with_act).set_default(false).describe("Add post activation"); - DMLC_DECLARE_FIELD(with_sum).set_default(false).describe("Add post sum"); - DMLC_DECLARE_FIELD(with_postsum_act) - .set_default(false) - .describe("Add post activation after sum"); - DMLC_DECLARE_FIELD(quantized).set_default(false).describe("enable quantization"); - DMLC_DECLARE_FIELD(dedup_sum).set_default(false).describe("deduplicated sum input"); - DMLC_DECLARE_FIELD(min_calib_range) - .set_default(dmlc::optional()) - .describe( - "The minimum scalar value in the form of float32 obtained " - "through calibration. If present, it will be used to by " - "quantized convolution op to calculate primitive scale"); - DMLC_DECLARE_FIELD(max_calib_range) - .set_default(dmlc::optional()) - .describe( - "The maximum scalar value in the form of float32 obtained " - "through calibration. 
If present, it will be used to by " - "quantized convolution op to calculate primitive scale"); - } -}; - -struct MKLDNNConvFullParam { - ConvolutionParam conv_param; - MKLDNNConvParam mkldnn_param; - float sum_scale = 1.f; - std::vector requantize_scales; - MKLDNNPostEltwiseParam act_param; - MKLDNNPostEltwiseParam postsum_act_param; -}; - -std::shared_ptr GetConvFwdImpl( - const ConvolutionParam& param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const NDArray& output); - -class MKLDNNConvForward { - public: - MKLDNNConvForward(const MKLDNNConvFullParam& param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const NDArray& output); - - const mkldnn::convolution_forward& GetFwd() const { - return *fwd_; - } - - const mkldnn::convolution_forward::primitive_desc& GetPd() const { - return *pd_; - } - - private: - std::shared_ptr fwd_; - std::shared_ptr pd_; -}; - -typedef ParamOpSign MKLDNNConvSignature; - -MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const NDArray& output); - -void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param, - const OpContext& ctx, - MKLDNNConvForward* fwd, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); - -void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); - -class MKLDNNConvBackward { - public: - MKLDNNConvBackward(const MKLDNNConvFullParam& param, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const NDArray& output); - - const mkldnn::convolution_backward_data& GetBwdData() const { - return *bwd_data_; - } - - const mkldnn::convolution_backward_weights& GetBwdWeights() const { - return *bwd_weight_; - } - - const mkldnn::convolution_backward_data::primitive_desc& GetDataPd() const { - return *bwd_data_pd_; - } - - const mkldnn::convolution_backward_weights::primitive_desc& GetWeightsPd() const { - return *bwd_weight_pd_; - } - - private: - std::shared_ptr bwd_data_pd_; - std::shared_ptr bwd_weight_pd_; - std::shared_ptr bwd_data_; - std::shared_ptr bwd_weight_; -}; - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h b/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h deleted file mode 100644 index f6c53e0f8046..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_fully_connected-inl.h - * \brief Common functions used by MKLDNN (Quantized) FullyConnected operator - * \author Ciyong Chen - */ - -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_FULLY_CONNECTED_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_FULLY_CONNECTED_INL_H_ - -#if MXNET_USE_ONEDNN == 1 - -#include -#include - -#include "./mkldnn_base-inl.h" - -#include "../fully_connected-inl.h" - -namespace mxnet { -namespace op { - -struct MKLDNNFCParam : public dmlc::Parameter { - bool quantized; - bool enable_float_output; - bool with_eltwise; - dmlc::optional min_calib_range; // min float value calculated from calibration dataset - dmlc::optional max_calib_range; // max float value calculated from calibration dataset - dmlc::optional channel_wise_quantize; - - DMLC_DECLARE_PARAMETER(MKLDNNFCParam) { - DMLC_DECLARE_FIELD(quantized).set_default(false).describe( - "Whether it's a quantized FullyConnected operator"); - DMLC_DECLARE_FIELD(enable_float_output) - .set_default(false) - .describe("Whether to enable float32 output"); - DMLC_DECLARE_FIELD(with_eltwise) - .set_default(false) - .describe("Whether there's a post with_eltwise after FullyConnected operator"); - DMLC_DECLARE_FIELD(min_calib_range) - .set_default(dmlc::optional()) - .describe( - "The minimum scalar value in the form of float32 obtained " - "through calibration. If present, it will be used to by " - "quantized fullyconnected op to calculate primitive scale"); - DMLC_DECLARE_FIELD(max_calib_range) - .set_default(dmlc::optional()) - .describe( - "The maximum scalar value in the form of float32 obtained " - "through calibration. If present, it will be used to by " - "quantized fullyconnected op to calculate primitive scale"); - DMLC_DECLARE_FIELD(channel_wise_quantize) - .set_default(dmlc::optional()) - .describe("Whether support channel-wise-quantize for weight."); - } -}; - -struct MKLDNNFCFullParam { - FullyConnectedParam default_param; - MKLDNNFCParam mkldnn_param; - MKLDNNPostEltwiseParam eltwise_param; - std::vector output_scales = {0.0f}; -}; - -mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(const MKLDNNFCFullParam& full_param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const mkldnn::memory::desc& out_md); - -class MKLDNNFullyConnectedForward { - public: - mkldnn::inner_product_forward::primitive_desc fwd_pd; - - MKLDNNFullyConnectedForward(const MKLDNNFCFullParam& full_param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const mkldnn::memory::desc& out_md) - : fwd_pd(GetFCFwdImpl(full_param, is_train, data, weight, bias, out_md)) { - fwd_ = std::make_shared(fwd_pd); - } - - const mkldnn::inner_product_forward& GetFwd() const { - return *fwd_; - } - - private: - std::shared_ptr fwd_; -}; - -typedef ParamOpSign MKLDNNFullyconSignature; - -MKLDNNFullyConnectedForward& GetFCFwd(const FullyConnectedParam& param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const mkldnn::memory::desc& out_md); - -void MKLDNNFCFlattenData(const FullyConnectedParam& param, - const NDArray& out_data, - NDArray* in_data, - mkldnn::memory::desc* out_md); - -void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); - -void MKLDNNFCForwardFullFeature(const 
MKLDNNFCFullParam& param, - const OpContext& ctx, - MKLDNNFullyConnectedForward* fwd, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_FULLY_CONNECTED_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc deleted file mode 100644 index 023a8f53dc8f..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_fully_connected.cc - * \brief MKLDNN FullyConnected operator - * \author Da Zheng, Ciyong Chen - */ - -#if MXNET_USE_ONEDNN == 1 -#include "mkldnn_fully_connected-inl.h" - -namespace mxnet { -namespace op { - -DMLC_REGISTER_PARAMETER(MKLDNNFCParam); - -mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(const MKLDNNFCFullParam& full_param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const mkldnn::memory::desc& out_md) { - auto engine = CpuEngine::Get()->get_engine(); - auto data_md = GetMemDesc(data); - auto weight_md = full_param.mkldnn_param.quantized - ? GetFCWeightDesc(weight, data.shape()[0], mshadow::kInt8) - : GetFCWeightDesc(weight, data.shape()[0]); - auto propagation = - is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - - mkldnn::primitive_attr attr; - mkldnn::post_ops ops; - if (full_param.mkldnn_param.with_eltwise) { - ops.append_eltwise(full_param.eltwise_param.scale, - full_param.eltwise_param.alg, - full_param.eltwise_param.alpha, - full_param.eltwise_param.beta); - } - attr.set_post_ops(ops); - - if (full_param.mkldnn_param.quantized && full_param.output_scales.size()) { - int mask = (full_param.output_scales.size() == 1) ? 0 : (1 << 1); - attr.set_output_scales(mask, full_param.output_scales); - } - - auto GetFCFwdPd = [&full_param, &attr, &engine](const mkldnn::inner_product_forward::desc& desc) { - try { - return mkldnn::inner_product_forward::primitive_desc(desc, attr, engine); - } catch (mkldnn::error& e) { - if (e.status == mkldnn_unimplemented && full_param.mkldnn_param.quantized) { - LOG(ERROR) << "AVX512-BW support or MKLDNN v0.18 is required for INT8 fully_connected."; - } else { - LOG(ERROR) << e.message; - } - throw; - } - }; - - if (bias) { - if ((*bias).shape().ndim() != 1) - LOG(FATAL) << "Unexpected shape for bias " << (*bias).shape(); - auto bias_md = - full_param.mkldnn_param.quantized ? 
GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias); - mkldnn::inner_product_forward::desc desc(propagation, data_md, weight_md, bias_md, out_md); - return GetFCFwdPd(desc); - } else { - mkldnn::inner_product_forward::desc desc(propagation, data_md, weight_md, out_md); - return GetFCFwdPd(desc); - } -} - -inline static mkldnn::inner_product_backward_data::primitive_desc GetFCBwdData( - const NDArray& data, - const NDArray& weight, - const NDArray& output, - mkldnn::inner_product_forward::primitive_desc fwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetFCWeightDesc(weight, data.shape()[0]); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md); - return mkldnn::inner_product_backward_data::primitive_desc(desc, engine, fwd_pd); -} - -inline static mkldnn::inner_product_backward_weights::primitive_desc GetFCBwdWeights( - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const NDArray& output, - mkldnn::inner_product_forward::primitive_desc fwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetFCWeightDesc(weight, data.shape()[0]); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - if (bias) { - auto bias_md = GetMemDesc(*bias); - mkldnn::inner_product_backward_weights::desc desc(data_md, weight_md, bias_md, out_md); - return mkldnn::inner_product_backward_weights::primitive_desc(desc, engine, fwd_pd); - } else { - mkldnn::inner_product_backward_weights::desc desc(data_md, weight_md, out_md); - return mkldnn::inner_product_backward_weights::primitive_desc(desc, engine, fwd_pd); - } -} - -MKLDNNFullyConnectedForward& GetFCFwd(const FullyConnectedParam& param, - const bool is_train, - const NDArray& data, - const NDArray& weight, - const NDArray* bias, - const mkldnn::memory::desc& out_md) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std:: - unordered_map - fcFwds; -#else - static MX_THREAD_LOCAL - std::unordered_map - fcFwds; -#endif - MKLDNNFullyconSignature key(param); - key.AddSign(is_train); - key.AddSign(data); - key.AddSign(weight); - if (bias) - key.AddSign(*bias); - - auto it = fcFwds.find(key); - if (it == fcFwds.end()) { - MKLDNNFCFullParam full_param; - full_param.default_param = param; - full_param.mkldnn_param.Init(std::unordered_map()); - MKLDNNFullyConnectedForward fcFwd(full_param, is_train, data, weight, bias, out_md); - it = AddToCache(&fcFwds, key, fcFwd); - } - return it->second; -} - -void MKLDNNFCFlattenData(const FullyConnectedParam& param, - const NDArray& out_data, - NDArray* in_data, - mkldnn::memory::desc* out_md) { - const mxnet::TShape ishape = in_data->shape(); - const mxnet::TShape oshape = out_data.shape(); - if (ishape.ndim() != 2) { - if (!param.flatten) { - *in_data = in_data->MKLDNNDataReshape( - Shape2(ishape.ProdShape(0, ishape.ndim() - 1), ishape[ishape.ndim() - 1])); - mkldnn::memory::dims out_dims{static_cast(oshape.ProdShape(0, oshape.ndim() - 1)), - static_cast(oshape[ishape.ndim() - 1])}; - *out_md = mkldnn::memory::desc( - out_dims, get_mkldnn_type(out_data.dtype()), mkldnn::memory::format_tag::any); - } else { - *in_data = in_data->MKLDNNDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); - mkldnn::memory::dims out_dims{static_cast(oshape[0]), - static_cast(oshape.ProdShape(1, oshape.ndim()))}; - *out_md = mkldnn::memory::desc( - out_dims, get_mkldnn_type(out_data.dtype()), mkldnn::memory::format_tag::any); - } - } -} - -void 
MKLDNNFCForwardFullFeature(const MKLDNNFCFullParam& full_param, - const OpContext& ctx, - MKLDNNFullyConnectedForward* fwd, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); - NDArray weight = in_data[fullc::kWeight]; - NDArray data = in_data[fullc::kData]; - - auto data_mem = data.GetMKLDNNDataReorder(fwd->fwd_pd.src_desc()); - const mkldnn::memory* weight_mem; - if (ctx.is_train) { - if (weight.IsMKLDNNData()) { - weight.Reorder2DefaultAsync(); - } - weight_mem = GetWeights(weight, fwd->fwd_pd.weights_desc(), 1); - } else { - weight_mem = weight.GetMKLDNNData(); - if (weight_mem->get_desc() != fwd->fwd_pd.weights_desc()) { - weight.MKLDNNDataReorderAsync(fwd->fwd_pd.weights_desc()); - weight_mem = GetWeights(weight, fwd->fwd_pd.weights_desc(), 1); - } - } - auto out_mem = - CreateMKLDNNMem(out_data[fullc::kOut], fwd->fwd_pd.dst_desc(), req[fullc::kOut], &data); - - mkldnn_args_map_t args = { - {MKLDNN_ARG_SRC, *data_mem}, - {MKLDNN_ARG_WEIGHTS, *weight_mem}, - {MKLDNN_ARG_DST, *out_mem.second}, - }; - if (!full_param.default_param.no_bias) { - auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(fwd->fwd_pd.bias_desc()); - args[MKLDNN_ARG_BIAS] = *bias_mem; - } - MKLDNNStream::Get()->RegisterPrimArgs(fwd->GetFwd(), args); - CommitOutput(out_data[fullc::kOut], out_mem); - MKLDNNStream::Get()->Submit(); -} - -void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - MKLDNNFCFullParam full_param; - full_param.default_param = nnvm::get(attrs.parsed); - full_param.mkldnn_param.Init(std::unordered_map()); - - NDArray data = in_data[fullc::kData]; - mkldnn::memory::desc out_md = GetMemDesc(out_data[fullc::kOut]); - MKLDNNFCFlattenData(full_param.default_param, out_data[fullc::kOut], &data, &out_md); - auto& fwd = GetFCFwd(full_param.default_param, - ctx.is_train, - data, - in_data[fullc::kWeight], - full_param.default_param.no_bias ? 
nullptr : &in_data[fullc::kBias], - out_md); - std::vector new_inputs; - if (full_param.default_param.no_bias) - new_inputs = {data, in_data[fullc::kWeight]}; - else - new_inputs = {data, in_data[fullc::kWeight], in_data[fullc::kBias]}; - MKLDNNFCForwardFullFeature(full_param, ctx, &fwd, new_inputs, req, out_data); -} - -void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); - const std::vector& in_grad = outputs; - MKLDNNFCFullParam full_param; - full_param.default_param = nnvm::get(attrs.parsed); - full_param.mkldnn_param.Init(std::unordered_map()); - const FullyConnectedParam& param = full_param.default_param; - const mxnet::TShape& ishape = inputs[fullc::kData + 1].shape(); - const mxnet::TShape& oshape = inputs[fullc::kOut].shape(); - - NDArray weight = inputs[fullc::kWeight + 1]; - NDArray data = inputs[fullc::kData + 1]; - if (data.shape().ndim() != 2 && !param.flatten) - data = data.MKLDNNDataReshape( - Shape2(ishape.ProdShape(0, ishape.ndim() - 1), ishape[ishape.ndim() - 1])); - else if (data.shape().ndim() != 2) - data = data.MKLDNNDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); - NDArray out_grad = inputs[fullc::kOut]; - if (out_grad.shape().ndim() != 2 && !param.flatten) - out_grad = out_grad.MKLDNNDataReshape( - Shape2(oshape.ProdShape(0, oshape.ndim() - 1), oshape[oshape.ndim() - 1])); - else if (out_grad.shape().ndim() != 2) - out_grad = out_grad.MKLDNNDataReshape(Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim()))); - - mkldnn::inner_product_forward::primitive_desc fwd_pd = - GetFCFwdImpl(full_param, - ctx.is_train, - data, - weight, - param.no_bias ? nullptr : &in_grad[fullc::kBias], - GetMemDesc(out_grad)); - - CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; - if (req[fullc::kWeight]) { - mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd = GetFCBwdWeights( - data, weight, param.no_bias ? 
nullptr : &in_grad[fullc::kBias], out_grad, fwd_pd); - auto out_grad_mem = out_grad.GetMKLDNNDataReorder(ipBwdWeights_pd.diff_dst_desc()); - auto data_mem = data.GetMKLDNNDataReorder(ipBwdWeights_pd.src_desc()); - auto in_grad_weight = CreateMKLDNNWeightGrad( - in_grad[fullc::kWeight], ipBwdWeights_pd.diff_weights_desc(), req[fullc::kWeight]); - mkldnn_args_map_t args = { - {MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_SRC, *data_mem}, - {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}, - }; - - mkldnn_output_t in_grad_bias; - if (!param.no_bias) { - in_grad_bias = CreateMKLDNNMem( - in_grad[fullc::kBias], ipBwdWeights_pd.diff_bias_desc(), req[fullc::kBias]); - args[MKLDNN_ARG_DIFF_BIAS] = *in_grad_bias.second; - } - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::inner_product_backward_weights(ipBwdWeights_pd), - args); - CommitOutput(in_grad[fullc::kWeight], in_grad_weight); - if (!param.no_bias) { - CommitOutput(in_grad[fullc::kBias], in_grad_bias); - } - } - if (req[fullc::kData]) { - mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = - GetFCBwdData(data, weight, out_grad, fwd_pd); - auto out_grad_mem = out_grad.GetMKLDNNDataReorder(ipBwdData_pd.diff_dst_desc()); - auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_desc()); - auto in_grad_mem = - CreateMKLDNNMem(in_grad[fullc::kData], ipBwdData_pd.diff_src_desc(), req[fullc::kData]); - mkldnn_args_map_t args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_WEIGHTS, *weight_mem}, - {MKLDNN_ARG_DIFF_SRC, *in_grad_mem.second}}; - - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::inner_product_backward_data(ipBwdData_pd), args); - CommitOutput(in_grad[fullc::kData], in_grad_mem); - } - MKLDNNStream::Get()->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_ONEDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_log_softmax.cc b/src/operator/nn/mkldnn/mkldnn_log_softmax.cc deleted file mode 100644 index 5e3f7a5e2af1..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_log_softmax.cc +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_log_softmax.cc - * \brief Implementation of log_softmax function with MKLDNN support - */ - -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" - -#include "../softmax-inl.h" - -#if MXNET_USE_ONEDNN == 1 -namespace mxnet { -namespace op { - -static mkldnn::logsoftmax_forward::primitive_desc -GetLogSoftmaxFwdPd(bool is_train, const int axis, const mkldnn::memory& input_mem) { - mkldnn::memory::desc data_md = input_mem.get_desc(); - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto prop = is_train ? 
mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - auto desc = mkldnn::logsoftmax_forward::desc(prop, data_md, axis); - return mkldnn::logsoftmax_forward::primitive_desc(desc, cpu_engine); -} - -static mkldnn::logsoftmax_backward::primitive_desc GetLogSoftmaxBwdPd( - const mkldnn::memory& diff_mem, - const mkldnn::memory& data_mem, - const int axis, - const mkldnn::logsoftmax_forward::primitive_desc& hint_fwd_pd) { - mkldnn::memory::desc diff_md = diff_mem.get_desc(); - mkldnn::memory::desc data_md = data_mem.get_desc(); - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto desc = mkldnn::logsoftmax_backward::desc(diff_md, data_md, axis); - return mkldnn::logsoftmax_backward::primitive_desc(desc, cpu_engine, hint_fwd_pd); -} - -bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param, - const NDArray& data, - const NDArray& output) { - const int ndim = data.shape().ndim(); - const int in_dtype = data.dtype(); - const int out_dtype = output.dtype(); - const int axis = CheckAxis(param.axis, ndim); - // MKLDNN does not support temperature argument in their log_softmax function - // now. Need update this once they start to support it. - // Currently, MKLDNN shows bad performance when log_softmax is not performed on the last dimension - if (param.temperature.has_value() || in_dtype != mshadow::kFloat32 || in_dtype != out_dtype || - axis != (ndim - 1)) { - return false; - } - - // only supports ndim = 1, 2, 3, 4 for now - return (ndim >= 1 && ndim <= 4); -} - -class MKLDNNLogSoftmaxFwd { - public: - mkldnn::logsoftmax_forward::primitive_desc pd; - - MKLDNNLogSoftmaxFwd(const bool is_train, const int axis, const mkldnn::memory& input) - : pd(GetLogSoftmaxFwdPd(is_train, axis, input)) { - fwd_ = std::make_shared(pd); - } - - const mkldnn::logsoftmax_forward& GetFwd() const { - return *fwd_; - } - - private: - std::shared_ptr fwd_; -}; - -typedef ParamOpSign MKLDNNSoftmaxSignature; - -static MKLDNNLogSoftmaxFwd& GetLogSoftmaxFwd(const SoftmaxParam& param, - const int real_axis, - const bool is_train, - const NDArray& data, - const NDArray& output) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map fwds; -#else - static MX_THREAD_LOCAL std::unordered_map - fwds; -#endif - - MKLDNNSoftmaxSignature key(param); - key.AddSign(real_axis); - key.AddSign(is_train); - key.AddSign(data); - key.AddSign(output); - - auto it = fwds.find(key); - if (it == fwds.end()) { - MKLDNNLogSoftmaxFwd fwd(is_train, real_axis, *(data.GetMKLDNNData())); - it = AddToCache(&fwds, key, fwd); - } - return it->second; -} - -void MKLDNNLogSoftmaxForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data) { - if (req == kNullOp) - return; - // same as the FCompute path, log_softmax only supports kWriteTo and kWriteInplace for now. 
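For reference, the forward path that the cached log_softmax primitive wraps comes down to the few oneDNN calls sketched below, again against the v2.x-era API used by the deleted sources; the 4x10 shape, buffer contents, and axis value are assumptions chosen so the last-axis restriction checked above is satisfied.

#include <vector>
#include "oneapi/dnnl/dnnl.hpp"

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  // 4x10 fp32 input; log_softmax is taken over the last axis, the only case
  // the support check above accepts.
  memory::desc md({4, 10}, memory::data_type::f32, memory::format_tag::nc);
  std::vector<float> buf(4 * 10, 1.0f);
  memory src(md, eng, buf.data());
  memory dst(md, eng);

  const int axis = 1;
  auto desc = logsoftmax_forward::desc(prop_kind::forward_inference, md, axis);
  auto pd = logsoftmax_forward::primitive_desc(desc, eng);

  logsoftmax_forward(pd).execute(strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}});
  strm.wait();
  return 0;
}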
- CHECK_NE(req, kAddTo); - - const SoftmaxParam& param = nnvm::get(attrs.parsed); - int axis = CheckAxis(param.axis, in_data.shape().ndim()); - auto fwd = GetLogSoftmaxFwd(param, axis, ctx.is_train, in_data, out_data); - - auto in_mem = in_data.GetMKLDNNData(); - auto out_mem = out_data.GetMKLDNNData(fwd.pd.dst_desc()); - MKLDNNStream* stream = MKLDNNStream::Get(); - stream->RegisterPrimArgs(fwd.GetFwd(), {{MKLDNN_ARG_SRC, *in_mem}, {MKLDNN_ARG_DST, *out_mem}}); - stream->Submit(); -} - -class MKLDNNLogSoftmaxBwd { - public: - mkldnn::logsoftmax_backward::primitive_desc pd; - - MKLDNNLogSoftmaxBwd(const mkldnn::memory& diff_mem, - const mkldnn::memory& data_mem, - const int axis, - const mkldnn::logsoftmax_forward::primitive_desc& hint_fwd_pd) - : pd(GetLogSoftmaxBwdPd(diff_mem, data_mem, axis, hint_fwd_pd)) { - bwd_ = std::make_shared(pd); - } - - const mkldnn::logsoftmax_backward& GetBwd() const { - return *bwd_; - } - - private: - std::shared_ptr bwd_; -}; - -static MKLDNNLogSoftmaxBwd& GetLogSoftmaxBwd(const SoftmaxParam& param, - const int real_axis, - const std::vector& data, - const std::vector& output) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map bwds; -#else - static MX_THREAD_LOCAL std::unordered_map - bwds; -#endif - - MKLDNNSoftmaxSignature key(param); - key.AddSign(real_axis); - key.AddSign(data); - key.AddSign(output); - - auto it = bwds.find(key); - if (it == bwds.end()) { - auto diff_mem = data[0].GetMKLDNNData(); - auto data_mem = data[1].GetMKLDNNData(); - auto fwd_pd = GetLogSoftmaxFwdPd(true, real_axis, *data_mem); - MKLDNNLogSoftmaxBwd bwd(*diff_mem, *data_mem, real_axis, fwd_pd); - it = AddToCache(&bwds, key, bwd); - } - return it->second; -} - -void MKLDNNLogSoftmaxBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - if (req[0] == kNullOp) - return; - CHECK_EQ(in_data.size(), 2U); - const SoftmaxParam& param = nnvm::get(attrs.parsed); - int axis = CheckAxis(param.axis, in_data[1].shape().ndim()); - auto diff_mem = in_data[0].GetMKLDNNData(); - auto data_mem = in_data[1].GetMKLDNNData(); - auto bwd = GetLogSoftmaxBwd(param, axis, in_data, out_data); - - auto out_mem = CreateMKLDNNMem(out_data[0], bwd.pd.diff_src_desc(), req[0]); - MKLDNNStream* stream = MKLDNNStream::Get(); - mkldnn_args_map_t args = {{MKLDNN_ARG_DST, *data_mem}, - {MKLDNN_ARG_DIFF_DST, *diff_mem}, - {MKLDNN_ARG_DIFF_SRC, *out_mem.second}}; - - stream->RegisterPrimArgs(bwd.GetBwd(), args); - CommitOutput(out_data[0], out_mem); - stream->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h deleted file mode 100644 index 4626d6cd4923..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_lrn-inl.h - * \brief - * \Author: Patric Zhao, patric.zhao@intel.com - */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ - -#if MXNET_USE_ONEDNN == 1 -#include - -#include -#include - -#include "./mkldnn_base-inl.h" - -#include "../lrn-inl.h" - -namespace mxnet { -namespace op { - -inline mkldnn::algorithm GetMKLDNNLRNAlgo(const LRNParam& param) { - // TODO(Patric): lrn_within_channel will cause core dump in MKLDNN backward - // Need to confirm with MKLDNN team and fix later - return mkldnn::algorithm::lrn_across_channels; -} - -inline mkldnn::lrn_forward::primitive_desc GetLRNFwdDesc(const LRNParam& param, - const bool is_train, - const mkldnn::memory::desc& src_md) { - mkldnn::engine& engine = CpuEngine::Get()->get_engine(); - const mkldnn::algorithm alg = GetMKLDNNLRNAlgo(param); - const float alpha = param.alpha; - const float beta = param.beta; - const int nsize = param.nsize; - const float k = param.knorm; - auto kind = mkldnn::prop_kind::forward_training; - if (is_train) { - kind = mkldnn::prop_kind::forward_training; - } else { - kind = mkldnn::prop_kind::forward_scoring; - } - mkldnn::lrn_forward::desc fwd_desc(kind, alg, src_md, nsize, alpha, beta, k); - return mkldnn::lrn_forward::primitive_desc(fwd_desc, engine); -} - -inline mkldnn::lrn_backward::primitive_desc GetLRNBwdDesc( - const LRNParam& param, - const mkldnn::memory::desc& data_in_md, - const mkldnn::memory::desc& diff_md, - const mkldnn::lrn_forward::primitive_desc& lrnFwd_desc) { - mkldnn::engine& engine = CpuEngine::Get()->get_engine(); - const mkldnn::algorithm alg = GetMKLDNNLRNAlgo(param); - const float alpha = param.alpha; - const float beta = param.beta; - const int nsize = param.nsize; - const float k = param.knorm; - - mkldnn::lrn_backward::desc lrnBwd_desc(alg, data_in_md, diff_md, nsize, alpha, beta, k); - return mkldnn::lrn_backward::primitive_desc(lrnBwd_desc, engine, lrnFwd_desc); -} - -typedef ParamOpSign MKLDNNLRNSignature; - -// LRN Forward Class -class MKLDNNLRNFwd { - public: - MKLDNNLRNFwd(const LRNParam& param, bool is_train, const NDArray& in_data) { - _Init(param, is_train, in_data); - } - - ~MKLDNNLRNFwd() {} - - void Execute(const OpContext& ctx, - const NDArray& in_data, - const OpReqType req, - const NDArray& out_data); - - mkldnn::lrn_forward& GetFwd(); - const mkldnn::memory* GetWs(); - mkldnn::lrn_forward::primitive_desc& GetFwdPd(); - - private: - std::shared_ptr fwd; - mkldnn::lrn_forward::primitive_desc fwd_pd; - - private: - void _Init(const LRNParam& param, bool is_train, const NDArray& in_data); -}; // End of LRN Forword Class - -void MKLDNNLRNFwd::_Init(const LRNParam& param, bool is_train, const NDArray& in_data) { - mkldnn::memory::desc in_data_md = in_data.GetMKLDNNData()->get_desc(); - this->fwd_pd = GetLRNFwdDesc(param, is_train, in_data_md); - - this->fwd = std::shared_ptr(new mkldnn::lrn_forward(this->fwd_pd)); -} - -void MKLDNNLRNFwd::Execute(const OpContext& ctx, - const NDArray& in_data, - const OpReqType req, - const NDArray& out_data) { - auto output_mem_t = 
CreateMKLDNNMem(out_data, (this->fwd_pd).dst_desc(), req); - - mkldnn_args_map_t args = { - {MKLDNN_ARG_SRC, *in_data.GetMKLDNNData()}, - {MKLDNN_ARG_DST, *output_mem_t.second}, - }; - std::shared_ptr workspace; - if (ctx.is_train) { - auto engine = CpuEngine::Get()->get_engine(); - workspace = std::make_shared((this->fwd_pd).workspace_desc(), engine); - args[MKLDNN_ARG_WORKSPACE] = *(workspace); - } - MKLDNNStream::Get()->RegisterPrimArgs(*(this->fwd), args); - CommitOutput(out_data, output_mem_t); - MKLDNNStream::Get()->Submit(); -} - -mkldnn::lrn_forward& MKLDNNLRNFwd::GetFwd() { - return *this->fwd; -} -mkldnn::lrn_forward::primitive_desc& MKLDNNLRNFwd::GetFwdPd() { - return this->fwd_pd; -} - -// End of LRN Class and its functions - -static MKLDNNLRNFwd& GetLRNFwd(const LRNParam& param, - const OpContext& ctx, - const NDArray& in_data) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map lrn_fwds; -#else - static MX_THREAD_LOCAL std::unordered_map lrn_fwds; -#endif - auto kind_ = - ctx.is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - - MKLDNNLRNSignature key(param); - key.AddSign(static_cast(kind_)); - key.AddSign(in_data); - - auto it = lrn_fwds.find(key); - if (it == lrn_fwds.end()) { - MKLDNNLRNFwd fwd(param, ctx.is_train, in_data); - it = AddToCache(&lrn_fwds, key, fwd); - } - return it->second; -} - -void MKLDNNLRNForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType req, - const NDArray& out_data) { - const LRNParam& param = nnvm::get(attrs.parsed); - auto in_buffer = in_data; - if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) - in_buffer = in_buffer.Reorder2Default(); - MKLDNNLRNFwd fwd = GetLRNFwd(param, ctx, in_buffer); - fwd.Execute(ctx, in_buffer, req, out_data); -} - -// LRN Backward Class -class MKLDNNLRNBwd { - std::shared_ptr bwd; - - public: - const mkldnn::lrn_forward::primitive_desc fwd_pd; - const mkldnn::lrn_backward::primitive_desc bwd_pd; - - ~MKLDNNLRNBwd() {} - - MKLDNNLRNBwd(const LRNParam& param, - const mkldnn::memory::desc in_data_md, - const mkldnn::memory::desc diff_md) - : fwd_pd(GetLRNFwdDesc(param, true, in_data_md)), - bwd_pd(GetLRNBwdDesc(param, in_data_md, diff_md, this->fwd_pd)) { - bwd = std::make_shared(bwd_pd); - } - - const mkldnn::lrn_backward& GetBwd() const { - return *bwd; - } - - void Execute(const NDArray& out_grad, - const NDArray& in_data, - const NDArray& in_grad, - const mkldnn_output_t& diff_src_mem) { - auto engine = CpuEngine::Get()->get_engine(); - auto workspace = std::make_shared((this->fwd_pd).workspace_desc(), engine); - mkldnn_args_map_t args = {{MKLDNN_ARG_SRC, *in_data.GetMKLDNNData()}, - {MKLDNN_ARG_DIFF_DST, *out_grad.GetMKLDNNData()}, - {MKLDNN_ARG_WORKSPACE, *workspace}, - {MKLDNN_ARG_DIFF_SRC, *diff_src_mem.second}}; - MKLDNNStream::Get()->RegisterPrimArgs(*(this->bwd), args); - CommitOutput(in_grad, diff_src_mem); - MKLDNNStream::Get()->Submit(); - } -}; // End of LRN Class - -static MKLDNNLRNBwd& GetLRNBwd(const LRNParam& param, - const NDArray& in_data, - const NDArray& in_grad, - const NDArray& out_grad) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map lrn_bwds; -#else - static MX_THREAD_LOCAL std::unordered_map lrn_bwds; -#endif - MKLDNNLRNSignature key(param); - key.AddSign(in_data); - key.AddSign(in_grad); - key.AddSign(out_grad); - - auto it = lrn_bwds.find(key); - if (it == lrn_bwds.end()) { - const mkldnn::memory::desc in_data_md = in_data.GetMKLDNNData()->get_desc(); - const 
mkldnn::memory::desc diff_md = out_grad.GetMKLDNNData()->get_desc(); - MKLDNNLRNBwd bwd(param, in_data_md, diff_md); - it = AddToCache(&lrn_bwds, key, bwd); - } - return it->second; -} - -void MKLDNNLRNBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (req[0] == kNullOp) { - return; - } - const LRNParam& param = nnvm::get(attrs.parsed); - const NDArray& out_grad = inputs[0]; - const NDArray& in_data = inputs[1]; - const NDArray& in_grad = outputs[0]; - // TODO(alex): (MXNET-846) figure out why in_grad output incorrect when in_data is nchw8c - const auto in_buffer = in_data.Reorder2Default(); - MKLDNNLRNBwd& bwd = GetLRNBwd(param, in_buffer, in_grad, out_grad); - mkldnn_output_t diff_src_mem = CreateMKLDNNMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]); - - bwd.Execute(out_grad, in_buffer, in_grad, diff_src_mem); -} -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__ diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h deleted file mode 100644 index d9d84e68050f..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_ops-inl.h - * \brief - * \author Da Zheng - */ - -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#if MXNET_USE_ONEDNN == 1 -#include - -namespace mxnet { -namespace op { - -/* For fully connected. */ -void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); -void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For convolution. 
*/ -void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); -void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For deconvolution */ -void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); -void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For activation */ -void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data); -void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -void MKLDNNLeakyReluForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data); -void MKLDNNLeakyReluBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For softmax */ -void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data); -void MKLDNNSoftmaxBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); - -/* For log_softmax */ -void MKLDNNLogSoftmaxForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data); -void MKLDNNLogSoftmaxBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); - -/* For softmax_output */ -void MKLDNNSoftmaxOutputForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); - -/* For sum */ -void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For copy */ -void MKLDNNCopy(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data); - -/* For concat */ -void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); -void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For batch dot */ -void MKLDNNBatchDotForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For layer normalization */ -void MKLDNNLayerNormForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); -void MKLDNNLayerNormBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const 
std::vector& req, - const std::vector& outputs); - -void MKLDNNSum(const mkldnn::memory& arr1, const mkldnn::memory& arr2, const mkldnn::memory& out); - -void MKLDNNTransposeForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& data, - const OpReqType& req, - const NDArray& output); - -void MKLDNNReshapeForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& input, - const OpReqType& req, - const NDArray& output); -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_pooling.cc b/src/operator/nn/mkldnn/mkldnn_pooling.cc deleted file mode 100644 index 194cb2d87254..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_pooling.cc +++ /dev/null @@ -1,405 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_pooling.cc - * \brief - * \author Tao Lv - */ - -#if MXNET_USE_ONEDNN == 1 - -#include "./mkldnn_pooling-inl.h" - -namespace mxnet { -namespace op { - -static inline mkldnn::memory::data_type get_data_type(const mkldnn::memory::desc& md) { - return static_cast(md.data_type()); -} - -void MKLDNNPoolingFwd::Init(const mxnet::NDArray& input, - const mxnet::NDArray& output, - const mkldnn::memory::dims& kernel, - const mkldnn::memory::dims& strides, - const mkldnn::memory::dims& pad_l, - const mkldnn::memory::dims& pad_r, - const bool is_train, - const mkldnn::algorithm alg_kind) { - const auto src_md = input.GetMKLDNNData()->get_desc(); - const auto dst_md = GetMemDesc(output); - const mkldnn::engine engine = CpuEngine::Get()->get_engine(); - if (alg_kind != mkldnn::algorithm::pooling_max && alg_kind != mkldnn::algorithm::pooling_avg && - alg_kind != mkldnn::algorithm::pooling_avg_include_padding && - alg_kind != mkldnn::algorithm::pooling_avg_exclude_padding) { - LOG(FATAL) << "MKLDNN Pooling: algorithm is not supported"; - } - - mkldnn::prop_kind prop = mkldnn::prop_kind::forward_scoring; - if (is_train && alg_kind != mkldnn::algorithm::pooling_avg) { - prop = mkldnn::prop_kind::forward_training; - } - if (is_train && prop == mkldnn::prop_kind::forward_scoring) { - LOG(INFO) << "MKLDNN Pooling: training with prop_kind is forward_scoring"; - } - - const auto fwd_desc = - mkldnn::pooling_forward::desc(prop, alg_kind, src_md, dst_md, strides, kernel, pad_l, pad_r); - this->fwd_pd_.reset(new mkldnn::pooling_forward::primitive_desc(fwd_desc, engine)); - this->fwd_.reset(new mkldnn::pooling_forward(*(this->fwd_pd_))); - - return; -} - -void MKLDNNPoolingFwd::Execute(const NDArray& in_data, - const OpReqType req, - const NDArray& out_data, - const NDArray* workspace) { - NDArray in_buffer = in_data; - if (in_data.IsView() && in_data.IsMKLDNNData()) - 
in_buffer = in_data.Reorder2Default(); - - auto input_mem = in_buffer.GetMKLDNNData(); - auto output_mem_t_ = CreateMKLDNNMem(out_data, this->fwd_pd_->dst_desc(), req); - - mkldnn_args_map_t args = { - {MKLDNN_ARG_SRC, *input_mem}, - {MKLDNN_ARG_DST, *(output_mem_t_.second)}, - }; - - if (this->with_workspace_) { - auto engine = CpuEngine::Get()->get_engine(); - - if (workspace == nullptr) { - LOG(FATAL) << "MKLDNN Pooling: incorrect workspace input"; - } - - auto ws = std::make_shared( - (*(this->fwd_pd_)).workspace_desc(), engine, workspace->GetMKLDNNData()->get_data_handle()); - args[MKLDNN_ARG_WORKSPACE] = *ws; - } - if (this->fwd_) { - MKLDNNStream::Get()->RegisterPrimArgs(*(this->fwd_), args); - CommitOutput(out_data, output_mem_t_); - MKLDNNStream::Get()->Submit(); - } else { - LOG(FATAL) << "MKLDNN Pooling: forward primitive is nullptr"; - } -} - -mkldnn::algorithm GetMKLDNNPoolAlgo(const PoolingParam& param) { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - return mkldnn::algorithm::pooling_max; - break; - case pool_enum::kAvgPooling: - if (param.count_include_pad.has_value() && !param.count_include_pad.value()) { - return mkldnn::algorithm::pooling_avg_exclude_padding; - } else { - return mkldnn::algorithm::pooling_avg_include_padding; - } - break; - default: - LOG(FATAL) << "MKLDNN Pooling: Unknown pooling method."; - return mkldnn::algorithm::pooling_max; - } -} - -void InitPoolingPrimitiveParams(const PoolingParam& param, - const mkldnn::memory::desc& data_md, - const mkldnn::memory::dims& new_kernel, - const mkldnn::memory::dims& new_strides, - const mkldnn::memory::dims& new_pad_l, - const mkldnn::memory::dims& new_pad_r) { - const int kernel_ndims = param.kernel.ndim(); - mkldnn::memory::dims& kernel = const_cast(new_kernel); - mkldnn::memory::dims& strides = const_cast(new_strides); - mkldnn::memory::dims& pad_l = const_cast(new_pad_l); - mkldnn::memory::dims& pad_r = const_cast(new_pad_r); - if (kernel_ndims == 1) { - CHECK_GE(param.pad.ndim(), 1); - CHECK_GE(param.stride.ndim(), 1); - kernel[0] = param.kernel[0]; - pad_l[0] = param.pad[0]; - pad_r[0] = param.pad[0]; - strides[0] = param.stride[0]; - - if (param.pooling_convention == pool_enum::kFull) { - pad_r[0] = - GetPaddingSizeFull(data_md.data.dims[2], pad_l[0], pad_r[0], kernel[0], strides[0]); - } - - if (param.global_pool) { - kernel[0] = data_md.data.dims[2]; - strides[0] = 1; - pad_l[0] = pad_r[0] = 0; - } - - CHECK_GT(kernel[0], 0) << "Filter dimensions cannot be zero."; - } else if (kernel_ndims == 2) { - CHECK_GE(param.pad.ndim(), 2); - CHECK_GE(param.stride.ndim(), 2); - kernel[0] = param.kernel[0]; - kernel[1] = param.kernel[1]; - pad_l[0] = param.pad[0]; - pad_l[1] = param.pad[1]; - pad_r[0] = param.pad[0]; - pad_r[1] = param.pad[1]; - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - - if (param.pooling_convention == pool_enum::kFull) { - pad_r[0] = - GetPaddingSizeFull(data_md.data.dims[2], pad_l[0], pad_r[0], kernel[0], strides[0]); - pad_r[1] = - GetPaddingSizeFull(data_md.data.dims[3], pad_l[1], pad_r[1], kernel[1], strides[1]); - } - - if (param.global_pool) { - kernel[0] = data_md.data.dims[2]; - kernel[1] = data_md.data.dims[3]; - strides[0] = strides[1] = 1; - pad_l[0] = pad_l[1] = pad_r[0] = pad_r[1] = 0; - } - - CHECK_GT(kernel[0], 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel[1], 0) << "Filter dimensions cannot be zero."; - } else { - CHECK_GE(param.pad.ndim(), 3); - CHECK_GE(param.stride.ndim(), 3); - kernel[0] = param.kernel[0]; - kernel[1] = 
param.kernel[1]; - kernel[2] = param.kernel[2]; - pad_l[0] = param.pad[0]; - pad_l[1] = param.pad[1]; - pad_l[2] = param.pad[2]; - pad_r[0] = param.pad[0]; - pad_r[1] = param.pad[1]; - pad_r[2] = param.pad[2]; - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - strides[2] = param.stride[2]; - - if (param.pooling_convention == pool_enum::kFull) { - pad_r[0] = - GetPaddingSizeFull(data_md.data.dims[2], pad_l[0], pad_r[0], kernel[0], strides[0]); - pad_r[1] = - GetPaddingSizeFull(data_md.data.dims[3], pad_l[1], pad_r[1], kernel[1], strides[1]); - pad_r[2] = - GetPaddingSizeFull(data_md.data.dims[4], pad_l[2], pad_r[2], kernel[2], strides[2]); - } - - if (param.global_pool) { - kernel[0] = data_md.data.dims[2]; - kernel[1] = data_md.data.dims[3]; - kernel[2] = data_md.data.dims[4]; - strides[0] = strides[1] = strides[2] = 1; - pad_l[0] = pad_l[1] = pad_l[2] = pad_r[0] = pad_r[1] = pad_r[2] = 0; - } - - CHECK_GT(kernel[0], 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel[1], 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel[2], 0) << "Filter dimensions cannot be zero."; - } - - if (pad_l[0] != 0 || (kernel_ndims == 2 && pad_l[1] != 0) || - (kernel_ndims == 3 && pad_l[2] != 0)) { - CHECK(param.pool_type == pool_enum::kAvgPooling || param.pool_type == pool_enum::kMaxPooling) - << "Padding implemented only for average and max pooling."; - CHECK_LT(pad_l[0], kernel[0]); - if (kernel_ndims > 1) - CHECK_LT(pad_l[1], kernel[1]); - if (kernel_ndims > 2) - CHECK_LT(pad_l[2], kernel[2]); - } -} - -mkldnn::pooling_forward::primitive_desc GetPoolingFwdPdesc(const PoolingParam& param, - const bool is_train, - const mkldnn::memory::desc& data_md, - const mkldnn::memory::desc& out_md) { - CHECK(param.kernel.ndim() == 1 || param.kernel.ndim() == 2 || param.kernel.ndim() == 3) - << "Not Implemented"; - - const int kernel_ndims = param.kernel.ndim(); - mkldnn::memory::dims kernel(kernel_ndims); - mkldnn::memory::dims strides(kernel_ndims); - mkldnn::memory::dims pad_l(kernel_ndims); - mkldnn::memory::dims pad_r(kernel_ndims); - - InitPoolingPrimitiveParams(param, data_md, kernel, strides, pad_l, pad_r); - - const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param); - mkldnn::prop_kind kind = mkldnn::prop_kind::forward_scoring; - if (is_train && alg != mkldnn::algorithm::pooling_avg) { - kind = mkldnn::prop_kind::forward_training; - } - - const mkldnn::pooling_forward::desc poolingFwd_desc( - kind, alg, data_md, out_md, strides, kernel, pad_l, pad_r); - return mkldnn::pooling_forward::primitive_desc(poolingFwd_desc, CpuEngine::Get()->get_engine()); -} - -MKLDNNPoolingFwd& GetPoolingFwd(const PoolingParam& param, - const bool is_train, - const NDArray& data, - const NDArray& output) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map - pooling_fwds; -#else - static MX_THREAD_LOCAL std::unordered_map - pooling_fwds; -#endif - - bool with_workspace = is_train && MKLDNNRequireWorkspace(param); - MKLDNNPoolingSignature key(param); - key.AddSign(is_train); - key.AddSign(with_workspace); - key.AddSign(data); - key.AddSign(output); - - auto it = pooling_fwds.find(key); - if (it == pooling_fwds.end()) { - CHECK(param.kernel.ndim() == 1 || param.kernel.ndim() == 2 || param.kernel.ndim() == 3) - << "Not Implemented"; - auto data_md = data.GetMKLDNNData()->get_desc(); - - const auto kernel_ndims = param.kernel.ndim(); - mkldnn::memory::dims kernel(kernel_ndims); - mkldnn::memory::dims strides(kernel_ndims); - mkldnn::memory::dims pad_l(kernel_ndims); - 
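InitPoolingPrimitiveParams and GetPoolingFwdPdesc above translate MXNet's PoolingParam into the kernel, stride and padding dims that a oneDNN pooling primitive expects. For reference, a minimal standalone sketch against the oneDNN v2.x C++ API (the mkldnn:: names in this file are aliases of the same dnnl:: types); the 2x2/stride-2 max-pool shapes below are purely illustrative and not taken from MXNet:

    #include "dnnl.hpp"

    int main() {
      using namespace dnnl;
      engine eng(engine::kind::cpu, 0);
      stream strm(eng);

      // NCHW source/destination descriptors for a 2x2, stride-2 max pool.
      memory::dims src_dims = {1, 16, 32, 32};
      memory::dims dst_dims = {1, 16, 16, 16};
      memory::dims kernel = {2, 2}, strides = {2, 2}, pad_l = {0, 0}, pad_r = {0, 0};
      auto src_md = memory::desc(src_dims, memory::data_type::f32, memory::format_tag::nchw);
      auto dst_md = memory::desc(dst_dims, memory::data_type::f32, memory::format_tag::nchw);

      // forward_inference (same as forward_scoring) produces no workspace; the removed
      // wrapper switches to forward_training for max pooling so backward can use one.
      auto pool_d  = pooling_forward::desc(prop_kind::forward_inference, algorithm::pooling_max,
                                           src_md, dst_md, strides, kernel, pad_l, pad_r);
      auto pool_pd = pooling_forward::primitive_desc(pool_d, eng);

      memory src_mem(src_md, eng), dst_mem(pool_pd.dst_desc(), eng);
      pooling_forward(pool_pd).execute(strm, {{DNNL_ARG_SRC, src_mem}, {DNNL_ARG_DST, dst_mem}});
      strm.wait();
      return 0;
    }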
mkldnn::memory::dims pad_r(kernel_ndims); - InitPoolingPrimitiveParams(param, data_md, kernel, strides, pad_l, pad_r); - - const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param); - MKLDNNPoolingFwd fwd( - data, output, kernel, strides, pad_l, pad_r, alg, with_workspace, is_train); - it = AddToCache(&pooling_fwds, key, fwd); - } - return it->second; -} - -void MKLDNNPoolingCompute(const OpContext& ctx, - const PoolingParam& param, - const NDArray& in_data, - const OpReqType req, - const NDArray& out_data, - const NDArray* workspace) { - auto& fwd = GetPoolingFwd(param, ctx.is_train, in_data, out_data); - fwd.Execute(in_data, req, out_data, workspace); -} - -MKLDNNPoolingBwd::MKLDNNPoolingBwd(const mkldnn::pooling_backward::primitive_desc& pdesc, - bool with_ws) - : with_workspace(with_ws), pd(pdesc) { - bwd = std::make_shared(pd); -} - -const mkldnn::pooling_backward& MKLDNNPoolingBwd::GetBwd() { - return *this->bwd; -} - -MKLDNNPoolingBwd& GetPoolingBwd(const PoolingParam& param, - const NDArray& in_data, - const NDArray& in_grad, - const NDArray& out_grad) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map - pooling_bwds; -#else - static MX_THREAD_LOCAL std::unordered_map - pooling_bwds; -#endif - - bool with_workspace = MKLDNNRequireWorkspace(param); - MKLDNNPoolingSignature key(param); - key.AddSign(in_data); - key.AddSign(in_grad); - key.AddSign(out_grad); - - auto it = pooling_bwds.find(key); - if (it == pooling_bwds.end()) { - auto input_mem = in_data.GetMKLDNNData(); - auto data_md = input_mem->get_desc(); - - auto dst_dims = mkldnn::memory::dims(out_grad.shape().begin(), out_grad.shape().end()); - auto any = mkldnn::memory::format_tag::any; - auto dst_md = mkldnn::memory::desc(dst_dims, get_data_type(data_md), any); - - // fwd hint - auto fwd_pd = GetPoolingFwdPdesc(param, true, data_md, dst_md); - - // creat bwd desc - auto diff_src_dims = mkldnn::memory::dims(in_grad.shape().begin(), in_grad.shape().end()); - auto diff_src_md = mkldnn::memory::desc(diff_src_dims, get_data_type(data_md), any); - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto alg = GetMKLDNNPoolAlgo(param); - - const int kernel_ndims = param.kernel.ndim(); - mkldnn::memory::dims kernel(kernel_ndims); - mkldnn::memory::dims strides(kernel_ndims); - mkldnn::memory::dims pad_l(kernel_ndims); - mkldnn::memory::dims pad_r(kernel_ndims); - - InitPoolingPrimitiveParams(param, data_md, kernel, strides, pad_l, pad_r); - - // use dst_md as diff_dst_md with any format - auto bwd_desc = - mkldnn::pooling_backward::desc(alg, diff_src_md, dst_md, strides, kernel, pad_l, pad_r); - auto pdesc = mkldnn::pooling_backward::primitive_desc(bwd_desc, cpu_engine, fwd_pd); - - MKLDNNPoolingBwd bwd(pdesc, with_workspace); - it = AddToCache(&pooling_bwds, key, bwd); - } - return it->second; -} - -void MKLDNNPoolingGradCompute(const OpContext& ctx, - const PoolingParam& param, - const NDArray& out_grad, - const NDArray& in_data, - const NDArray* workspace, - const OpReqType req, - const NDArray& in_grad) { - if (req == kNullOp) { - return; - } - - TmpMemMgr::Get()->Init(ctx.requested[0]); - - auto& bwd = GetPoolingBwd(param, in_data, in_grad, out_grad); - auto diff_dst_mem = out_grad.GetMKLDNNDataReorder(bwd.pd.diff_dst_desc()); - auto diff_src_mem = CreateMKLDNNMem(in_grad, bwd.pd.diff_src_desc(), req); - mkldnn_args_map_t args = { - {MKLDNN_ARG_DIFF_DST, *diff_dst_mem}, - {MKLDNN_ARG_DIFF_SRC, *diff_src_mem.second}, - }; - if (MKLDNNRequireWorkspace(param) && workspace != nullptr) { - 
args[MKLDNN_ARG_WORKSPACE] = *(workspace->GetMKLDNNData()); - } - - MKLDNNStream::Get()->RegisterPrimArgs(bwd.GetBwd(), args); - CommitOutput(in_grad, diff_src_mem); - MKLDNNStream::Get()->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_ONEDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_reshape.cc b/src/operator/nn/mkldnn/mkldnn_reshape.cc deleted file mode 100644 index 99d64efa148a..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_reshape.cc +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_reshape.cc - * \brief Implement reshape operator via MKL-DNN reorder primitive - * \author Tao Lv - */ - -#if MXNET_USE_ONEDNN == 1 -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" -#include "./mkldnn_reshape-inl.h" - -#include "../../tensor/elemwise_unary_op.h" - -namespace mxnet { -namespace op { - -bool SupportMKLDNNReshape(const NDArray& input, const NDArray& output) { - const int input_ndims = input.shape().ndim(); - const int output_ndims = output.shape().ndim(); - return input.shape().Size() > 0 && input_ndims >= 1 && input_ndims <= 6 && output_ndims >= 1 && - output_ndims <= 6 && IsMKLDNNType(input.dtype()); -} - -MKLDNNReshapeFwd::MKLDNNReshapeFwd(const OpReqType& req, - const NDArray& input, - const NDArray& output) { - const auto engine = CpuEngine::Get()->get_engine(); - auto in_mem = input.GetMKLDNNData(); - - // Create temp memory - auto temp_dims = mkldnn::memory::dims(input.shape().begin(), input.shape().end()); - auto temp_type = static_cast(get_mkldnn_type(input.dtype())); - auto temp_fmt = static_cast(GetDefaultFormat(input.shape().ndim())); - auto temp_desc = mkldnn::memory::desc(temp_dims, temp_type, temp_fmt); - - out_ = std::make_shared(temp_desc, engine, nullptr); - if (req == kWriteInplace) { - // If the input has MKL-DNN internal layout, we need reorder it to a temporal buffer with - // default layout and copy from the temporal buffer back to output buffer which has the same - // address with input buffer. - // If the input has default layout, then nothing need to do. - if (input.IsMKLDNNData()) { - temp_ = std::make_shared(temp_desc, engine, nullptr); - prims_.push_back(mkldnn::reorder(*in_mem, *temp_)); // reorder to default - prims_.push_back(mkldnn::reorder(*temp_, *out_)); // copy back - } - } else if (req == kWriteTo) { - prims_.push_back(mkldnn::reorder(*in_mem, *out_)); - } else { - LOG(FATAL) << "not supported req type: " << req; - } -} - -int MKLDNNReshapeFwd::GetWorkspaceSize() { - return temp_ ? 
temp_->get_desc().get_size() : 0; -} - -void MKLDNNReshapeFwd::Execute(const NDArray& input, - const NDArray& output, - const OpReqType& req, - void* workspace) { - auto stream = MKLDNNStream::Get(); - auto in_mem = input.GetMKLDNNData(); - // register primitives and arguments - std::vector args_map; - size_t prims_size = prims_.size(); - if (prims_size == 1) { - args_map.push_back({{MKLDNN_ARG_FROM, *in_mem}, {MKLDNN_ARG_TO, *output.GetMKLDNNData()}}); - } else if (prims_size == 2) { - if (workspace) { - temp_->set_data_handle(workspace); - } - args_map.push_back({{MKLDNN_ARG_FROM, *in_mem}, {MKLDNN_ARG_TO, *temp_}}); - args_map.push_back({{MKLDNN_ARG_FROM, *temp_}, {MKLDNN_ARG_TO, *output.GetMKLDNNData()}}); - } else { - CHECK(prims_size == 0 && req != kWriteTo) << "kWriteTo should never reach here."; - } - - for (size_t i = 0; i < prims_size; i++) { - stream->RegisterPrimArgs(prims_[i], args_map[i]); - } - stream->Submit(); - // invalidate mkldnn memory in output - const_cast(output).InvalidateMKLDNNData(); -} - -MKLDNNReshapeFwd& GetReshapeForward(const OpReqType& req, - const NDArray& input, - const NDArray& output) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map fwds; -#else - static MX_THREAD_LOCAL std::unordered_map fwds; -#endif - MKLDNNReshapeSignature key; - key.AddSign(req); - key.AddSign(input); - - auto it = fwds.find(key); - if (it == fwds.end()) { - MKLDNNReshapeFwd fwd(req, input, output); - it = AddToCache(&fwds, key, fwd); - } - return it->second; -} - -void MKLDNNReshapeForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& input, - const OpReqType& req, - const NDArray& output) { - if (req == kNullOp) - return; - CHECK_NE(req, kAddTo) << "kAddTo is not supported yet"; - auto fwd = GetReshapeForward(req, input, output); - auto ws_size = fwd.GetWorkspaceSize(); - void* ws_ptr = nullptr; - if (ws_size) { - mshadow::Stream* s = ctx.get_stream(); - mshadow::Tensor ws = - ctx.requested[0].get_space_typed(mshadow::Shape1(ws_size), s); - ws_ptr = static_cast(ws.dptr_); - } - fwd.Execute(input, output, req, ws_ptr); -} - -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc deleted file mode 100644 index 27902a8046c0..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_softmax.cc +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
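The removed MKLDNNReshapeFwd above implements reshape as one or two dnnl reorders: data held in an opaque blocked layout is first reordered into the plain default layout, after which the flat buffer can simply be reinterpreted with the new shape (the kWriteInplace case adds a temporary buffer in between). A minimal sketch of such a layout-normalizing reorder, oneDNN v2.x API, with illustrative dims and formats:

    #include "dnnl.hpp"

    int main() {
      using namespace dnnl;
      engine eng(engine::kind::cpu, 0);
      stream strm(eng);

      memory::dims dims = {2, 8, 4, 4};
      // Source in a blocked layout (as oneDNN primitives often produce),
      // destination in the plain nchw layout the reshape output expects.
      auto blocked_md = memory::desc(dims, memory::data_type::f32, memory::format_tag::nChw8c);
      auto plain_md   = memory::desc(dims, memory::data_type::f32, memory::format_tag::nchw);

      memory src(blocked_md, eng), dst(plain_md, eng);
      reorder(src, dst).execute(strm, src, dst);  // copies while converting the layout
      strm.wait();
      return 0;
    }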
- * \file mkldnn_softmax.cc - * \brief - * \author Da Zheng - */ - -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" - -#include "../softmax-inl.h" - -#if MXNET_USE_ONEDNN == 1 -namespace mxnet { -namespace op { - -static mkldnn::softmax_forward::primitive_desc GetSoftmaxFwdPd(bool is_train, - const int axis, - const mkldnn::memory& input_mem) { - mkldnn::memory::desc data_md = input_mem.get_desc(); - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - auto desc = mkldnn::softmax_forward::desc(prop, data_md, axis); - return mkldnn::softmax_forward::primitive_desc(desc, cpu_engine); -} - -static mkldnn::softmax_backward::primitive_desc GetSoftmaxBwdPd( - const mkldnn::memory& diff_mem, - const mkldnn::memory& data_mem, - const int axis, - const mkldnn::softmax_forward::primitive_desc& hint_fwd_pd) { - mkldnn::memory::desc diff_md = diff_mem.get_desc(); - mkldnn::memory::desc data_md = data_mem.get_desc(); - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto desc = mkldnn::softmax_backward::desc(diff_md, data_md, axis); - return mkldnn::softmax_backward::primitive_desc(desc, cpu_engine, hint_fwd_pd); -} - -bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray& data, const NDArray& output) { - // MKLDNN does not support temperature argument in their softmax function - // now. Need update this once they start to support it. - const int ndim = data.shape().ndim(); - const int in_dtype = data.dtype(); - const int out_dtype = output.dtype(); - const int axis = CheckAxis(param.axis, ndim); - // MKLDNN does not support temperature argument in their softmax function - // now. Need update this once they start to support it. - // Currently, MKLDNN shows bad performance when softmax is not performed on the last dimension - if (param.temperature.has_value() || in_dtype != mshadow::kFloat32 || in_dtype != out_dtype || - axis != (ndim - 1)) { - return false; - } - - // only supports ndim = 1, 2, 3, 4 for now - return (ndim >= 1 && ndim <= 4); -} - -class MKLDNNSoftmaxFwd { - public: - mkldnn::softmax_forward::primitive_desc pd; - - MKLDNNSoftmaxFwd(const bool is_train, const int axis, const mkldnn::memory& input) - : pd(GetSoftmaxFwdPd(is_train, axis, input)) { - fwd_ = std::make_shared(pd); - } - - const mkldnn::softmax_forward& GetFwd() const { - return *fwd_; - } - - private: - std::shared_ptr fwd_; -}; - -typedef ParamOpSign MKLDNNSoftmaxSignature; - -static MKLDNNSoftmaxFwd& GetSoftmaxFwd(const SoftmaxParam& param, - const int real_axis, - const bool is_train, - const NDArray& data, - const NDArray& output) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map fwds; -#else - static MX_THREAD_LOCAL std::unordered_map fwds; -#endif - - MKLDNNSoftmaxSignature key(param); - key.AddSign(real_axis); - key.AddSign(is_train); - key.AddSign(data); - key.AddSign(output); - - auto it = fwds.find(key); - if (it == fwds.end()) { - MKLDNNSoftmaxFwd fwd(is_train, real_axis, *(data.GetMKLDNNData())); - it = AddToCache(&fwds, key, fwd); - } - return it->second; -} - -void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const NDArray& in_data, - const OpReqType& req, - const NDArray& out_data) { - if (req == kNullOp) - return; - // same as the FCompute path, softmax only supports kWriteTo and kWriteInplace for now. 
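GetSoftmaxFwd below, like GetPoolingFwd earlier in this diff, memoizes primitives in a thread-local map keyed by the operator signature, so repeated calls with identical shapes reuse the same compiled primitive. A rough sketch of that caching idea without MXNet's OpSignature/AddToCache machinery; the hash key here is a hypothetical stand-in and would need to cover dtype, layout, every dim and the axis to be collision-safe:

    #include <cstddef>
    #include <functional>
    #include <unordered_map>
    #include "dnnl.hpp"

    struct CachedSoftmax {
      dnnl::softmax_forward::primitive_desc pd;
      dnnl::softmax_forward prim;
    };

    // Hypothetical lookup; a real key must encode the full memory descriptor.
    static CachedSoftmax& GetCachedSoftmax(const dnnl::memory::desc& md,
                                           int axis,
                                           const dnnl::engine& eng) {
      static thread_local std::unordered_map<std::size_t, CachedSoftmax> cache;
      std::size_t key = std::hash<std::size_t>()(md.get_size()) ^ (std::size_t(axis) << 1);
      auto it = cache.find(key);
      if (it == cache.end()) {
        auto d  = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_inference, md, axis);
        auto pd = dnnl::softmax_forward::primitive_desc(d, eng);
        it = cache.emplace(key, CachedSoftmax{pd, dnnl::softmax_forward(pd)}).first;
      }
      return it->second;
    }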
- CHECK_NE(req, kAddTo); - - const SoftmaxParam& param = nnvm::get(attrs.parsed); - int axis = CheckAxis(param.axis, in_data.shape().ndim()); - auto fwd = GetSoftmaxFwd(param, axis, ctx.is_train, in_data, out_data); - - auto in_mem = in_data.GetMKLDNNData(); - auto out_mem = out_data.GetMKLDNNData(fwd.pd.dst_desc()); - MKLDNNStream* stream = MKLDNNStream::Get(); - stream->RegisterPrimArgs(fwd.GetFwd(), {{MKLDNN_ARG_SRC, *in_mem}, {MKLDNN_ARG_DST, *out_mem}}); - stream->Submit(); -} - -class MKLDNNSoftmaxBwd { - public: - mkldnn::softmax_backward::primitive_desc pd; - - MKLDNNSoftmaxBwd(const mkldnn::memory& diff_mem, - const mkldnn::memory& data_mem, - const int axis, - const mkldnn::softmax_forward::primitive_desc& hint_fwd_pd) - : pd(GetSoftmaxBwdPd(diff_mem, data_mem, axis, hint_fwd_pd)) { - bwd_ = std::make_shared(pd); - } - - const mkldnn::softmax_backward& GetBwd() const { - return *bwd_; - } - - private: - std::shared_ptr bwd_; -}; - -static MKLDNNSoftmaxBwd& GetSoftmaxBwd(const SoftmaxParam& param, - const int real_axis, - const std::vector& data, - const std::vector& output) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map bwds; -#else - static MX_THREAD_LOCAL std::unordered_map bwds; -#endif - - MKLDNNSoftmaxSignature key(param); - key.AddSign(real_axis); - key.AddSign(data); - key.AddSign(output); - - auto it = bwds.find(key); - if (it == bwds.end()) { - auto diff_mem = data[0].GetMKLDNNData(); - auto data_mem = data[1].GetMKLDNNData(); - auto fwd_pd = GetSoftmaxFwdPd(true, real_axis, *data_mem); - MKLDNNSoftmaxBwd bwd(*diff_mem, *data_mem, real_axis, fwd_pd); - it = AddToCache(&bwds, key, bwd); - } - return it->second; -} - -void MKLDNNSoftmaxBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - if (req[0] == kNullOp) - return; - CHECK_EQ(in_data.size(), 2U); - const SoftmaxParam& param = nnvm::get(attrs.parsed); - int axis = CheckAxis(param.axis, in_data[1].shape().ndim()); - auto diff_mem = in_data[0].GetMKLDNNData(); - auto data_mem = in_data[1].GetMKLDNNData(); - auto bwd = GetSoftmaxBwd(param, axis, in_data, out_data); - - auto out_mem = CreateMKLDNNMem(out_data[0], bwd.pd.diff_src_desc(), req[0]); - MKLDNNStream* stream = MKLDNNStream::Get(); - mkldnn_args_map_t args = {{MKLDNN_ARG_DST, *data_mem}, - {MKLDNN_ARG_DIFF_DST, *diff_mem}, - {MKLDNN_ARG_DIFF_SRC, *out_mem.second}}; - - stream->RegisterPrimArgs(bwd.GetBwd(), args); - CommitOutput(out_data[0], out_mem); - stream->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_softmax_output.cc b/src/operator/nn/mkldnn/mkldnn_softmax_output.cc deleted file mode 100644 index cb2c9ded91c0..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_softmax_output.cc +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
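The backward path above builds softmax_backward from a forward primitive_desc hint and binds DST, DIFF_DST and DIFF_SRC. The same wiring in bare oneDNN v2.x terms, using a 2-D shape and last-axis softmax to mirror the support check above (all values illustrative):

    #include "dnnl.hpp"

    int main() {
      using namespace dnnl;
      engine eng(engine::kind::cpu, 0);
      stream strm(eng);

      const int axis = 1;  // last dimension, the only case the removed support check allows
      auto md = memory::desc({4, 10}, memory::data_type::f32, memory::format_tag::nc);

      auto fwd_pd = softmax_forward::primitive_desc(
          softmax_forward::desc(prop_kind::forward_training, md, axis), eng);
      auto bwd_pd = softmax_backward::primitive_desc(
          softmax_backward::desc(/*diff_desc=*/md, /*data_desc=*/md, axis), eng, fwd_pd);

      memory dst(md, eng), diff_dst(md, eng), diff_src(md, eng);
      softmax_backward(bwd_pd).execute(strm, {{DNNL_ARG_DST, dst},
                                              {DNNL_ARG_DIFF_DST, diff_dst},
                                              {DNNL_ARG_DIFF_SRC, diff_src}});
      strm.wait();
      return 0;
    }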
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_softmax_output.cc - * \brief integrate mkldnn softmax to softmax_output forward - * \author Zhang Rong A - */ - -#if MXNET_USE_ONEDNN == 1 -#include "../../softmax_output-inl.h" -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" -namespace mxnet { -namespace op { - -static mkldnn::softmax_forward::primitive_desc GetSoftmaxOutputFwdDescImpl( - const SoftmaxOutputParam& param, - bool is_train, - const int axis, - const mkldnn::memory& input_mem) { - mkldnn::memory::desc data_md = input_mem.get_desc(); - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - auto desc = mkldnn::softmax_forward::desc(prop, data_md, axis); - return mkldnn::softmax_forward::primitive_desc(desc, cpu_engine); -} - -typedef ParamOpSign MKLDNNSoftmaxOuputSignature; - -class MKLDNNSoftmaxOutputFwd { - std::shared_ptr fwd_; - - public: - const mkldnn::softmax_forward::primitive_desc fwd_pd; - - MKLDNNSoftmaxOutputFwd(const SoftmaxOutputParam& param, - bool is_train, - const int axis, - const mkldnn::memory& mem) - : fwd_pd(GetSoftmaxOutputFwdDescImpl(param, is_train, axis, mem)) { - fwd_ = std::make_shared(fwd_pd); - } - - const inline mkldnn::softmax_forward& GetFwd() const { - return *fwd_; - } -}; - -static MKLDNNSoftmaxOutputFwd& GetSoftmaxOutputForward(const SoftmaxOutputParam& param, - const OpContext& ctx, - const NDArray& in_data) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std:: - unordered_map - fwds; -#else - static MX_THREAD_LOCAL - std::unordered_map - fwds; -#endif - MKLDNNSoftmaxOuputSignature key(param); - key.AddSign(ctx.is_train); - key.AddSign(in_data); - - // softmax_output has no axis parameter, so use it as it original implement. - int axis = in_data.shape().ndim() - 1; - - auto it = fwds.find(key); - if (it == fwds.end()) { - auto in_mem = *(in_data.GetMKLDNNData()); - MKLDNNSoftmaxOutputFwd fwd(param, ctx.is_train, axis, in_mem); - it = AddToCache(&fwds, key, fwd); - } - return it->second; -} - -// This is only used for forward. For backward ,need double check compatibility -bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam& param) { - return param.multi_output ? 
false : true; -} - -void MKLDNNSoftmaxOutputForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - const SoftmaxOutputParam& param = nnvm::get(attrs.parsed); - - NDArray idata = in_data[softmaxout_enum::kData]; - NDArray odata = out_data[softmaxout_enum::kOut]; - if (in_data[softmaxout_enum::kData].IsView() && in_data[softmaxout_enum::kData].IsMKLDNNData()) { - idata = in_data[softmaxout_enum::kData].Reorder2Default(); - } - - auto input_mem = idata.GetMKLDNNData(); - auto out_mem = CreateMKLDNNMem( - out_data[softmaxout_enum::kOut], input_mem->get_desc(), req[softmaxout_enum::kOut]); - - MKLDNNSoftmaxOutputFwd& fwd = GetSoftmaxOutputForward(param, ctx, idata); - - MKLDNNStream* stream = MKLDNNStream::Get(); - stream->RegisterPrimArgs(fwd.GetFwd(), - {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem.second}}); - CommitOutput(out_data[softmaxout_enum::kOut], out_mem); - stream->Submit(); -} -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc deleted file mode 100644 index 9c6e562d7955..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_sum.cc +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file mkldnn_sum.cc - * \brief - * \author Da Zheng - */ -#include - -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" - -#include "../../operator_common.h" - -namespace mxnet { -namespace op { - -#if MXNET_USE_ONEDNN == 1 -void MKLDNNSum(const mkldnn::memory& arr1, const mkldnn::memory& arr2, const mkldnn::memory& out) { - std::vector input_pds(2); - std::vector scales(2, 1); - input_pds[0] = arr1.get_desc(); - input_pds[1] = arr2.get_desc(); - CHECK(input_pds[0] == input_pds[0]); - const mkldnn::memory* in_mem1 = &arr1; - const mkldnn::memory* in_mem2 = &arr2; - auto output_pd = out.get_desc(); - if (input_pds[0] != output_pd) { - auto tmp_memory1 = TmpMemMgr::Get()->Alloc(output_pd); - auto tmp_memory2 = TmpMemMgr::Get()->Alloc(output_pd); - MKLDNNMemoryCopy(arr1, tmp_memory1); - MKLDNNMemoryCopy(arr2, tmp_memory2); - input_pds[0] = tmp_memory1->get_desc(); - input_pds[1] = tmp_memory2->get_desc(); - in_mem1 = tmp_memory1; - in_mem2 = tmp_memory2; - } - mkldnn::sum::primitive_desc sum_pd(output_pd, scales, input_pds, CpuEngine::Get()->get_engine()); - mkldnn_args_map_t args = { - {MKLDNN_ARG_MULTIPLE_SRC, *in_mem1}, - {MKLDNN_ARG_MULTIPLE_SRC + 1, *in_mem2}, - {MKLDNN_ARG_DST, out}, - }; - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::sum(sum_pd), args); -} - -class MKLDNNSumFwd { - public: - mkldnn::sum::primitive_desc fwd_pd; - - MKLDNNSumFwd(const std::vector& scales, const std::vector& data_md) - : fwd_pd(scales, data_md, CpuEngine::Get()->get_engine()) { - fwd_ = std::make_shared(fwd_pd); - } - - const mkldnn::sum& GetFwd() const { - return *fwd_; - } - - private: - std::shared_ptr fwd_; -}; - -static MKLDNNSumFwd& GetSumForward(const std::vector& scales, - const std::vector& in_data, - const std::vector& data_md) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map fwds; -#else - static MX_THREAD_LOCAL std::unordered_map fwds; -#endif - OpSignature key; - key.AddSign(in_data); - - auto it = fwds.find(key); - if (it == fwds.end()) { - MKLDNNSumFwd fwd(scales, data_md); - it = AddToCache(&fwds, key, fwd); - } - return it->second; -} - -void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - TmpMemMgr::Get()->Init(ctx.requested[0]); - const int num_inputs = inputs.size(); - const NDArray& out_data = outputs[0]; - std::vector data_md; - std::vector data_mem; - std::vector scales(num_inputs, 1); - - data_md.reserve(num_inputs); - data_mem.reserve(num_inputs); - - for (int i = 0; i < num_inputs; ++i) { - const mkldnn::memory* in_mem = inputs[i].GetMKLDNNData(); - mkldnn::memory::desc tmp_md = in_mem->get_desc(); - data_md.push_back(tmp_md); - data_mem.push_back(in_mem); - } - - MKLDNNSumFwd& fwd = GetSumForward(scales, inputs, data_md); - mxnet::mkldnn_output_t out_mem = - CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req[0], &inputs[0]); - mkldnn_args_map_t net_args; - net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); - for (int i = 0; i < num_inputs; ++i) { - net_args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, *data_mem[i]}); - } - MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); - CommitOutput(out_data, out_mem); - MKLDNNStream::Get()->Submit(); -} -#endif - -} // namespace op -} // namespace mxnet diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 714fd0a2a51d..898309579054 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -258,7 +258,7 @@ namespace mxnet { namespace 
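Both MKLDNNSum and MKLDNNSumForward below wrap the dnnl sum primitive, which accumulates N (optionally scaled) sources into one destination. A small standalone sketch with two unit-scale inputs, oneDNN v2.x API, shapes illustrative:

    #include <vector>
    #include "dnnl.hpp"

    int main() {
      using namespace dnnl;
      engine eng(engine::kind::cpu, 0);
      stream strm(eng);

      auto md = memory::desc({2, 64}, memory::data_type::f32, memory::format_tag::nc);
      std::vector<float> scales = {1.f, 1.f};
      std::vector<memory::desc> src_mds = {md, md};

      // Without an explicit dst desc, oneDNN chooses a suitable destination layout.
      auto sum_pd = sum::primitive_desc(scales, src_mds, eng);

      memory a(md, eng), b(md, eng), out(sum_pd.dst_desc(), eng);
      sum(sum_pd).execute(strm, {{DNNL_ARG_MULTIPLE_SRC, a},
                                 {DNNL_ARG_MULTIPLE_SRC + 1, b},
                                 {DNNL_ARG_DST, out}});
      strm.wait();
      return 0;
    }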
op { /* - * When MKLDNN is enabled, we might want 2 outputs instead of one inputs, which + * When DNNL is enabled, we might want 2 outputs instead of one inputs, which * also changes the number of inputs for backward. */ int GetNumOutputs(const PoolingParam& param); @@ -482,7 +482,7 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, << "You need to set the kernel size if global pooling is not used"; } off_t ograd_idx, in_data_idx, out_data_idx; - // When MKLDNN is enabled, the input data may contains arrays for workspace. + // When DNNL is enabled, the input data may contains arrays for workspace. if (GetNumBackInputs(param) == 5) { ograd_idx = 0; in_data_idx = 2; diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 005fe9c8ced6..47114f8cc897 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -25,8 +25,8 @@ #include "../elemwise_op_common.h" #include "./pooling-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_pooling-inl.h" -#include "./mkldnn/mkldnn_base-inl.h" +#include "./dnnl/dnnl_base-inl.h" +#include "./dnnl/dnnl_pooling-inl.h" #endif // MXNET_USE_ONEDNN namespace mxnet { namespace op { @@ -63,7 +63,7 @@ void PoolingParamParser(nnvm::NodeAttrs* attrs) { int GetNumOutputs(const PoolingParam& param) { #if MXNET_USE_ONEDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; + return DNNLRequireWorkspace(param) && SupportDNNLPooling(param) ? 2 : 1; #else return 1; #endif @@ -71,7 +71,7 @@ int GetNumOutputs(const PoolingParam& param) { int GetNumBackInputs(const PoolingParam& param) { #if MXNET_USE_ONEDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5 : 3; + return DNNLRequireWorkspace(param) && SupportDNNLPooling(param) ? 
5 : 3; #else return 3; #endif @@ -83,7 +83,7 @@ static bool PoolingType(const nnvm::NodeAttrs& attrs, out_attrs->at(0) = in_attrs->at(0); #if MXNET_USE_ONEDNN == 1 const PoolingParam& param = nnvm::get(attrs.parsed); - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) { + if (DNNLRequireWorkspace(param) && SupportDNNLPooling(param)) { CHECK_GT(out_attrs->size(), 1U); out_attrs->at(1) = mshadow::kInt32; } @@ -147,7 +147,7 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_ONEDNN == 1 - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) + if (DNNLRequireWorkspace(param) && SupportDNNLPooling(param)) out_shape->push_back(oshape); // for workspace #endif } else if (param.kernel.ndim() == 0) { @@ -181,7 +181,7 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_ONEDNN == 1 - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) + if (DNNLRequireWorkspace(param) && SupportDNNLPooling(param)) out_shape->push_back(oshape); // for workspace #endif } else if (param.kernel.ndim() == 2) { @@ -218,7 +218,7 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_ONEDNN == 1 - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) + if (DNNLRequireWorkspace(param) && SupportDNNLPooling(param)) out_shape->push_back(oshape); // for workspace #endif } else if (param.kernel.ndim() == 3) { @@ -261,7 +261,7 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_ONEDNN == 1 - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) + if (DNNLRequireWorkspace(param) && SupportDNNLPooling(param)) out_shape->push_back(oshape); // for workspace #endif } @@ -284,14 +284,14 @@ void PoolingComputeExCPU(const nnvm::NodeAttrs& attrs, return; } - if (SupportMKLDNNPooling(param, inputs[0])) { - if (MKLDNNRequireWorkspace(param)) { + if (SupportDNNLPooling(param, inputs[0])) { + if (DNNLRequireWorkspace(param)) { CHECK_GT(outputs.size(), 1U); workspace = &outputs[1]; } - MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs); - MKLDNNPoolingCompute(ctx, param, inputs[0], req[0], outputs[0], workspace); - MKLDNN_OPCHECK_RUN(PoolingCompute, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_INIT(false, 1, inputs, outputs); + DNNLPoolingCompute(ctx, param, inputs[0], req[0], outputs[0], workspace); + DNNL_OPCHECK_RUN(PoolingCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(PoolingCompute, attrs, ctx, inputs, req, outputs); @@ -310,11 +310,11 @@ void PoolingGradComputeExCPU(const nnvm::NodeAttrs& attrs, return; } - if (SupportMKLDNNPooling(param, inputs[0])) { + if (SupportDNNLPooling(param, inputs[0])) { const NDArray& out_grad = inputs[0]; const NDArray* workspace = nullptr; const NDArray* in_data = nullptr; - if (MKLDNNRequireWorkspace(param)) { + if (DNNLRequireWorkspace(param)) { // The first two elements are the gradient of the outputs in forward. // The third is the input of forward. // The fourth and the fifth are the outputs of forward. 
@@ -326,9 +326,9 @@ void PoolingGradComputeExCPU(const nnvm::NodeAttrs& attrs, in_data = &inputs[1]; } const NDArray& in_grad = outputs[0]; - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNPoolingGradCompute(ctx, param, out_grad, *in_data, workspace, req[0], in_grad); - MKLDNN_OPCHECK_RUN(PoolingGradCompute, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + DNNLPoolingGradCompute(ctx, param, out_grad, *in_data, workspace, req[0], in_grad); + DNNL_OPCHECK_RUN(PoolingGradCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(PoolingGradCompute, attrs, ctx, inputs, req, outputs); @@ -341,10 +341,9 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 1); const PoolingParam& param = nnvm::get(attrs.parsed); - bool support_mkldnn_pool = SupportMKLDNNPooling(param); + bool support_dnnl_pool = SupportDNNLPooling(param); - return MKLDNNStorageType( - attrs, dev_mask, support_mkldnn_pool, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, support_dnnl_pool, dispatch_mode, in_attrs, out_attrs); } inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs& attrs, @@ -355,10 +354,9 @@ inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs& attrs, const PoolingParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), GetNumBackInputs(param)); CHECK_EQ(out_attrs->size(), 1); - bool support_mkldnn_pool = SupportMKLDNNPooling(param); + bool support_dnnl_pool = SupportDNNLPooling(param); - return MKLDNNStorageType( - attrs, dev_mask, support_mkldnn_pool, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, support_dnnl_pool, dispatch_mode, in_attrs, out_attrs); } #endif @@ -446,7 +444,7 @@ For each window ``X``, the mathematical expression for Lp pooling is: .set_attr("FInferShape", PoolingShape) .set_attr("FCompute", PoolingCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", PoolingComputeExCPU) #endif .set_attr("FGradient", ElemwiseGradUseInOut{"_backward_Pooling"}) @@ -467,8 +465,7 @@ NNVM_REGISTER_OP(_backward_Pooling) #if MXNET_USE_ONEDNN == 1 const PoolingParam& param = nnvm::get(attrs.parsed); - if (MKLDNNRequireWorkspace(param) && - SupportMKLDNNPooling(param)) + if (DNNLRequireWorkspace(param) && SupportDNNLPooling(param)) return std::vector >{{1, 0}}; #endif return std::vector >(); @@ -482,7 +479,7 @@ NNVM_REGISTER_OP(_backward_Pooling) #endif .set_attr_parser(PoolingParamParser) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", PoolingGradComputeExCPU) #endif .set_attr("FCompute", PoolingGradCompute); diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu index c82f5b261177..9e883c62bd81 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -88,7 +88,7 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); off_t ograd_idx, in_data_idx, out_data_idx; - // When MKLDNN is enabled, the input data may contains arrays for workspace. + // When DNNL is enabled, the input data may contains arrays for workspace. 
if (GetNumBackInputs(param) == 5) { ograd_idx = 0; in_data_idx = 2; diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index 132b36e41e0d..318446165247 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -26,8 +26,8 @@ #include "../tensor/elemwise_binary_op.h" #include "../operator_common.h" #if MXNET_USE_ONEDNN == 1 -#include "mkldnn/mkldnn_base-inl.h" -#include "mkldnn/mkldnn_ops-inl.h" +#include "dnnl/dnnl_base-inl.h" +#include "dnnl/dnnl_ops-inl.h" #endif namespace mxnet { @@ -44,11 +44,11 @@ static void SoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs, if (inputs[0].shape().Size() == 0U) return; const SoftmaxParam& param = nnvm::get(attrs.parsed); - if (SupportMKLDNNSoftmax(param, inputs[0], outputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNSoftmaxForward, attrs, ctx, inputs[0], req[0], outputs[0]); + if (SupportDNNLSoftmax(param, inputs[0], outputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLSoftmaxForward, attrs, ctx, inputs[0], req[0], outputs[0]); auto fn = SoftmaxCompute; - MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(SoftmaxCompute, attrs, ctx, inputs, req, outputs); @@ -62,11 +62,11 @@ static void SoftmaxGradComputeExCPU(const nnvm::NodeAttrs& attrs, if (inputs[0].shape().Size() == 0U) return; const SoftmaxParam& param = nnvm::get(attrs.parsed); - if (SupportMKLDNNSoftmax(param, inputs[1], outputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNSoftmaxBackward, attrs, ctx, inputs, req, outputs); + if (SupportDNNLSoftmax(param, inputs[1], outputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLSoftmaxBackward, attrs, ctx, inputs, req, outputs); auto fn = SoftmaxGradCompute; - MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(SoftmaxGradCompute, @@ -91,7 +91,7 @@ inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs, return storage_type_assign(&out_stype, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); } - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } inline static bool SoftmaxGradStorageType(const nnvm::NodeAttrs& attrs, @@ -106,7 +106,7 @@ inline static bool SoftmaxGradStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), SoftmaxGradOpNumInputs(attrs)); CHECK_EQ(out_attrs->size(), softmax_use_length(attrs) ? 
2U : 1U); - return MKLDNNStorageType(attrs, dev_mask, support, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, support, dispatch_mode, in_attrs, out_attrs); } #endif @@ -150,7 +150,7 @@ Example:: }) .set_attr("FCompute", SoftmaxCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", SoftmaxComputeExCPU) .set_attr("FInferStorageType", SoftmaxStorageType) #endif @@ -183,7 +183,7 @@ NNVM_REGISTER_OP(_backward_softmax) .add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments") .set_attr_parser(ParamParser) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", SoftmaxGradComputeExCPU) .set_attr("FInferStorageType", SoftmaxGradStorageType) #endif diff --git a/src/operator/numpy/np_matrix_op.cc b/src/operator/numpy/np_matrix_op.cc index dd7230e70970..042ff10feaf1 100644 --- a/src/operator/numpy/np_matrix_op.cc +++ b/src/operator/numpy/np_matrix_op.cc @@ -380,12 +380,13 @@ NNVM_REGISTER_OP(_npx_reshape) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_reshape"}) .set_attr("FCompute", UnaryOp::IdentityCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", ReshapeComputeExCPU) .set_attr("FInferStorageType", ReshapeStorageType) - .set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; - }) + .set_attr("FResourceRequest", + [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; + }) #endif .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 3388cfe29ec8..8c5beec85150 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -545,7 +545,7 @@ class OpSignature { */ #if MXNET_USE_ONEDNN == 1 - void AddSign(const mkldnn::memory& mem) { + void AddSign(const dnnl::memory& mem) { auto desc = mem.get_desc(); hash = hash * 2 + desc.data.format_kind; eles.push_back(desc.data.format_kind); @@ -556,7 +556,7 @@ class OpSignature { eles.push_back(desc.data.dims[i]); } switch (desc.data.format_kind) { - case mkldnn_blocked: + case dnnl_blocked: hash = hash * 2 + desc.data.ndims; eles.push_back(desc.data.ndims); for (int i = 0; i < desc.data.ndims; i++) { @@ -572,11 +572,11 @@ class OpSignature { eles.push_back(desc.data.format_desc.blocking.inner_idxs[i]); } break; - case mkldnn_format_kind_wino: + case dnnl_format_kind_wino: hash = hash * 2 + desc.data.format_desc.wino_desc.wino_format; eles.push_back(desc.data.format_desc.wino_desc.wino_format); break; - case mkldnn_format_kind_rnn_packed: + case dnnl_format_kind_rnn_packed: hash = hash * 2 + desc.data.format_desc.rnn_packed_desc.format; eles.push_back(desc.data.format_desc.rnn_packed_desc.format); hash = hash * 2 + desc.data.format_desc.rnn_packed_desc.n_parts; @@ -613,8 +613,8 @@ class OpSignature { void AddSign(const NDArray& arr) { #if MXNET_USE_ONEDNN == 1 - if (arr.IsMKLDNNData()) { - AddSign(*(arr.GetMKLDNNData())); + if (arr.IsDNNLData()) { + AddSign(*(arr.GetDNNLData())); } else { #endif hash = hash * 2 + arr.dtype(); diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc index 9c47cc96a960..b29cd1d18f40 100644 --- a/src/operator/quantization/dequantize.cc +++ b/src/operator/quantization/dequantize.cc @@ -23,7 +23,7 @@ */ #include "./dequantize-inl.h" #if MXNET_USE_ONEDNN == 1 -#include 
"./mkldnn/mkldnn_dequantize-inl.h" +#include "./dnnl/dnnl_dequantize-inl.h" #endif namespace mxnet { @@ -54,7 +54,7 @@ static OpStatePtr CreateDequantizeState(const nnvm::NodeAttrs& attrs, state = OpStatePtr::Create>(attrs); } else { #if MXNET_USE_ONEDNN == 1 - state = OpStatePtr::Create(attrs); + state = OpStatePtr::Create(attrs); #else state = OpStatePtr::Create>(attrs); #endif @@ -95,8 +95,8 @@ by keep zero centered for the quantized value: .set_attr("FGradient", MakeZeroGradNodes) .set_attr("FCreateOpState", CreateDequantizeState) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) - .set_attr("FStatefulComputeEx", SgMKLDNNDequantizeForward) + .set_attr("TIsDNNL", true) + .set_attr("FStatefulComputeEx", SgDNNLDequantizeForward) #endif .set_attr("FStatefulCompute", DequantizeForward) .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `uint8`") diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/dnnl/dnnl_dequantize-inl.h similarity index 52% rename from src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h rename to src/operator/quantization/dnnl/dnnl_dequantize-inl.h index 79356592ff79..0c4e417f4c1c 100644 --- a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h +++ b/src/operator/quantization/dnnl/dnnl_dequantize-inl.h @@ -18,26 +18,26 @@ */ /*! - * \file mkldnn_dequantize-inl.h + * \file dnnl_dequantize-inl.h * \author Wenting Jiang, Xinyu Chen * \brief */ -#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_ -#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_ +#ifndef MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_DEQUANTIZE_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_DEQUANTIZE_INL_H_ #if MXNET_USE_ONEDNN == 1 #include #include #include -#include "../../nn/mkldnn/mkldnn_base-inl.h" +#include "../../nn/dnnl/dnnl_base-inl.h" namespace mxnet { namespace op { -class SgMKLDNNDequantizeOperator { +class SgDNNLDequantizeOperator { public: - explicit SgMKLDNNDequantizeOperator(const nnvm::NodeAttrs& attrs) + explicit SgDNNLDequantizeOperator(const nnvm::NodeAttrs& attrs) : param_(nnvm::get(attrs.parsed)) {} void Forward(const OpContext& ctx, @@ -50,19 +50,19 @@ class SgMKLDNNDequantizeOperator { DequantizeParam param_; float cached_data_min_{0.f}; float cached_data_max_{0.f}; - mkldnn::memory::desc o_desc_; - mkldnn_args_map_t args_; - std::shared_ptr fwd_pd_; + dnnl::memory::desc o_desc_; + dnnl_args_map_t args_; + std::shared_ptr fwd_pd_; }; -void SgMKLDNNDequantizeOperator::Forward(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SgDNNLDequantizeOperator::Forward(const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { NDArray in_buffer = inputs[0]; - if (inputs[0].IsView() && inputs[0].IsMKLDNNData()) + if (inputs[0].IsView() && inputs[0].IsDNNLData()) in_buffer = inputs[0].Reorder2Default(); - auto i_mem = in_buffer.GetMKLDNNData(); + auto i_mem = in_buffer.GetDNNLData(); float data_min = *inputs[1].data().dptr(); float data_max = *inputs[2].data().dptr(); @@ -80,43 +80,42 @@ void SgMKLDNNDequantizeOperator::Forward(const OpContext& ctx, quantized_range = kInt8Range; real_range = MaxAbs(*inputs[1].data().dptr(), *inputs[2].data().dptr()); } else { - LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as output type"; + LOG(FATAL) << "dnnl dequantize op only supports int8 and uint8 as output type"; } float scale = real_range / quantized_range; - 
mkldnn::primitive_attr attr; + dnnl::primitive_attr attr; const int mask = 0; std::vector scales = {scale}; attr.set_output_scales(mask, scales); - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); - auto i_desc = i_mem->get_desc(); - size_t i_ndim = in_buffer.shape().ndim(); + dnnl::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + auto i_desc = i_mem->get_desc(); + size_t i_ndim = in_buffer.shape().ndim(); if (i_ndim == 4) { - mkldnn::memory::format_tag o_fmt = mkldnn::memory::format_tag::nchw; - mkldnn::memory::dims o_dims(i_desc.data.dims, i_desc.data.dims + i_desc.data.ndims); - o_desc_ = mkldnn::memory::desc(o_dims, get_mkldnn_type(), o_fmt); + dnnl::memory::format_tag o_fmt = dnnl::memory::format_tag::nchw; + dnnl::memory::dims o_dims(i_desc.data.dims, i_desc.data.dims + i_desc.data.ndims); + o_desc_ = dnnl::memory::desc(o_dims, get_dnnl_type(), o_fmt); } else { o_desc_ = i_desc; - o_desc_.data.data_type = get_mkldnn_type_t(); + o_desc_.data.data_type = get_dnnl_type_t(); } - auto reorder_pd = - mkldnn::reorder::primitive_desc(cpu_engine, i_desc, cpu_engine, o_desc_, attr); - fwd_pd_ = std::make_shared(reorder_pd); - initialized_ = true; + auto reorder_pd = dnnl::reorder::primitive_desc(cpu_engine, i_desc, cpu_engine, o_desc_, attr); + fwd_pd_ = std::make_shared(reorder_pd); + initialized_ = true; } - auto o_mem = CreateMKLDNNMem(outputs[0], o_desc_, req[0]); - args_[MKLDNN_ARG_FROM] = *i_mem; - args_[MKLDNN_ARG_TO] = *o_mem.second; - MKLDNNStream::Get()->RegisterPrimArgs(*fwd_pd_, args_); + auto o_mem = CreateDNNLMem(outputs[0], o_desc_, req[0]); + args_[DNNL_ARG_FROM] = *i_mem; + args_[DNNL_ARG_TO] = *o_mem.second; + DNNLStream::Get()->RegisterPrimArgs(*fwd_pd_, args_); CommitOutput(outputs[0], o_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } -static void SgMKLDNNDequantizeForward(const OpStatePtr& state_ptr, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - SgMKLDNNDequantizeOperator& op = state_ptr.get_state(); +static void SgDNNLDequantizeForward(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SgDNNLDequantizeOperator& op = state_ptr.get_state(); op.Forward(ctx, inputs, req, outputs); } @@ -124,4 +123,4 @@ static void SgMKLDNNDequantizeForward(const OpStatePtr& state_ptr, } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_ +#endif // MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_DEQUANTIZE_INL_H_ diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h b/src/operator/quantization/dnnl/dnnl_quantize-inl.h similarity index 54% rename from src/operator/quantization/mkldnn/mkldnn_quantize-inl.h rename to src/operator/quantization/dnnl/dnnl_quantize-inl.h index 035f15148154..7a53ab17cc5b 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h +++ b/src/operator/quantization/dnnl/dnnl_quantize-inl.h @@ -18,29 +18,29 @@ */ /*! 
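The dequantize operator above reduces to a dnnl reorder whose primitive_attr carries a single output scale (real_range / quantized_range); the quantize kernels that follow apply the inverse scale the same way. A minimal int8-to-float sketch against the oneDNN v2.x API, where set_output_scales is still available; the 0.05 scale and dims are illustrative:

    #include <vector>
    #include "dnnl.hpp"

    int main() {
      using namespace dnnl;
      engine eng(engine::kind::cpu, 0);
      stream strm(eng);

      memory::dims dims = {1, 8, 4, 4};
      auto s8_md  = memory::desc(dims, memory::data_type::s8,  memory::format_tag::nchw);
      auto f32_md = memory::desc(dims, memory::data_type::f32, memory::format_tag::nchw);

      // Dequantize: real = int8_value * scale, with scale = real_range / kInt8Range.
      primitive_attr attr;
      attr.set_output_scales(/*mask=*/0, std::vector<float>{0.05f});

      memory src(s8_md, eng), dst(f32_md, eng);
      auto rpd = reorder::primitive_desc(eng, s8_md, eng, f32_md, attr);
      reorder(rpd).execute(strm, src, dst);
      strm.wait();
      return 0;
    }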
- * \file mkldnn_quantize-inl.h + * \file dnnl_quantize-inl.h * \brief * \author Wenting Jiang, Xinyu Chen */ -#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_ -#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_ +#ifndef MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_QUANTIZE_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_QUANTIZE_INL_H_ #if MXNET_USE_ONEDNN == 1 #include #include #include -#include "../../nn/mkldnn/mkldnn_base-inl.h" +#include "../../nn/dnnl/dnnl_base-inl.h" #include "../quantize-inl.h" namespace mxnet { namespace op { template -static void MKLDNNQuantizeComputeKer(const std::vector& inputs, - const std::vector& outputs, - const QuantizeParam& param, - const std::vector& req) { +static void DNNLQuantizeComputeKer(const std::vector& inputs, + const std::vector& outputs, + const QuantizeParam& param, + const std::vector& req) { using namespace mshadow; using namespace mxnet_op; using red::limits::MaxValue; @@ -58,50 +58,50 @@ static void MKLDNNQuantizeComputeKer(const std::vector& inputs, *outputs[1].data().dptr() = -real_range; *outputs[2].data().dptr() = real_range; } else { - LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type"; + LOG(FATAL) << "dnnl quantize op only supports int8 and uint8 as output type"; } float scale = quantized_range / real_range; - mkldnn::primitive_attr attr; + dnnl::primitive_attr attr; const int mask = 0; std::vector scales = {scale}; attr.set_output_scales(mask, scales); - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); - NDArray in_buffer = inputs[0]; - if (inputs[0].IsView() && inputs[0].IsMKLDNNData()) + dnnl::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + NDArray in_buffer = inputs[0]; + if (inputs[0].IsView() && inputs[0].IsDNNLData()) in_buffer = inputs[0].Reorder2Default(); - auto i_mem = in_buffer.GetMKLDNNData(); + auto i_mem = in_buffer.GetDNNLData(); auto i_desc = i_mem->get_desc(); size_t i_ndim = in_buffer.shape().ndim(); - mkldnn::memory::desc o_desc; + dnnl::memory::desc o_desc; if (i_ndim == 4) { - mkldnn::memory::format_tag o_fmt = mkldnn::memory::format_tag::nhwc; - mkldnn::memory::dims o_dims(i_desc.data.dims, i_desc.data.dims + i_desc.data.ndims); - o_desc = mkldnn::memory::desc(o_dims, get_mkldnn_type(), o_fmt); + dnnl::memory::format_tag o_fmt = dnnl::memory::format_tag::nhwc; + dnnl::memory::dims o_dims(i_desc.data.dims, i_desc.data.dims + i_desc.data.ndims); + o_desc = dnnl::memory::desc(o_dims, get_dnnl_type(), o_fmt); } else { o_desc = i_desc; - o_desc.data.data_type = get_mkldnn_type_t(); + o_desc.data.data_type = get_dnnl_type_t(); } - auto reorder_pd = mkldnn::reorder::primitive_desc(cpu_engine, i_desc, cpu_engine, o_desc, attr); - auto o_mem = CreateMKLDNNMem(outputs[0], o_desc, req[0]); - MKLDNNStream::Get()->RegisterPrimArgs( - mkldnn::reorder(reorder_pd), {{MKLDNN_ARG_FROM, *i_mem}, {MKLDNN_ARG_TO, *o_mem.second}}); + auto reorder_pd = dnnl::reorder::primitive_desc(cpu_engine, i_desc, cpu_engine, o_desc, attr); + auto o_mem = CreateDNNLMem(outputs[0], o_desc, req[0]); + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(reorder_pd), + {{DNNL_ARG_FROM, *i_mem}, {DNNL_ARG_TO, *o_mem.second}}); CommitOutput(outputs[0], o_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } -static void MKLDNNQuantizeCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +static void DNNLQuantizeCompute(const nnvm::NodeAttrs& attrs, + 
const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const QuantizeParam& param = nnvm::get(attrs.parsed); if (param.out_type == mshadow::kUint8) { - MKLDNNQuantizeComputeKer(inputs, outputs, param, req); + DNNLQuantizeComputeKer(inputs, outputs, param, req); } else if (param.out_type == mshadow::kInt8) { - MKLDNNQuantizeComputeKer(inputs, outputs, param, req); + DNNLQuantizeComputeKer(inputs, outputs, param, req); } else { - LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type"; + LOG(FATAL) << "dnnl quantize op only supports int8 and uint8 as output type"; } } @@ -109,4 +109,4 @@ static void MKLDNNQuantizeCompute(const nnvm::NodeAttrs& attrs, } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_ +#endif // MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_QUANTIZE_INL_H_ diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/dnnl/dnnl_quantize_v2-inl.h similarity index 65% rename from src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h rename to src/operator/quantization/dnnl/dnnl_quantize_v2-inl.h index 2413341a2d22..1acc8a59ce19 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h +++ b/src/operator/quantization/dnnl/dnnl_quantize_v2-inl.h @@ -18,26 +18,26 @@ */ /*! - * \file mkldnn_quantize_v2-inl.h + * \file dnnl_quantize_v2-inl.h * \brief */ -#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_V2_INL_H_ -#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_V2_INL_H_ +#ifndef MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_QUANTIZE_V2_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_QUANTIZE_V2_INL_H_ #if MXNET_USE_ONEDNN == 1 #include #include #include -#include "../../nn/mkldnn/mkldnn_base-inl.h" +#include "../../nn/dnnl/dnnl_base-inl.h" #include "../quantize_v2-inl.h" namespace mxnet { namespace op { -class SgMKLDNNQuantizeOperator { +class SgDNNLQuantizeOperator { public: - explicit SgMKLDNNQuantizeOperator(const nnvm::NodeAttrs& attrs) + explicit SgDNNLQuantizeOperator(const nnvm::NodeAttrs& attrs) : param_(nnvm::get(attrs.parsed)) {} void Forward(const OpContext& ctx, @@ -50,15 +50,15 @@ class SgMKLDNNQuantizeOperator { QuantizeV2Param param_; float cached_data_min_{0.f}; float cached_data_max_{0.f}; - mkldnn::memory::desc o_desc_; - mkldnn_args_map_t args_; - std::shared_ptr fwd_pd_; + dnnl::memory::desc o_desc_; + dnnl_args_map_t args_; + std::shared_ptr fwd_pd_; }; -void SgMKLDNNQuantizeOperator::Forward(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SgDNNLQuantizeOperator::Forward(const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { float quantized_range = 0.0; NDArray in_buffer = inputs[0]; float data_min = mshadow::red::limits::MaxValue(); @@ -79,13 +79,13 @@ void SgMKLDNNQuantizeOperator::Forward(const OpContext& ctx, } } if (req[0] != kWriteInplace) { - const_cast(outputs[0]).CopyFrom(*inputs[0].GetMKLDNNData()); - MKLDNNStream::Get()->Submit(); + const_cast(outputs[0]).CopyFrom(*inputs[0].GetDNNLData()); + DNNLStream::Get()->Submit(); } } else { - if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) + if (in_buffer.IsView() && in_buffer.IsDNNLData()) in_buffer = inputs[0].Reorder2Default(); - auto i_mem = in_buffer.GetMKLDNNData(); + auto i_mem = in_buffer.GetDNNLData(); if (param_.min_calib_range.has_value() && 
param_.max_calib_range.has_value()) { data_min = param_.min_calib_range.value(); @@ -128,7 +128,7 @@ void SgMKLDNNQuantizeOperator::Forward(const OpContext& ctx, *outputs[1].data().dptr() = -real_range; *outputs[2].data().dptr() = real_range; } else { - LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type"; + LOG(FATAL) << "dnnl quantize op only supports int8 and uint8 as output type"; } if (!initalized_) { @@ -136,41 +136,41 @@ void SgMKLDNNQuantizeOperator::Forward(const OpContext& ctx, cached_data_max_ = data_max; float real_range = MaxAbs(data_min, data_max); float scale = quantized_range / real_range; - mkldnn::primitive_attr attr; + dnnl::primitive_attr attr; const int mask = 0; std::vector scales = {scale}; attr.set_output_scales(mask, scales); - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); - auto i_desc = i_mem->get_desc(); - size_t i_ndim = in_buffer.shape().ndim(); + dnnl::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + auto i_desc = i_mem->get_desc(); + size_t i_ndim = in_buffer.shape().ndim(); if (i_ndim == 4) { - mkldnn::memory::format_tag o_fmt = mkldnn::memory::format_tag::nhwc; - mkldnn::memory::dims o_dims(i_desc.data.dims, i_desc.data.dims + i_desc.data.ndims); - o_desc_ = mkldnn::memory::desc(o_dims, get_mkldnn_type(out_type), o_fmt); + dnnl::memory::format_tag o_fmt = dnnl::memory::format_tag::nhwc; + dnnl::memory::dims o_dims(i_desc.data.dims, i_desc.data.dims + i_desc.data.ndims); + o_desc_ = dnnl::memory::desc(o_dims, get_dnnl_type(out_type), o_fmt); } else { o_desc_ = i_desc; - o_desc_.data.data_type = get_mkldnn_type_t(out_type); + o_desc_.data.data_type = get_dnnl_type_t(out_type); } auto reorder_pd = - mkldnn::reorder::primitive_desc(cpu_engine, i_desc, cpu_engine, o_desc_, attr); - fwd_pd_ = std::make_shared(reorder_pd); + dnnl::reorder::primitive_desc(cpu_engine, i_desc, cpu_engine, o_desc_, attr); + fwd_pd_ = std::make_shared(reorder_pd); initalized_ = true; } - auto o_mem = CreateMKLDNNMem(outputs[0], o_desc_, req[0]); - args_[MKLDNN_ARG_FROM] = *i_mem; - args_[MKLDNN_ARG_TO] = *o_mem.second; - MKLDNNStream::Get()->RegisterPrimArgs(*fwd_pd_, args_); + auto o_mem = CreateDNNLMem(outputs[0], o_desc_, req[0]); + args_[DNNL_ARG_FROM] = *i_mem; + args_[DNNL_ARG_TO] = *o_mem.second; + DNNLStream::Get()->RegisterPrimArgs(*fwd_pd_, args_); CommitOutput(outputs[0], o_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } } -static void SgMKLDNNQuantizeForward(const OpStatePtr& state_ptr, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - SgMKLDNNQuantizeOperator& op = state_ptr.get_state(); +static void SgDNNLQuantizeForward(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SgDNNLQuantizeOperator& op = state_ptr.get_state(); op.Forward(ctx, inputs, req, outputs); } @@ -178,4 +178,4 @@ static void SgMKLDNNQuantizeForward(const OpStatePtr& state_ptr, } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_V2_INL_H_ +#endif // MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_QUANTIZE_V2_INL_H_ diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_act.cc b/src/operator/quantization/dnnl/dnnl_quantized_act.cc similarity index 66% rename from src/operator/quantization/mkldnn/mkldnn_quantized_act.cc rename to src/operator/quantization/dnnl/dnnl_quantized_act.cc index 
6d366584c85c..0aea4b471ccc 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_act.cc +++ b/src/operator/quantization/dnnl/dnnl_quantized_act.cc @@ -17,35 +17,35 @@ * under the License. */ /*! - * \file mkldnn_quantized_act.cc - * \brief MKLDNN(Quantized) Activation operator based on subgraph + * \file dnnl_quantized_act.cc + * \brief DNNL(Quantized) Activation operator based on subgraph * /author Zhiyuan Huang */ #if MXNET_USE_ONEDNN == 1 -#include "../../nn/mkldnn/mkldnn_ops-inl.h" +#include "../../nn/dnnl/dnnl_ops-inl.h" #include "../quantization_utils.h" namespace mxnet { namespace op { -static void MKLDNNQuantizedActForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { +static void DNNLQuantizedActForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { CHECK(in_data[0].dtype() == mshadow::kUint8 || in_data[0].dtype() == mshadow::kInt8) << "_contrib_quantized_act op only supports uint8 and int8 as input " "type"; - MKLDNNRun(MKLDNNActivationForward, attrs, ctx, in_data[0], req[0], out_data[0]); + DNNLRun(DNNLActivationForward, attrs, ctx, in_data[0], req[0], out_data[0]); out_data[1].data().dptr()[0] = in_data[1].data().dptr()[0]; out_data[2].data().dptr()[0] = in_data[2].data().dptr()[0]; } NNVM_REGISTER_OP(_contrib_quantized_act) - .set_attr("TIsMKLDNN", true) - .set_attr("FComputeEx", MKLDNNQuantizedActForward); + .set_attr("TIsDNNL", true) + .set_attr("FComputeEx", DNNLQuantizedActForward); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc b/src/operator/quantization/dnnl/dnnl_quantized_batch_norm.cc similarity index 71% rename from src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc rename to src/operator/quantization/dnnl/dnnl_quantized_batch_norm.cc index e3aeb22d5025..f8c79482cc9c 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc +++ b/src/operator/quantization/dnnl/dnnl_quantized_batch_norm.cc @@ -18,49 +18,49 @@ */ /*! 
- * \file mkldnn_quantized_batch_norm.cc + * \file dnnl_quantized_batch_norm.cc * \brief * \author Yixin Bao */ #if MXNET_USE_ONEDNN == 1 -#include "../../nn/mkldnn/mkldnn_batch_norm-inl.h" +#include "../../nn/dnnl/dnnl_batch_norm-inl.h" #include "../quantization_utils.h" namespace mxnet { namespace op { -static void MKLDNNQuantizedBatchNormForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& outputs) { +static void DNNLQuantizedBatchNormForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(in_data.size(), 7U); CHECK_EQ(outputs.size(), 3U); TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); const BatchNormParam& param = nnvm::get(attrs.parsed); const NDArray& data = in_data[quantized_batchnorm::kData]; - auto data_mem = data.GetMKLDNNData(); + auto data_mem = data.GetDNNLData(); // reorder if data type = uint8 if (in_data[quantized_batchnorm::kData].dtype() == mshadow::kUint8) { auto u8_md = data_mem->get_desc(); auto s8_md = u8_md; - s8_md.data.data_type = static_cast(mkldnn::memory::data_type::s8); + s8_md.data.data_type = static_cast(dnnl::memory::data_type::s8); auto data_reorder_mem = TmpMemMgr::Get()->Alloc(s8_md); std::vector reorder_scale; reorder_scale = {static_cast(kInt8Range) / kUint8Range}; - mkldnn::primitive_attr reorder_attr; + dnnl::primitive_attr reorder_attr; reorder_attr.set_output_scales(0, reorder_scale); - mkldnn::engine cpu_engine = CpuEngine::Get()->get_engine(); + dnnl::engine cpu_engine = CpuEngine::Get()->get_engine(); const auto reorder_pd = - mkldnn::reorder::primitive_desc(cpu_engine, u8_md, cpu_engine, s8_md, reorder_attr); - mkldnn_args_map_t reorder_args; - reorder_args[MKLDNN_ARG_SRC] = *data_mem; - reorder_args[MKLDNN_ARG_DST] = *data_reorder_mem; - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(reorder_pd), reorder_args); + dnnl::reorder::primitive_desc(cpu_engine, u8_md, cpu_engine, s8_md, reorder_attr); + dnnl_args_map_t reorder_args; + reorder_args[DNNL_ARG_SRC] = *data_mem; + reorder_args[DNNL_ARG_DST] = *data_reorder_mem; + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(reorder_pd), reorder_args); data_mem = data_reorder_mem; } const size_t channelAxis = static_cast( @@ -81,10 +81,10 @@ static void MKLDNNQuantizedBatchNormForward(const nnvm::NodeAttrs& attrs, } const float max_abs_output = std::max(std::abs(*min_output_ptr), std::abs(*max_output_ptr)); - mkldnn::normalization_flags flags = - mkldnn::normalization_flags::use_global_stats | mkldnn::normalization_flags::use_scale_shift; - auto& fwd = GetBNForward(param, ctx, data_mem, flags); - const mkldnn::memory& weight_mem = fwd.GetWeight(); + dnnl::normalization_flags flags = + dnnl::normalization_flags::use_global_stats | dnnl::normalization_flags::use_scale_shift; + auto& fwd = GetBNForward(param, ctx, data_mem, flags); + const dnnl::memory& weight_mem = fwd.GetWeight(); CHECK_EQ(weight_mem.get_desc().get_size(), channel_count * sizeof(float) * 2); float* weight_buf = reinterpret_cast(weight_mem.get_data_handle()); @@ -97,8 +97,8 @@ static void MKLDNNQuantizedBatchNormForward(const nnvm::NodeAttrs& attrs, float* moving_var_ptr = moving_var.data().dptr(); // rescale gamma and beta, to make mean=0 and var=1 - auto rescaled_mean_mem = TmpMemMgr::Get()->Alloc(moving_mean.GetMKLDNNData()->get_desc()); - auto rescaled_var_mem = TmpMemMgr::Get()->Alloc(moving_var.GetMKLDNNData()->get_desc()); 
+ auto rescaled_mean_mem = TmpMemMgr::Get()->Alloc(moving_mean.GetDNNLData()->get_desc()); + auto rescaled_var_mem = TmpMemMgr::Get()->Alloc(moving_var.GetDNNLData()->get_desc()); float* rescaled_mean_ptr = reinterpret_cast(rescaled_mean_mem->get_data_handle()); float* rescaled_var_ptr = reinterpret_cast(rescaled_var_mem->get_data_handle()); @@ -114,16 +114,16 @@ static void MKLDNNQuantizedBatchNormForward(const nnvm::NodeAttrs& attrs, } const NDArray& out = outputs[batchnorm::kOut]; - auto out_mem = const_cast(out).CreateMKLDNNData(fwd.GetPd().dst_desc()); - mkldnn_args_map_t net_args; - net_args[MKLDNN_ARG_SRC] = *data_mem; - net_args[MKLDNN_ARG_SCALE_SHIFT] = weight_mem; - net_args[MKLDNN_ARG_DST] = *out_mem; - net_args[MKLDNN_ARG_MEAN] = *rescaled_mean_mem; - net_args[MKLDNN_ARG_VARIANCE] = *rescaled_var_mem; - - MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); - MKLDNNStream::Get()->Submit(); + auto out_mem = const_cast(out).CreateDNNLData(fwd.GetPd().dst_desc()); + dnnl_args_map_t net_args; + net_args[DNNL_ARG_SRC] = *data_mem; + net_args[DNNL_ARG_SCALE_SHIFT] = weight_mem; + net_args[DNNL_ARG_DST] = *out_mem; + net_args[DNNL_ARG_MEAN] = *rescaled_mean_mem; + net_args[DNNL_ARG_VARIANCE] = *rescaled_var_mem; + + DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); + DNNLStream::Get()->Submit(); } inline static bool QuantizedBatchNormStorageType(const nnvm::NodeAttrs& attrs, @@ -133,19 +133,19 @@ inline static bool QuantizedBatchNormStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { bool dispatched = false; if (!dispatched) { - dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + dispatched = DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } return dispatched; } NNVM_REGISTER_OP(_contrib_quantized_batch_norm) .set_attr("FInferStorageType", QuantizedBatchNormStorageType) - .set_attr("FComputeEx", MKLDNNQuantizedBatchNormForward) + .set_attr("FComputeEx", DNNLQuantizedBatchNormForward) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) - .set_attr("TIsMKLDNN", true); + .set_attr("TIsDNNL", true); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc b/src/operator/quantization/dnnl/dnnl_quantized_concat.cc similarity index 69% rename from src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc rename to src/operator/quantization/dnnl/dnnl_quantized_concat.cc index 5e6c9abe1533..06582cb8c3e8 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc +++ b/src/operator/quantization/dnnl/dnnl_quantized_concat.cc @@ -23,7 +23,7 @@ */ #if MXNET_USE_ONEDNN == 1 -#include "../../nn/mkldnn/mkldnn_concat-inl.h" +#include "../../nn/dnnl/dnnl_concat-inl.h" #include "../quantization_utils.h" namespace mxnet { @@ -38,11 +38,11 @@ static float GetScale(const NDArray& data, float min, float max) { return data_range / MaxAbs(min, max); } -static void MKLDNNQuantizedConcatForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { +static void DNNLQuantizedConcatForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_data.size(), static_cast(param_.num_args * 3)); CHECK_EQ(out_data.size(), 3U); @@ -62,50 +62,50 @@ static 
void MKLDNNQuantizedConcatForward(const nnvm::NodeAttrs& attrs, out_data[quantized_concat_enum::kMin].data().dptr()[0] = output_neg_min; out_data[quantized_concat_enum::kMax].data().dptr()[0] = output_pos_max; auto out_scale = GetScale(out_data[quantized_concat_enum::kOut], output_neg_min, output_pos_max); - std::vector data_md; - std::vector data_mem; - // new_data_mem is for auto-free new created mkldnn memory - std::vector> new_data_mem; + std::vector data_md; + std::vector data_mem; + // new_data_mem is for auto-free new created dnnl memory + std::vector> new_data_mem; const auto out_dtype = out_data[quantized_concat_enum::kOut].dtype(); for (int i = 0; i < param_.num_args; ++i) { auto i_scale = GetScale(in_data[i], data_min[i], data_max[i]); if (i_scale == out_scale) { CHECK(in_data[i].dtype() == out_dtype); - auto mem = in_data[i].GetMKLDNNData(); + auto mem = in_data[i].GetDNNLData(); data_mem.push_back(mem); data_md.push_back(mem->get_desc()); } else { - auto mem = in_data[i].GetMKLDNNData(); + auto mem = in_data[i].GetDNNLData(); auto mem_desc = mem->get_desc(); if (in_data[i].dtype() != out_dtype) { - mem_desc.data.data_type = static_cast(get_mkldnn_type(out_dtype)); + mem_desc.data.data_type = static_cast(get_dnnl_type(out_dtype)); } const auto rescaled_mem = - std::make_shared(mem_desc, CpuEngine::Get()->get_engine()); + std::make_shared(mem_desc, CpuEngine::Get()->get_engine()); new_data_mem.push_back(rescaled_mem); std::vector reorder_scale = {out_scale / i_scale}; - mkldnn::primitive_attr reorder_attr; + dnnl::primitive_attr reorder_attr; reorder_attr.set_output_scales(0, reorder_scale); - const auto reorder_pd = mkldnn::reorder::primitive_desc(*mem, *rescaled_mem, reorder_attr); - mkldnn_args_map_t reorder_args; - reorder_args[MKLDNN_ARG_SRC] = *mem; - reorder_args[MKLDNN_ARG_DST] = *rescaled_mem; - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(reorder_pd), reorder_args); + const auto reorder_pd = dnnl::reorder::primitive_desc(*mem, *rescaled_mem, reorder_attr); + dnnl_args_map_t reorder_args; + reorder_args[DNNL_ARG_SRC] = *mem; + reorder_args[DNNL_ARG_DST] = *rescaled_mem; + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(reorder_pd), reorder_args); data_mem.push_back(rescaled_mem.get()); data_md.push_back(mem_desc); } } - MKLDNNConcatFwd& fwd = GetConcatForward(param_.dim, in_data, data_md); - mxnet::mkldnn_output_t out_mem = CreateMKLDNNMem( + DNNLConcatFwd& fwd = GetConcatForward(param_.dim, in_data, data_md); + mxnet::dnnl_output_t out_mem = CreateDNNLMem( out_data[quantized_concat_enum::kOut], fwd.fwd_pd.dst_desc(), req[concat_enum::kOut]); - mkldnn_args_map_t net_args; - net_args[MKLDNN_ARG_DST] = *out_mem.second; + dnnl_args_map_t net_args; + net_args[DNNL_ARG_DST] = *out_mem.second; for (int i = 0; i < param_.num_args; i++) { - net_args[MKLDNN_ARG_MULTIPLE_SRC + i] = *data_mem[i]; + net_args[DNNL_ARG_MULTIPLE_SRC + i] = *data_mem[i]; } - MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); + DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); CommitOutput(out_data[concat_enum::kOut], out_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } inline static bool ConcatStorageType(const nnvm::NodeAttrs& attrs, @@ -117,17 +117,17 @@ inline static bool ConcatStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), static_cast(param_.num_args * 3)); CHECK_EQ(out_attrs->size(), 3U); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, 
true, dispatch_mode, in_attrs, out_attrs); } NNVM_REGISTER_OP(_contrib_quantized_concat) .set_attr("FInferStorageType", ConcatStorageType) - .set_attr("FComputeEx", MKLDNNQuantizedConcatForward) + .set_attr("FComputeEx", DNNLQuantizedConcatForward) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) - .set_attr("TIsMKLDNN", true); + .set_attr("TIsDNNL", true); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc b/src/operator/quantization/dnnl/dnnl_quantized_conv.cc similarity index 64% rename from src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc rename to src/operator/quantization/dnnl/dnnl_quantized_conv.cc index c2ccbd5fdc7e..934a24ccb6ab 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc +++ b/src/operator/quantization/dnnl/dnnl_quantized_conv.cc @@ -18,7 +18,7 @@ */ /*! - * \file mkldnn_quantized_conv.cc + * \file dnnl_quantized_conv.cc * \brief * \author Wenting Jiang, Xinyu Chen */ @@ -26,56 +26,55 @@ #if MXNET_USE_ONEDNN == 1 #include "../../elemwise_op_common.h" #include "../../nn/convolution-inl.h" -#include "../../nn/mkldnn/mkldnn_base-inl.h" -#include "../../nn/mkldnn/mkldnn_convolution-inl.h" +#include "../../nn/dnnl/dnnl_base-inl.h" +#include "../../nn/dnnl/dnnl_convolution-inl.h" #include "../../tensor/matrix_op-inl.h" #include "../quantization_utils.h" namespace mxnet { namespace op { -static void MKLDNNQuantizedConvForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { +static void DNNLQuantizedConvForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); NDArray weight = in_data[conv::kWeight]; ConvolutionParam param = nnvm::get(attrs.parsed); - MKLDNNConvFullParam full_param; + DNNLConvFullParam full_param; full_param.conv_param = param; - full_param.mkldnn_param.Init(std::unordered_map()); + full_param.dnnl_param.Init(std::unordered_map()); auto& fwd = GetConvFwd(full_param, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]); - auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.GetPd().src_desc()); - const mkldnn::memory* weight_mem; + auto data_mem = in_data[conv::kData].GetDNNLDataReorder(fwd.GetPd().src_desc()); + const dnnl::memory* weight_mem; // For inference, we want to reorder the weight array so we don't need to // reorder data every time. if (weight.IsDefaultData()) { // We also need to modify the layout on the original weight array. // Don't switch below sequence because naive engine will executes // pushAsync synchronously. 
- weight.MKLDNNDataReorderAsync(fwd.GetPd().weights_desc()); + weight.DNNLDataReorderAsync(fwd.GetPd().weights_desc()); weight_mem = GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); } else { - weight_mem = weight.GetMKLDNNData(); + weight_mem = weight.GetDNNLData(); } - auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.GetPd().dst_desc(), req[conv::kOut]); - mkldnn_args_map_t net_args; + auto out_mem = CreateDNNLMem(out_data[conv::kOut], fwd.GetPd().dst_desc(), req[conv::kOut]); + dnnl_args_map_t net_args; if (!param.no_bias) { - const mkldnn::memory* bias_mem = - in_data[conv::kBias].GetMKLDNNDataReorder(fwd.GetPd().bias_desc()); - net_args.insert({MKLDNN_ARG_BIAS, *bias_mem}); + const dnnl::memory* bias_mem = in_data[conv::kBias].GetDNNLDataReorder(fwd.GetPd().bias_desc()); + net_args.insert({DNNL_ARG_BIAS, *bias_mem}); } - net_args.insert({MKLDNN_ARG_SRC, *data_mem}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *weight_mem}); - net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); - MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); + net_args.insert({DNNL_ARG_SRC, *data_mem}); + net_args.insert({DNNL_ARG_WEIGHTS, *weight_mem}); + net_args.insert({DNNL_ARG_DST, *out_mem.second}); + DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); CommitOutput(out_data[conv::kOut], out_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); Stream* s = ctx.get_stream(); const size_t num_inputs = param.no_bias ? 2 : 3; mxnet_op::Kernel::Launch( @@ -90,7 +89,7 @@ static void MKLDNNQuantizedConvForward(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_contrib_quantized_conv) - .set_attr("FComputeEx", MKLDNNQuantizedConvForward); + .set_attr("FComputeEx", DNNLQuantizedConvForward); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc b/src/operator/quantization/dnnl/dnnl_quantized_elemwise_add.cc similarity index 73% rename from src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc rename to src/operator/quantization/dnnl/dnnl_quantized_elemwise_add.cc index 52bc602e4eba..4dbe64836899 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc +++ b/src/operator/quantization/dnnl/dnnl_quantized_elemwise_add.cc @@ -18,13 +18,13 @@ */ /*! 
- * \file mkldnn_quantized_elemwise_add.cc + * \file dnnl_quantized_elemwise_add.cc * \brief */ #if MXNET_USE_ONEDNN == 1 -#include "../../nn/mkldnn/mkldnn_base-inl.h" -#include "../../nn/mkldnn/mkldnn_ops-inl.h" +#include "../../nn/dnnl/dnnl_base-inl.h" +#include "../../nn/dnnl/dnnl_ops-inl.h" #include "../quantization_utils.h" #include "../quantized_elemwise_add-inl.h" @@ -38,39 +38,38 @@ static inline float GetScale(const NDArray& data, float min, float max) { return data_range / MaxAbs(min, max); } -class MKLDNNQuantizedElemwiseAddFwd { +class DNNLQuantizedElemwiseAddFwd { public: - mkldnn::sum::primitive_desc fwd_pd; + dnnl::sum::primitive_desc fwd_pd; - MKLDNNQuantizedElemwiseAddFwd(const mkldnn::memory::desc& output_desc, - const std::vector& scales, - const std::vector& data_md) + DNNLQuantizedElemwiseAddFwd(const dnnl::memory::desc& output_desc, + const std::vector& scales, + const std::vector& data_md) : fwd_pd(output_desc, scales, data_md, CpuEngine::Get()->get_engine()) { - fwd_ = std::make_shared(fwd_pd); + fwd_ = std::make_shared(fwd_pd); data_.resize(data_md.size()); } - const mkldnn::sum& GetFwd() const { + const dnnl::sum& GetFwd() const { return *fwd_; } private: - std::shared_ptr fwd_; - std::vector> data_; - std::shared_ptr out_; + std::shared_ptr fwd_; + std::vector> data_; + std::shared_ptr out_; }; -static MKLDNNQuantizedElemwiseAddFwd& GetQuantizedElemwiseAddForward( - const mkldnn::memory::desc& output_desc, +static DNNLQuantizedElemwiseAddFwd& GetQuantizedElemwiseAddForward( + const dnnl::memory::desc& output_desc, const std::vector& scales, const std::vector& in_data, const std::vector& out_data, - const std::vector& data_md) { + const std::vector& data_md) { #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map fwds; + static thread_local std::unordered_map fwds; #else - static MX_THREAD_LOCAL std::unordered_map - fwds; + static MX_THREAD_LOCAL std::unordered_map fwds; #endif OpSignature key; key.AddSign(in_data); @@ -84,17 +83,17 @@ static MKLDNNQuantizedElemwiseAddFwd& GetQuantizedElemwiseAddForward( auto it = fwds.find(key); if (it == fwds.end()) { - MKLDNNQuantizedElemwiseAddFwd fwd(output_desc, scales, data_md); + DNNLQuantizedElemwiseAddFwd fwd(output_desc, scales, data_md); it = AddToCache(&fwds, key, fwd); } return it->second; } -static void MKLDNNQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { +static void DNNLQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { const QuantizeElemwiseAddParam& params = nnvm::get(attrs.parsed); // A, B, A_min, A_max, B_min, B_max CHECK_EQ(in_data.size(), 6U) << "should be A, B, A_min, A_max, B_min, B_max"; @@ -108,8 +107,8 @@ static void MKLDNNQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, const float dataA_absmax = MaxAbs(dataA_min, dataA_max); const float dataB_absmax = MaxAbs(dataB_min, dataB_max); - auto dataA_mem = in_data[quantized_elemwise_add_enum::kDataA].GetMKLDNNData(); - auto dataB_mem = in_data[quantized_elemwise_add_enum::kDataB].GetMKLDNNData(); + auto dataA_mem = in_data[quantized_elemwise_add_enum::kDataA].GetDNNLData(); + auto dataB_mem = in_data[quantized_elemwise_add_enum::kDataB].GetDNNLData(); const bool is_dataA_int8 = (in_data[quantized_elemwise_add_enum::kDataA].dtype() == mshadow::kInt8); const float dataA_range = is_dataA_int8 ? 
kInt8Range : kUint8Range; @@ -118,22 +117,22 @@ static void MKLDNNQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, GetScale(in_data[quantized_elemwise_add_enum::kDataA], dataA_min, dataA_max); const float B_scale = GetScale(in_data[quantized_elemwise_add_enum::kDataB], dataB_min, dataB_max); - // rescaled_mem is for reorder mkldnn memory - mkldnn::memory* rescaled_mem; + // rescaled_mem is for reorder dnnl memory + dnnl::memory* rescaled_mem; // output default set as int32 double output_data_range = kInt32Range; - auto output_data_type = mkldnn::memory::data_type::s32; + auto output_data_type = dnnl::memory::data_type::s32; // dataA && dataB are uint8 if (out_data[quantized_elemwise_add_enum::kOut].dtype() == mshadow::kInt8) { output_data_range = kInt8Range; - output_data_type = mkldnn::memory::data_type::s8; + output_data_type = dnnl::memory::data_type::s8; } else if (out_data[quantized_elemwise_add_enum::kOut].dtype() == mshadow::kUint8) { output_data_range = kUint8Range; - output_data_type = mkldnn::memory::data_type::u8; + output_data_type = dnnl::memory::data_type::u8; } else { output_data_range = kInt32Range; - output_data_type = mkldnn::memory::data_type::s32; + output_data_type = dnnl::memory::data_type::s32; } float output_min = 0; @@ -178,13 +177,13 @@ static void MKLDNNQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, } } std::vector reorder_scale = {u8_reorder_scale}; - mkldnn::primitive_attr reorder_attr; + dnnl::primitive_attr reorder_attr; reorder_attr.set_output_scales(0, reorder_scale); auto u8_mem = (is_dataA_int8 == true) ? dataB_mem : dataA_mem; const auto reorder_pd = - mkldnn::reorder::primitive_desc(engine, u8_mem->get_desc(), engine, s8_desc, reorder_attr); - mkldnn_args_map_t args({{MKLDNN_ARG_FROM, *u8_mem}, {MKLDNN_ARG_TO, *rescaled_mem}}); - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(reorder_pd), args); + dnnl::reorder::primitive_desc(engine, u8_mem->get_desc(), engine, s8_desc, reorder_attr); + dnnl_args_map_t args({{DNNL_ARG_FROM, *u8_mem}, {DNNL_ARG_TO, *rescaled_mem}}); + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(reorder_pd), args); if (is_dataA_int8 == true) { dataB_mem = rescaled_mem; @@ -202,21 +201,20 @@ static void MKLDNNQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, } } - std::vector in_desc; + std::vector in_desc; in_desc.push_back(dataA_mem->get_desc()); in_desc.push_back(dataB_mem->get_desc()); const auto in_shape = in_data[quantized_elemwise_add_enum::kDataA].shape(); - mkldnn::memory::dims i_dims(in_shape.begin(), in_shape.end()); - auto output_desc = - mkldnn::memory::desc(i_dims, output_data_type, mkldnn::memory::format_tag::any); - MKLDNNQuantizedElemwiseAddFwd& fwd = + dnnl::memory::dims i_dims(in_shape.begin(), in_shape.end()); + auto output_desc = dnnl::memory::desc(i_dims, output_data_type, dnnl::memory::format_tag::any); + DNNLQuantizedElemwiseAddFwd& fwd = GetQuantizedElemwiseAddForward(output_desc, scales, in_data, out_data, in_desc); - auto mem = CreateMKLDNNMem( + auto mem = CreateDNNLMem( out_data[quantized_elemwise_add_enum::kOut], fwd.fwd_pd.dst_desc(), req[0], &in_data[0]); - mkldnn_args_map_t args({{MKLDNN_ARG_MULTIPLE_SRC, *dataA_mem}, - {MKLDNN_ARG_MULTIPLE_SRC + 1, *dataB_mem}, - {MKLDNN_ARG_DST, *mem.second}}); - MKLDNNStream* stream = MKLDNNStream::Get(); + dnnl_args_map_t args({{DNNL_ARG_MULTIPLE_SRC, *dataA_mem}, + {DNNL_ARG_MULTIPLE_SRC + 1, *dataB_mem}, + {DNNL_ARG_DST, *mem.second}}); + DNNLStream* stream = DNNLStream::Get(); stream->RegisterPrimArgs(fwd.GetFwd(), args); 
CommitOutput(out_data[quantized_elemwise_add_enum::kOut], mem); stream->Submit(); @@ -235,13 +233,13 @@ inline static bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, // Check num of outputs: C, C_min, C_max CHECK_EQ(out_attrs->size(), 3U); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } NNVM_REGISTER_OP(_contrib_quantized_elemwise_add) .set_attr("FInferStorageType", ElemwiseAddStorageType) - .set_attr("FComputeEx", MKLDNNQuantizedElemwiseAddForward) - .set_attr("TIsMKLDNN", true) + .set_attr("FComputeEx", DNNLQuantizedElemwiseAddForward) + .set_attr("TIsDNNL", true) .set_attr_parser(ParamParser) .add_arguments(QuantizeElemwiseAddParam::__FIELDS__()); } // namespace op diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc b/src/operator/quantization/dnnl/dnnl_quantized_flatten.cc similarity index 73% rename from src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc rename to src/operator/quantization/dnnl/dnnl_quantized_flatten.cc index 60df0bc20975..12eb01f39183 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc +++ b/src/operator/quantization/dnnl/dnnl_quantized_flatten.cc @@ -23,7 +23,7 @@ */ #if MXNET_USE_ONEDNN == 1 -#include "../../nn/mkldnn/mkldnn_ops-inl.h" +#include "../../nn/dnnl/dnnl_ops-inl.h" #include "../quantization_utils.h" namespace mxnet { @@ -36,16 +36,16 @@ inline static bool FlattenStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 3U); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } -static void MKLDNNQuantizedFlattenForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (SupportMKLDNNReshape(inputs[0], outputs[0])) { - MKLDNNRun(MKLDNNReshapeForward, attrs, ctx, inputs[0], req[0], outputs[0]); +static void DNNLQuantizedFlattenForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (SupportDNNLReshape(inputs[0], outputs[0])) { + DNNLRun(DNNLReshapeForward, attrs, ctx, inputs[0], req[0], outputs[0]); } else { FallBackCompute(UnaryOp::IdentityCompute, attrs, ctx, inputs, req, outputs); } @@ -55,12 +55,12 @@ static void MKLDNNQuantizedFlattenForward(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(_contrib_quantized_flatten) .set_attr("FInferStorageType", FlattenStorageType) - .set_attr("FComputeEx", MKLDNNQuantizedFlattenForward) + .set_attr("FComputeEx", DNNLQuantizedFlattenForward) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) - .set_attr("TIsMKLDNN", true); + .set_attr("TIsDNNL", true); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc b/src/operator/quantization/dnnl/dnnl_quantized_fully_connected.cc similarity index 73% rename from src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc rename to src/operator/quantization/dnnl/dnnl_quantized_fully_connected.cc index 81a2f9be90d9..5e70c4ea457b 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc +++ b/src/operator/quantization/dnnl/dnnl_quantized_fully_connected.cc @@ 
-18,23 +18,23 @@ */ /*! - * \file mkldnn_quantized_fully_connected.cc - * \brief MKLDNN Quantized FullyConnected operator + * \file dnnl_quantized_fully_connected.cc + * \brief DNNL Quantized FullyConnected operator * \author Ciyong Chen */ #if MXNET_USE_ONEDNN == 1 -#include "../../nn/mkldnn/mkldnn_fully_connected-inl.h" +#include "../../nn/dnnl/dnnl_fully_connected-inl.h" #include "../quantization_utils.h" namespace mxnet { namespace op { -void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { +void DNNLQuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); FullyConnectedParam param = nnvm::get(attrs.parsed); const size_t num_inputs = param.no_bias ? 2 : 3; @@ -84,42 +84,42 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs, s, 1, min_output_ptr, max_output_ptr, &min_data, &max_data, &min_weight, &max_weight); } - bool is_train = false; - mkldnn::memory::desc out_md = GetMemDesc(out_data[fullc::kOut]); - MKLDNNFCFlattenData(param, out_data[fullc::kOut], &data, &out_md); + bool is_train = false; + dnnl::memory::desc out_md = GetMemDesc(out_data[fullc::kOut]); + DNNLFCFlattenData(param, out_data[fullc::kOut], &data, &out_md); auto& fwd = GetFCFwd(param, is_train, data, weight, param.no_bias ? nullptr : &quantized_bias, out_md); - auto data_mem = in_data[fullc::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_desc()); - const mkldnn::memory* weight_mem = nullptr; + auto data_mem = in_data[fullc::kData].GetDNNLDataReorder(fwd.fwd_pd.src_desc()); + const dnnl::memory* weight_mem = nullptr; if (weight.IsDefaultData()) { // We also need to modify the layout on the original weight array. // Don't switch below sequence because naive engine will executes // pushAsync synchronously. 
- weight.MKLDNNDataReorderAsync(fwd.fwd_pd.weights_desc()); + weight.DNNLDataReorderAsync(fwd.fwd_pd.weights_desc()); weight_mem = GetWeights(weight, fwd.fwd_pd.weights_desc(), 1); } else { - weight_mem = weight.GetMKLDNNData(); + weight_mem = weight.GetDNNLData(); CHECK(weight_mem->get_desc() == fwd.fwd_pd.weights_desc()); } - auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut], fwd.fwd_pd.dst_desc(), req[fullc::kOut]); + auto out_mem = CreateDNNLMem(out_data[fullc::kOut], fwd.fwd_pd.dst_desc(), req[fullc::kOut]); - mkldnn_args_map_t args = { - {MKLDNN_ARG_SRC, *data_mem}, - {MKLDNN_ARG_WEIGHTS, *weight_mem}, - {MKLDNN_ARG_DST, *out_mem.second}, + dnnl_args_map_t args = { + {DNNL_ARG_SRC, *data_mem}, + {DNNL_ARG_WEIGHTS, *weight_mem}, + {DNNL_ARG_DST, *out_mem.second}, }; - const mkldnn::memory* bias_mem = nullptr; + const dnnl::memory* bias_mem = nullptr; if (!param.no_bias) { - bias_mem = quantized_bias.GetMKLDNNDataReorder(fwd.fwd_pd.bias_desc()); - args[MKLDNN_ARG_BIAS] = *bias_mem; + bias_mem = quantized_bias.GetDNNLDataReorder(fwd.fwd_pd.bias_desc()); + args[DNNL_ARG_BIAS] = *bias_mem; } - MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), args); + DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), args); CommitOutput(out_data[fullc::kOut], out_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } } // namespace op diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_ops-inl.h b/src/operator/quantization/dnnl/dnnl_quantized_ops-inl.h similarity index 59% rename from src/operator/quantization/mkldnn/mkldnn_quantized_ops-inl.h rename to src/operator/quantization/dnnl/dnnl_quantized_ops-inl.h index 8ed778849182..515a7ec88061 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_ops-inl.h +++ b/src/operator/quantization/dnnl/dnnl_quantized_ops-inl.h @@ -18,13 +18,13 @@ */ /*! - * \file mkldnn_quantized_ops-inl.h - * \brief Common functions used by MKLDNN Quantized FullyConnected operator + * \file dnnl_quantized_ops-inl.h + * \brief Common functions used by DNNL Quantized FullyConnected operator * \author Ciyong Chen */ -#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZED_OPS_INL_H_ -#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZED_OPS_INL_H_ +#ifndef MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_QUANTIZED_OPS_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_QUANTIZED_OPS_INL_H_ #if MXNET_USE_ONEDNN == 1 @@ -35,14 +35,14 @@ namespace mxnet { namespace op { -void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data); +void DNNLQuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data); } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZED_OPS_INL_H_ +#endif // MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_QUANTIZED_OPS_INL_H_ diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_pooling.cc b/src/operator/quantization/dnnl/dnnl_quantized_pooling.cc similarity index 64% rename from src/operator/quantization/mkldnn/mkldnn_quantized_pooling.cc rename to src/operator/quantization/dnnl/dnnl_quantized_pooling.cc index 337a71fc73ad..69476e23af15 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_pooling.cc +++ b/src/operator/quantization/dnnl/dnnl_quantized_pooling.cc @@ -18,34 +18,34 @@ */ /*! 
- * \file mkldnn_quantized_pooling.cc + * \file dnnl_quantized_pooling.cc * \brief * \author Tao Lv, Xinyu Chen */ #if MXNET_USE_ONEDNN == 1 -#include "../../nn/mkldnn/mkldnn_pooling-inl.h" +#include "../../nn/dnnl/dnnl_pooling-inl.h" namespace mxnet { namespace op { -static void MKLDNNQuantizedPoolingForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { +static void DNNLQuantizedPoolingForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { CHECK(in_data[0].dtype() == mshadow::kUint8 || in_data[0].dtype() == mshadow::kInt8) - << "mkldnn_quantized_pooling op only supports uint8 and int8 as input type"; + << "dnnl_quantized_pooling op only supports uint8 and int8 as input type"; const PoolingParam& param = nnvm::get(attrs.parsed); - MKLDNNPoolingCompute(ctx, param, in_data[0], req[0], out_data[0], nullptr); + DNNLPoolingCompute(ctx, param, in_data[0], req[0], out_data[0], nullptr); out_data[1].data().dptr()[0] = in_data[1].data().dptr()[0]; out_data[2].data().dptr()[0] = in_data[2].data().dptr()[0]; } NNVM_REGISTER_OP(_contrib_quantized_pooling) - .set_attr("TIsMKLDNN", true) - .set_attr("FComputeEx", MKLDNNQuantizedPoolingForward); + .set_attr("TIsDNNL", true) + .set_attr("FComputeEx", DNNLQuantizedPoolingForward); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h b/src/operator/quantization/dnnl/dnnl_requantize-inl.h similarity index 70% rename from src/operator/quantization/mkldnn/mkldnn_requantize-inl.h rename to src/operator/quantization/dnnl/dnnl_requantize-inl.h index ae86bbba43d3..5eea9dcf4e09 100644 --- a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h +++ b/src/operator/quantization/dnnl/dnnl_requantize-inl.h @@ -17,31 +17,31 @@ * under the License. 
*/ -/* \file mkldnn_requantize-inl.h +/* \file dnnl_requantize-inl.h * \brief * \author Jin Huang, Xinyu Chen */ -#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_REQUANTIZE_INL_H_ -#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_REQUANTIZE_INL_H_ +#ifndef MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_REQUANTIZE_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_REQUANTIZE_INL_H_ #if MXNET_USE_ONEDNN == 1 #include #include #include -#include "../../nn/mkldnn/mkldnn_base-inl.h" +#include "../../nn/dnnl/dnnl_base-inl.h" #include "../requantize-inl.h" namespace mxnet { namespace op { template -static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs, - const float real_range) { +static void DNNLRequantizeForwardKer(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + const float real_range) { using namespace mshadow; using namespace mxnet_op; using red::limits::MaxValue; @@ -70,33 +70,33 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs, float second_scale = second_quantized_range / second_real_range; float scale = first_scale * second_scale; - mkldnn::primitive_attr attr; + dnnl::primitive_attr attr; const int mask = 0; std::vector scales = {scale}; attr.set_output_scales(mask, scales); - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + dnnl::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); NDArray in_buffer = inputs[0]; - if (inputs[0].IsView() && inputs[0].IsMKLDNNData()) + if (inputs[0].IsView() && inputs[0].IsDNNLData()) in_buffer = inputs[0].Reorder2Default(); - auto i_mem = in_buffer.GetMKLDNNData(); + auto i_mem = in_buffer.GetDNNLData(); auto i_desc = i_mem->get_desc(); auto o_desc = i_desc; - o_desc.data.data_type = get_mkldnn_type_t(); - auto reorder_pd = mkldnn::reorder::primitive_desc(cpu_engine, i_desc, cpu_engine, o_desc, attr); - auto o_mem = CreateMKLDNNMem(outputs[0], o_desc, req[0]); - MKLDNNStream::Get()->RegisterPrimArgs( - mkldnn::reorder(reorder_pd), {{MKLDNN_ARG_FROM, *i_mem}, {MKLDNN_ARG_TO, *o_mem.second}}); + o_desc.data.data_type = get_dnnl_type_t(); + auto reorder_pd = dnnl::reorder::primitive_desc(cpu_engine, i_desc, cpu_engine, o_desc, attr); + auto o_mem = CreateDNNLMem(outputs[0], o_desc, req[0]); + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(reorder_pd), + {{DNNL_ARG_FROM, *i_mem}, {DNNL_ARG_TO, *o_mem.second}}); CommitOutput(outputs[0], o_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } -static void MKLDNNRequantizeForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +static void DNNLRequantizeForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { using namespace mshadow; using namespace mxnet_op; using red::limits::MaxValue; @@ -138,11 +138,11 @@ static void MKLDNNRequantizeForward(const nnvm::NodeAttrs& attrs, } auto out_type = GetQuantizeOutputType(param); if (out_type == mshadow::kUint8) { - MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range); + DNNLRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range); } else if (out_type == mshadow::kInt8) { - MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range); + DNNLRequantizeForwardKer(attrs, 
ctx, inputs, req, outputs, real_range); } else { - LOG(FATAL) << "mkldnn requantize op only supports int8 and uint8 as output type"; + LOG(FATAL) << "dnnl requantize op only supports int8 and uint8 as output type"; } } @@ -150,4 +150,4 @@ static void MKLDNNRequantizeForward(const nnvm::NodeAttrs& attrs, } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_REQUANTIZE_INL_H_ +#endif // MXNET_OPERATOR_QUANTIZATION_DNNL_DNNL_REQUANTIZE_INL_H_ diff --git a/src/operator/quantization/quantize.cc b/src/operator/quantization/quantize.cc index f9684130a533..2b42abdca809 100644 --- a/src/operator/quantization/quantize.cc +++ b/src/operator/quantization/quantize.cc @@ -23,7 +23,7 @@ */ #include "./quantize-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_quantize-inl.h" +#include "./dnnl/dnnl_quantize-inl.h" #endif namespace mxnet { @@ -87,8 +87,8 @@ where // will be reverted after the improvement of CachedOP is done. .set_attr("FGradient", MakeZeroGradNodes) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) - .set_attr("FComputeEx", MKLDNNQuantizeCompute) + .set_attr("TIsDNNL", true) + .set_attr("FComputeEx", DNNLQuantizeCompute) #endif .set_attr("FCompute", QuantizeCompute) .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc index 99c4279b1e1a..3835f1a3a9c9 100644 --- a/src/operator/quantization/quantize_graph_pass.cc +++ b/src/operator/quantization/quantize_graph_pass.cc @@ -187,7 +187,7 @@ inline QuantizeType NeedQuantize(ObjectPtr node, need = false; if (need) { if ((quantize_granularity == "channel-wise") && - (node->op() == Op::Get("_sg_mkldnn_fully_connected"))) { + (node->op() == Op::Get("_sg_onednn_fully_connected"))) { quantized_node->attrs.dict["channel_wise_quantize"] = "True"; } quantized_node_map->insert(std::make_pair(node, quantized_node)); diff --git a/src/operator/quantization/quantize_v2.cc b/src/operator/quantization/quantize_v2.cc index 3a94b81c6815..e08bd0d5f76d 100644 --- a/src/operator/quantization/quantize_v2.cc +++ b/src/operator/quantization/quantize_v2.cc @@ -24,7 +24,7 @@ #include "./quantize_v2-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_quantize_v2-inl.h" +#include "./dnnl/dnnl_quantize_v2-inl.h" #endif namespace mxnet { @@ -57,7 +57,7 @@ static OpStatePtr CreateQuantizeV2State(const nnvm::NodeAttrs& attrs, state = OpStatePtr::Create>(attrs); } else { #if MXNET_USE_ONEDNN == 1 - state = OpStatePtr::Create(attrs); + state = OpStatePtr::Create(attrs); #else state = OpStatePtr::Create>(attrs); #endif @@ -108,8 +108,8 @@ If min_calib_range isn't presented, the output type will be int8. 
.set_attr("FGradient", MakeZeroGradNodes) .set_attr("FCreateOpState", CreateQuantizeV2State) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) - .set_attr("FStatefulComputeEx", SgMKLDNNQuantizeForward) + .set_attr("TIsDNNL", true) + .set_attr("FStatefulComputeEx", SgDNNLQuantizeForward) #endif .set_attr("FStatefulCompute", QuantizeV2Forward) .set_attr("FInplaceOption", diff --git a/src/operator/quantization/quantized_batch_norm.cc b/src/operator/quantization/quantized_batch_norm.cc index 71819cc38522..9b1fd2adef43 100644 --- a/src/operator/quantization/quantized_batch_norm.cc +++ b/src/operator/quantization/quantized_batch_norm.cc @@ -25,7 +25,7 @@ #include #include "../nn/batch_norm-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "../nn/mkldnn/mkldnn_batch_norm-inl.h" +#include "../nn/dnnl/dnnl_batch_norm-inl.h" #endif namespace mxnet { @@ -70,7 +70,7 @@ bool QuantizedBatchNormType(const nnvm::NodeAttrs& attrs, #if MXNET_USE_ONEDNN == 1 CHECK(in_type->at(0) == mshadow::kInt8 || in_type->at(0) == mshadow::kUint8) - << "QuantizedBatchNorm with MKLDNN backend only supports int8/uint8 input, while " + << "QuantizedBatchNorm with DNNL backend only supports int8/uint8 input, while " << in_type->at(0) << " is given."; #else TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kInt8); diff --git a/src/operator/quantization/quantized_conv.cc b/src/operator/quantization/quantized_conv.cc index 04e23e074b3f..cd93cebf4ab3 100644 --- a/src/operator/quantization/quantized_conv.cc +++ b/src/operator/quantization/quantized_conv.cc @@ -24,7 +24,7 @@ */ #include "../nn/convolution-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "../nn/mkldnn/mkldnn_ops-inl.h" +#include "../nn/dnnl/dnnl_ops-inl.h" #endif namespace mxnet { @@ -41,7 +41,7 @@ bool QuantizedConvShape(const nnvm::NodeAttrs& attrs, if (param.layout.has_value()) { #if MXNET_USE_ONEDNN == 1 CHECK(param.layout.value() == mshadow::kNCHW || param.layout.value() == mshadow::kNCDHW) - << "mkldnn quantized_conv now supports NCHW or NCDHW for now"; + << "dnnl quantized_conv now supports NCHW or NCDHW for now"; #else CHECK_EQ(param.layout.value(), mshadow::kNCHW) << "quantized_conv only supports NCHW for now"; #endif @@ -55,9 +55,9 @@ bool QuantizedConvShape(const nnvm::NodeAttrs& attrs, #if MXNET_USE_ONEDNN == 1 CHECK(kernel_ndims == 2U || kernel_ndims == 3U) - << "mkldnn quantized_conv only supports 2d or 3d kernel for now"; + << "dnnl quantized_conv only supports 2d or 3d kernel for now"; CHECK(data_ndims == 4U || data_ndims == 5U) - << "mkldnn quantized_conv only supports 4d or 5d layout for now"; + << "dnnl quantized_conv only supports 4d or 5d layout for now"; #else CHECK_EQ(kernel_ndims, 2U) << "quantized_conv only supports 2D convolution for now"; CHECK(param.dilate.ndim() == 0U || param.dilate.Size() == 1U) diff --git a/src/operator/quantization/quantized_elemwise_add.cc b/src/operator/quantization/quantized_elemwise_add.cc index a531b3917838..b314e9e0f859 100644 --- a/src/operator/quantization/quantized_elemwise_add.cc +++ b/src/operator/quantization/quantized_elemwise_add.cc @@ -84,8 +84,8 @@ void QuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, const std::vector& in_data, const std::vector& req, const std::vector& out_data) { - LOG(FATAL) << "Not supported for MXNet built without MKLDNN. " - "Please install MKLDNN enabled MXNet."; + LOG(FATAL) << "Not supported for MXNet built without DNNL. 
" + "Please install DNNL enabled MXNet."; } NNVM_REGISTER_OP(_contrib_quantized_elemwise_add) diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc index 446ed21f6c5c..930816abefee 100644 --- a/src/operator/quantization/quantized_fully_connected.cc +++ b/src/operator/quantization/quantized_fully_connected.cc @@ -26,8 +26,8 @@ #include "quantization_utils.h" #include "../nn/fully_connected-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "../nn/mkldnn/mkldnn_fully_connected-inl.h" -#include "mkldnn/mkldnn_quantized_ops-inl.h" +#include "../nn/dnnl/dnnl_fully_connected-inl.h" +#include "dnnl/dnnl_quantized_ops-inl.h" #endif namespace mxnet { @@ -125,7 +125,7 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 3U); #if MXNET_USE_ONEDNN == 1 - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); #else *dispatch_mode = DispatchMode::kFCompute; @@ -308,7 +308,7 @@ void QuantizedFullyConnectedForwardExCPU(const nnvm::NodeAttrs& attrs, const std::vector& in_data, const std::vector& req, const std::vector& out_data) { - MKLDNNQuantizedFullyConnectedForward(attrs, ctx, in_data, req, out_data); + DNNLQuantizedFullyConnectedForward(attrs, ctx, in_data, req, out_data); } #endif @@ -361,7 +361,7 @@ and max thresholds representing the threholds for quantizing the float32 output .set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; }) .set_attr("FCompute", QuantizedFullyConnectedForwardCPU) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", QuantizedFullyConnectedForwardExCPU) #endif .set_attr("FResourceRequest", diff --git a/src/operator/quantization/quantized_pooling.cc b/src/operator/quantization/quantized_pooling.cc index 370f80cd3ddc..14ec43296452 100644 --- a/src/operator/quantization/quantized_pooling.cc +++ b/src/operator/quantization/quantized_pooling.cc @@ -23,7 +23,7 @@ #include #include "../nn/pooling-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "../nn/mkldnn/mkldnn_pooling-inl.h" +#include "../nn/dnnl/dnnl_pooling-inl.h" #endif namespace mxnet { @@ -44,12 +44,12 @@ bool QuantizedPoolingShape(const nnvm::NodeAttrs& attrs, #if MXNET_USE_ONEDNN == 1 CHECK(data_ndims == 4U || data_ndims == 5U) - << "MKL-DNN QuantizedPoolingOp only supports 4D/5D layout yet, input should be 4D in" + << "DNNL QuantizedPoolingOp only supports 4D/5D layout yet, input should be 4D in" << "(batch, channel, y, x) or 5D in (batch, channel, d, y, x)"; CHECK(layout == mshadow::kNCHW || layout == mshadow::kNCDHW) - << "MKL-DNN QuantizedPoolingOp only supports NCHW/NCDHW layout for now, saw " << layout; + << "DNNL QuantizedPoolingOp only supports NCHW/NCDHW layout for now, saw " << layout; CHECK(kernel_ndims == 2U || kernel_ndims == 3U) - << "MKL-DNN QuantizedPoolingOp only supports 2D/3D pooling for now, saw" << kernel_ndims; + << "DNNL QuantizedPoolingOp only supports 2D/3D pooling for now, saw" << kernel_ndims; #else CHECK_EQ(data_ndims, 4U) << "quantized_pooling: Input data should be 4D in " << "(batch, channel, y, x)"; @@ -162,7 +162,7 @@ inline static bool QuantizedPoolingStorageType(const nnvm::NodeAttrs& attrs, *dispatch_mode = DispatchMode::kFCompute; #if MXNET_USE_ONEDNN == 1 const PoolingParam& param = nnvm::get(attrs.parsed); - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { + if (dev_mask 
== mshadow::cpu::kDevMask && SupportDNNLPooling(param)) { *dispatch_mode = DispatchMode::kFComputeEx; } #else diff --git a/src/operator/quantization/requantize.cc b/src/operator/quantization/requantize.cc index 3a4402e86a3b..c16c3778acdf 100644 --- a/src/operator/quantization/requantize.cc +++ b/src/operator/quantization/requantize.cc @@ -24,7 +24,7 @@ #include "./requantize-inl.h" #include "./quantize-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "./mkldnn/mkldnn_requantize-inl.h" +#include "./dnnl/dnnl_requantize-inl.h" #endif namespace mxnet { @@ -73,8 +73,8 @@ inference accuracy. // will be reverted after the improvement of CachedOP is done. .set_attr("FGradient", MakeZeroGradNodes) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) - .set_attr("FComputeEx", MKLDNNRequantizeForward) + .set_attr("TIsDNNL", true) + .set_attr("FComputeEx", DNNLRequantizeForward) #else .set_attr("FCompute", RequantizeForward) #endif diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 2c523c0e6f01..e4b84dd0d927 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -27,7 +27,7 @@ #include "./rnn-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "./nn/mkldnn/mkldnn_rnn-inl.h" +#include "./nn/dnnl/dnnl_rnn-inl.h" #endif // MXNET_USE_ONEDNN == 1 namespace mxnet { @@ -199,9 +199,9 @@ inline static bool RNNStorageType(const nnvm::NodeAttrs& attrs, std::vector* in_attrs, std::vector* out_attrs) { const RNNParam& param = nnvm::get(attrs.parsed); - const bool support_mkldnn_rnn = + const bool support_dnnl_rnn = !param.use_sequence_length && dmlc::GetEnv("MXNET_USE_ONEDNN_RNN", 1); - return MKLDNNStorageType(attrs, dev_mask, support_mkldnn_rnn, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, support_dnnl_rnn, dispatch_mode, in_attrs, out_attrs); } #endif // MXNET_USE_ONEDNN == 1 @@ -246,9 +246,9 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs& attrs, } #if MXNET_USE_ONEDNN == 1 - if (ctx.dev_type == kCPU && SupportMKLDNNRnn(param, in_types[rnn_enum::kData])) { + if (ctx.dev_type == kCPU && SupportDNNLRnn(param, in_types[rnn_enum::kData])) { const mxnet::TShape& data_shape = in_shapes[rnn_enum::kData]; - state = OpStatePtr::Create(param, data_shape[0], data_shape[1], data_shape[2]); + state = OpStatePtr::Create(param, data_shape[0], data_shape[1], data_shape[2]); return state; } #endif // MXNET_USE_ONEDNN == 1 @@ -271,8 +271,8 @@ static void RNNStatefulComputeExCPU(const OpStatePtr& state_ptr, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (SupportMKLDNNRnn(inputs[rnn_enum::kData].dtype())) { - MKLDNNRnnOp& op = state_ptr.get_state(); + if (SupportDNNLRnn(inputs[rnn_enum::kData].dtype())) { + DNNLRnnOp& op = state_ptr.get_state(); op.Forward(ctx, inputs, req, outputs); } else { FallBackCompute(RNNStatefulCompute, state_ptr, ctx, inputs, req, outputs); @@ -284,8 +284,8 @@ static void RNNStatefulGradComputeExCPU(const OpStatePtr& state_ptr, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (SupportMKLDNNRnn(inputs[rnn_enum::kData].dtype())) { - MKLDNNRnnOp& op = state_ptr.get_state(); + if (SupportDNNLRnn(inputs[rnn_enum::kData].dtype())) { + DNNLRnnOp& op = state_ptr.get_state(); op.Backward(ctx, inputs, req, outputs); } else { FallBackCompute(RNNStatefulGradCompute, state_ptr, ctx, inputs, req, outputs); @@ -405,7 +405,7 @@ The definition of GRU here is slightly different from paper but compatible with .set_attr("FStatefulCompute", RNNStatefulCompute) #if MXNET_USE_ONEDNN == 1 
.set_attr("FInferStorageType", RNNStorageType) - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FStatefulComputeEx", RNNStatefulComputeExCPU) #endif .set_attr("FGradient", RNNGrad{"_backward_RNN"}) @@ -449,7 +449,7 @@ NNVM_REGISTER_OP(_backward_RNN) .set_attr("FStatefulCompute", RNNStatefulGradCompute) #if MXNET_USE_ONEDNN == 1 .set_attr("FInferStorageType", RNNStorageType) - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FStatefulComputeEx", RNNStatefulGradComputeExCPU) #endif .set_attr("FResourceRequestEx", RNNResourceEx); diff --git a/src/operator/softmax_output.cc b/src/operator/softmax_output.cc index 88d9d5ed342d..401968f2d2a5 100644 --- a/src/operator/softmax_output.cc +++ b/src/operator/softmax_output.cc @@ -24,8 +24,8 @@ */ #include "./softmax_output-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "./nn/mkldnn/mkldnn_ops-inl.h" -#include "./nn/mkldnn/mkldnn_base-inl.h" +#include "./nn/dnnl/dnnl_base-inl.h" +#include "./nn/dnnl/dnnl_ops-inl.h" #endif namespace mxnet { namespace op { @@ -143,7 +143,7 @@ inline static bool SoftmaxOutputStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 2); CHECK_EQ(out_attrs->size(), 1); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } void SoftmaxOutputComputeExCPU(const nnvm::NodeAttrs& attrs, @@ -153,10 +153,10 @@ void SoftmaxOutputComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { CHECK_EQ(inputs.size(), 2U); const SoftmaxOutputParam& param = nnvm::get(attrs.parsed); - if (SupportMKLDNN(inputs[0]) && !ctx.is_train && SupportMKLDNNSoftmaxOutput(param)) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNSoftmaxOutputForward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(SoftmaxOutputCompute, attrs, ctx, inputs, req, outputs); + if (SupportDNNL(inputs[0]) && !ctx.is_train && SupportDNNLSoftmaxOutput(param)) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLSoftmaxOutputForward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(SoftmaxOutputCompute, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(SoftmaxOutputCompute, attrs, ctx, inputs, req, outputs); @@ -245,7 +245,7 @@ NNVM_REGISTER_OP(SoftmaxOutput) .set_attr_parser(ParamParser) #if MXNET_USE_ONEDNN == 1 .set_attr("FInferStorageType", SoftmaxOutputStorageType) - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", SoftmaxOutputComputeExCPU) #endif .set_attr("FListInputNames", diff --git a/src/operator/subgraph/mkldnn/mkldnn_bn_relu_property.h b/src/operator/subgraph/dnnl/dnnl_bn_relu_property.h similarity index 82% rename from src/operator/subgraph/mkldnn/mkldnn_bn_relu_property.h rename to src/operator/subgraph/dnnl/dnnl_bn_relu_property.h index 0593ad9b7350..4d39bbeb6869 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_bn_relu_property.h +++ b/src/operator/subgraph/dnnl/dnnl_bn_relu_property.h @@ -17,26 +17,25 @@ * under the License. 
*/ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_BN_RELU_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_BN_RELU_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_BN_RELU_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_BN_RELU_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include #include -#include "../../nn/mkldnn/mkldnn_act-inl.h" -#include "../../nn/mkldnn/mkldnn_batch_norm-inl.h" +#include "../../nn/dnnl/dnnl_act-inl.h" +#include "../../nn/dnnl/dnnl_batch_norm-inl.h" #include "../common.h" - -#include "mkldnn_subgraph_base-inl.h" +#include "dnnl_subgraph_base-inl.h" namespace mxnet { namespace op { -class SgMKLDNNBNReLUSelector : public SubgraphSelector { +class SgDNNLBNReLUSelector : public SubgraphSelector { public: enum SelectStatus { kStart, kSuccess, kFail }; - explicit SgMKLDNNBNReLUSelector(const bool disable_bn_relu) + explicit SgDNNLBNReLUSelector(const bool disable_bn_relu) : disable_bn_relu_(disable_bn_relu), status_(kStart) {} bool Select(const nnvm::Node& n) override { @@ -79,9 +78,9 @@ class SgMKLDNNBNReLUSelector : public SubgraphSelector { SelectStatus status_; }; -class SgMKLDNNBNReLUProperty : public SubgraphProperty { +class SgDNNLBNReLUProperty : public SubgraphProperty { public: - SgMKLDNNBNReLUProperty() { + SgDNNLBNReLUProperty() { disable_bn_relu_ = dmlc::GetEnv("MXNET_DISABLE_ONEDNN_FUSE_BN_RELU", false); } @@ -91,8 +90,8 @@ class SgMKLDNNBNReLUProperty : public SubgraphProperty { } static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN BN + ReLU optimization pass"; - auto property = std::make_shared(); + static const std::string& name = "oneDNN BN + ReLU optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); if (dmlc::GetEnv("MXNET_DISABLE_ONEDNN_BN_RELU_OPT", 0)) { @@ -106,7 +105,7 @@ class SgMKLDNNBNReLUProperty : public SubgraphProperty { nnvm::ObjectPtr n = nnvm::Node::Create(); std::ostringstream node_name; - node_name << "sg_mkldnn_batch_norm_relu_" << std::to_string(subgraph_id); + node_name << "sg_onednn_batch_norm_relu_" << std::to_string(subgraph_id); // Copy params from BatchNorm node into subgraph BatchNormReLU node BatchNormParam param; @@ -125,7 +124,7 @@ class SgMKLDNNBNReLUProperty : public SubgraphProperty { } SubgraphSelectorPtr CreateSubgraphSelector() const override { - auto selector = std::make_shared(disable_bn_relu_); + auto selector = std::make_shared(disable_bn_relu_); return selector; } @@ -137,4 +136,4 @@ class SgMKLDNNBNReLUProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_BN_RELU_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_BN_RELU_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_common.h b/src/operator/subgraph/dnnl/dnnl_common.h similarity index 72% rename from src/operator/subgraph/mkldnn/mkldnn_common.h rename to src/operator/subgraph/dnnl/dnnl_common.h index d3f4a108013e..7fdc7ec52811 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_common.h +++ b/src/operator/subgraph/dnnl/dnnl_common.h @@ -18,13 +18,13 @@ */ /*! 
- * \file mkldnn_common.h - * \brief Common header file for MKLDNN backend subgraph + * \file dnnl_common.h + * \brief Common header file for DNNL backend subgraph * \author Ciyong Chen */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_COMMON_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_COMMON_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_COMMON_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_COMMON_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -65,7 +65,7 @@ static std::vector GetWeightScales(const NDArray& weight, float scale = GetQuantizeScale(mshadow::kInt8, weight_c_min[c], weight_c_max[c]); if (bias_ptr && bias_ptr[c]) { // avoid overflow on bias - // TODO(zhennan): mkldnn has bug to handle INT_MAX in bias, so set the maximum value of bias + // TODO(zhennan): dnnl has bug to handle INT_MAX in bias, so set the maximum value of bias // to INT_MAX / 2. float scale_max = static_cast(bias_ptr[c] > 0 ? MaxValue() : MinValue()) / 2 / @@ -91,31 +91,31 @@ static std::vector GetWeightScales(const NDArray& weight, return weight_scales; } -static inline void ConvertWeightBias2MKLDNN(NDArray* weight, - NDArray* bias, - bool has_bias, - const mkldnn::memory::desc& weight_md, - const mkldnn::memory::desc* bias_md, - const int num_group, - float data_scale, - const std::vector& weight_scales, - const bool submit = true) { - MKLDNNStream* stream = MKLDNNStream::Get(); +static inline void ConvertWeightBias2DNNL(NDArray* weight, + NDArray* bias, + bool has_bias, + const dnnl::memory::desc& weight_md, + const dnnl::memory::desc* bias_md, + const int num_group, + float data_scale, + const std::vector& weight_scales, + const bool submit = true) { + DNNLStream* stream = DNNLStream::Get(); const auto new_weight = NDArray(weight_md); - const auto conv_weights_memory = new_weight.GetMKLDNNData(); - mkldnn::primitive_attr weight_attr; + const auto conv_weights_memory = new_weight.GetDNNLData(); + dnnl::primitive_attr weight_attr; if (weight_scales.size()) { const int weight_mask = (weight_scales.size()) == 1 ? 0 : 1; weight_attr.set_output_scales(weight_mask, weight_scales); } auto default_weights_memory = GetWeights(*weight, num_group); if (default_weights_memory == nullptr) - default_weights_memory = weight->GetMKLDNNData(); + default_weights_memory = weight->GetDNNLData(); const auto weight_reorder_pd = - mkldnn::reorder::primitive_desc(*default_weights_memory, *conv_weights_memory, weight_attr); - MKLDNNStream::Get()->RegisterPrimArgs( - mkldnn::reorder(weight_reorder_pd), - {{MKLDNN_ARG_FROM, *default_weights_memory}, {MKLDNN_ARG_TO, *conv_weights_memory}}); + dnnl::reorder::primitive_desc(*default_weights_memory, *conv_weights_memory, weight_attr); + DNNLStream::Get()->RegisterPrimArgs( + dnnl::reorder(weight_reorder_pd), + {{DNNL_ARG_FROM, *default_weights_memory}, {DNNL_ARG_TO, *conv_weights_memory}}); NDArray new_bias; if (has_bias && data_scale) { std::vector bias_scales(weight_scales.size()); @@ -123,16 +123,16 @@ static inline void ConvertWeightBias2MKLDNN(NDArray* weight, bias_scales[c] = weight_scales[c] * data_scale; } new_bias = NDArray(*bias_md); - const auto conv_bias_memory = new_bias.GetMKLDNNData(); + const auto conv_bias_memory = new_bias.GetDNNLData(); const int bias_mask = (bias_scales.size()) == 1 ? 
0 : 1; - mkldnn::primitive_attr bias_attr; + dnnl::primitive_attr bias_attr; bias_attr.set_output_scales(bias_mask, bias_scales); - auto bias_weights_memory = bias->GetMKLDNNData(); + auto bias_weights_memory = bias->GetDNNLData(); const auto bias_reorder_pd = - mkldnn::reorder::primitive_desc(*bias_weights_memory, *conv_bias_memory, bias_attr); - MKLDNNStream::Get()->RegisterPrimArgs( - mkldnn::reorder(bias_reorder_pd), - {{MKLDNN_ARG_FROM, *bias_weights_memory}, {MKLDNN_ARG_TO, *conv_bias_memory}}); + dnnl::reorder::primitive_desc(*bias_weights_memory, *conv_bias_memory, bias_attr); + DNNLStream::Get()->RegisterPrimArgs( + dnnl::reorder(bias_reorder_pd), + {{DNNL_ARG_FROM, *bias_weights_memory}, {DNNL_ARG_TO, *conv_bias_memory}}); } if (submit) stream->Submit(); @@ -172,4 +172,4 @@ static inline bool CheckSwapAxisConditions(const nnvm::Node& node) { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_COMMON_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_COMMON_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv-inl.h b/src/operator/subgraph/dnnl/dnnl_conv-inl.h similarity index 55% rename from src/operator/subgraph/mkldnn/mkldnn_conv-inl.h rename to src/operator/subgraph/dnnl/dnnl_conv-inl.h index 631acb5fd559..81b7e550ecb0 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_conv-inl.h +++ b/src/operator/subgraph/dnnl/dnnl_conv-inl.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_CONV_INL_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_CONV_INL_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_CONV_INL_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_CONV_INL_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -28,39 +28,39 @@ #include "../../nn/activation-inl.h" #include "../../nn/batch_norm-inl.h" #include "../../nn/convolution-inl.h" -#include "../../nn/mkldnn/mkldnn_convolution-inl.h" +#include "../../nn/dnnl/dnnl_convolution-inl.h" namespace mxnet { namespace op { -struct MKLDNNConvFusionParam { - MKLDNNConvFullParam full_conv_param; +struct DNNLConvFusionParam { + DNNLConvFullParam full_conv_param; std::shared_ptr bn_param; }; -static inline bool IsOutputUInt8(const MKLDNNConvFusionParam& param) { +static inline bool IsOutputUInt8(const DNNLConvFusionParam& param) { bool result = false; - const auto& mkldnn_param = param.full_conv_param.mkldnn_param; - auto IsOutputUInt8Helper = [](const MKLDNNPostEltwiseParam& param) { - return ((param.alg == mkldnn::algorithm::eltwise_relu && param.alpha == 0.f) || - param.alg == mkldnn::algorithm::eltwise_logistic || - param.alg == mkldnn::algorithm::eltwise_soft_relu || - param.alg == mkldnn::algorithm::eltwise_bounded_relu); + const auto& dnnl_param = param.full_conv_param.dnnl_param; + auto IsOutputUInt8Helper = [](const DNNLPostEltwiseParam& param) { + return ((param.alg == dnnl::algorithm::eltwise_relu && param.alpha == 0.f) || + param.alg == dnnl::algorithm::eltwise_logistic || + param.alg == dnnl::algorithm::eltwise_soft_relu || + param.alg == dnnl::algorithm::eltwise_bounded_relu); }; - if ((!mkldnn_param.with_sum) && mkldnn_param.with_act) { - CHECK(param.full_conv_param.act_param.alg != mkldnn::algorithm::undef); + if ((!dnnl_param.with_sum) && dnnl_param.with_act) { + CHECK(param.full_conv_param.act_param.alg != dnnl::algorithm::undef); result = IsOutputUInt8Helper(param.full_conv_param.act_param); - } else if (mkldnn_param.with_postsum_act) { - CHECK(param.full_conv_param.postsum_act_param.alg != mkldnn::algorithm::undef); + } else if 
(dnnl_param.with_postsum_act) { + CHECK(param.full_conv_param.postsum_act_param.alg != dnnl::algorithm::undef); result = IsOutputUInt8Helper(param.full_conv_param.postsum_act_param); } return result; } -enum MKLDNNConvOpOutputs { kOut, kMin, kMax }; +enum DNNLConvOpOutputs { kOut, kMin, kMax }; } // namespace op } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_CONV_INL_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_CONV_INL_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc b/src/operator/subgraph/dnnl/dnnl_conv.cc similarity index 60% rename from src/operator/subgraph/mkldnn/mkldnn_conv.cc rename to src/operator/subgraph/dnnl/dnnl_conv.cc index b4de50b697f2..8702430d45b0 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc +++ b/src/operator/subgraph/dnnl/dnnl_conv.cc @@ -23,15 +23,14 @@ #include #include -#include "../../nn/mkldnn/mkldnn_act-inl.h" -#include "../../nn/mkldnn/mkldnn_base-inl.h" -#include "../../nn/mkldnn/mkldnn_ops-inl.h" +#include "../../nn/dnnl/dnnl_act-inl.h" +#include "../../nn/dnnl/dnnl_base-inl.h" +#include "../../nn/dnnl/dnnl_ops-inl.h" #include "../../quantization/quantization_utils.h" #include "../../tensor/matrix_op-inl.h" #include "../common.h" - -#include "mkldnn_common.h" -#include "mkldnn_conv-inl.h" +#include "dnnl_common.h" +#include "dnnl_conv-inl.h" namespace mxnet { namespace op { @@ -83,19 +82,18 @@ static void UpdateConvWeightBias(NDArray* weight, *bias = update_bias; } -static inline size_t GetInSumIndex(const MKLDNNConvFusionParam& param) { - if (param.full_conv_param.mkldnn_param.dedup_sum) { +static inline size_t GetInSumIndex(const DNNLConvFusionParam& param) { + if (param.full_conv_param.dnnl_param.dedup_sum) { return 0; } return 2 + (param.full_conv_param.conv_param.no_bias ? 0 : 1) + - (param.full_conv_param.mkldnn_param.with_bn ? 4 : 0); + (param.full_conv_param.dnnl_param.with_bn ? 4 : 0); } -class SgMKLDNNConvOperator { +class SgDNNLConvOperator { public: - explicit SgMKLDNNConvOperator(const nnvm::NodeAttrs& attrs) - : subgraph_sym_(*attrs.subgraphs[0]), - param_(nnvm::get(attrs.parsed)) {} + explicit SgDNNLConvOperator(const nnvm::NodeAttrs& attrs) + : subgraph_sym_(*attrs.subgraphs[0]), param_(nnvm::get(attrs.parsed)) {} void Forward(const OpContext& ctx, const std::vector& inputs, @@ -107,9 +105,9 @@ class SgMKLDNNConvOperator { bool inplace_{false}; bool post_requantize_{false}; nnvm::Symbol subgraph_sym_; - MKLDNNConvFusionParam param_; - std::shared_ptr fwd_; - mkldnn_args_map_t args_; + DNNLConvFusionParam param_; + std::shared_ptr fwd_; + dnnl_args_map_t args_; NDArray cached_weight_; NDArray cached_bias_; float cached_data_min_; @@ -124,22 +122,22 @@ class SgMKLDNNConvOperator { std::vector weight_scales_; }; -void SgMKLDNNConvOperator::Forward(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SgDNNLConvOperator::Forward(const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { auto& full_conv_param = param_.full_conv_param; - auto& mkldnn_param = param_.full_conv_param.mkldnn_param; + auto& dnnl_param = param_.full_conv_param.dnnl_param; auto& conv_param = param_.full_conv_param.conv_param; auto bn_param = param_.bn_param.get(); size_t input_size = - 2 + (conv_param.no_bias ? 0 : 1) + (mkldnn_param.with_bn ? 4 : 0) + - (mkldnn_param.with_sum ? 1 : 0) + - (mkldnn_param.quantized ? 2 + (full_conv_param.mkldnn_param.with_sum ? 
2 : 0) : 0); + 2 + (conv_param.no_bias ? 0 : 1) + (dnnl_param.with_bn ? 4 : 0) + + (dnnl_param.with_sum ? 1 : 0) + + (dnnl_param.quantized ? 2 + (full_conv_param.dnnl_param.with_sum ? 2 : 0) : 0); // When dedup is on, in_data is used to calculate sum instead of in_sum - if (mkldnn_param.dedup_sum) { + if (dnnl_param.dedup_sum) { input_size -= 1; - if (mkldnn_param.quantized) { + if (dnnl_param.quantized) { input_size -= 2; } } @@ -149,17 +147,17 @@ void SgMKLDNNConvOperator::Forward(const OpContext& ctx, auto in_data = idx++; auto in_weight = idx++; auto in_bias = conv_param.no_bias ? 0 : (idx++); - auto in_gamma = mkldnn_param.with_bn ? (idx++) : 0; - auto in_beta = mkldnn_param.with_bn ? (idx++) : 0; - auto in_mean = mkldnn_param.with_bn ? (idx++) : 0; - auto in_var = mkldnn_param.with_bn ? (idx++) : 0; - auto in_sum = mkldnn_param.with_sum ? (mkldnn_param.dedup_sum ? in_data : idx++) : -1; - float data_min = mkldnn_param.quantized ? inputs[idx++].data().dptr()[0] : 0.0; - float data_max = mkldnn_param.quantized ? inputs[idx++].data().dptr()[0] : 0.0; + auto in_gamma = dnnl_param.with_bn ? (idx++) : 0; + auto in_beta = dnnl_param.with_bn ? (idx++) : 0; + auto in_mean = dnnl_param.with_bn ? (idx++) : 0; + auto in_var = dnnl_param.with_bn ? (idx++) : 0; + auto in_sum = dnnl_param.with_sum ? (dnnl_param.dedup_sum ? in_data : idx++) : -1; + float data_min = dnnl_param.quantized ? inputs[idx++].data().dptr()[0] : 0.0; + float data_max = dnnl_param.quantized ? inputs[idx++].data().dptr()[0] : 0.0; float sum_min = 0.0f; float sum_max = 0.0f; - if (mkldnn_param.with_sum && mkldnn_param.quantized) { - if (mkldnn_param.dedup_sum) { + if (dnnl_param.with_sum && dnnl_param.quantized) { + if (dnnl_param.dedup_sum) { sum_min = data_min; sum_max = data_max; } else { @@ -168,42 +166,42 @@ void SgMKLDNNConvOperator::Forward(const OpContext& ctx, } } CHECK_EQ(input_size, idx); - bool has_bias = mkldnn_param.with_bn || !conv_param.no_bias; + bool has_bias = dnnl_param.with_bn || !conv_param.no_bias; NDArray data = inputs[in_data]; - NDArray output = mkldnn_param.with_sum ? inputs[in_sum] : outputs[kOut]; + NDArray output = dnnl_param.with_sum ? inputs[in_sum] : outputs[kOut]; // Copy inputs[in_sum] into outputs[kOut] in case inplace optimization failed. - if (mkldnn_param.with_sum) { + if (dnnl_param.with_sum) { if (!initialized_) { - // TODO(zhennan): Currently, mkldnn fallback mechanism will break inplace option, + // TODO(zhennan): Currently, dnnl fallback mechanism will break inplace option, // which make check (req[kOut] == kWriteInplace) useless. 
- auto in_mkl_mem = inputs[in_sum].GetMKLDNNData(); - auto out_mkl_mem = outputs[kOut].GetMKLDNNData(); - if (in_mkl_mem->get_data_handle() == out_mkl_mem->get_data_handle()) { + auto in_dnnl_mem = inputs[in_sum].GetDNNLData(); + auto out_dnnl_mem = outputs[kOut].GetDNNLData(); + if (in_dnnl_mem->get_data_handle() == out_dnnl_mem->get_data_handle()) { inplace_ = true; } } if (!inplace_) { - auto in_mkl_mem = inputs[in_sum].GetMKLDNNData(); - auto out_mkl_mem = outputs[kOut].GetMKLDNNData(); + auto in_dnnl_mem = inputs[in_sum].GetDNNLData(); + auto out_dnnl_mem = outputs[kOut].GetDNNLData(); if (outputs[kOut].dtype() == mshadow::kInt32) { - const auto& mem_desc = in_mkl_mem->get_desc(); - const auto this_dtype = get_mkldnn_type(mshadow::kInt32); + const auto& mem_desc = in_dnnl_mem->get_desc(); + const auto this_dtype = get_dnnl_type(mshadow::kInt32); auto omd = mem_desc; - omd.data.data_type = static_cast(this_dtype); - mkldnn_mem_ptr tmp_mem(new mkldnn::memory( - omd, CpuEngine::Get()->get_engine(), out_mkl_mem->get_data_handle())); - MKLDNNStream::Get()->RegisterMem(tmp_mem); - MKLDNNStream::Get()->RegisterPrimArgs( - mkldnn::reorder(*in_mkl_mem, *tmp_mem), - {{MKLDNN_ARG_FROM, *in_mkl_mem}, {MKLDNN_ARG_TO, *tmp_mem}}); + omd.data.data_type = static_cast(this_dtype); + dnnl_mem_ptr tmp_mem( + new dnnl::memory(omd, CpuEngine::Get()->get_engine(), out_dnnl_mem->get_data_handle())); + DNNLStream::Get()->RegisterMem(tmp_mem); + DNNLStream::Get()->RegisterPrimArgs( + dnnl::reorder(*in_dnnl_mem, *tmp_mem), + {{DNNL_ARG_FROM, *in_dnnl_mem}, {DNNL_ARG_TO, *tmp_mem}}); output = NDArray(tmp_mem); } else { - mkldnn_mem_ptr tmp_mem(new mkldnn::memory(in_mkl_mem->get_desc(), - CpuEngine::Get()->get_engine(), - out_mkl_mem->get_data_handle())); - MKLDNNStream::Get()->RegisterMem(tmp_mem); - MKLDNNMemoryCopy(*in_mkl_mem, tmp_mem.get()); + dnnl_mem_ptr tmp_mem(new dnnl::memory(in_dnnl_mem->get_desc(), + CpuEngine::Get()->get_engine(), + out_dnnl_mem->get_data_handle())); + DNNLStream::Get()->RegisterMem(tmp_mem); + DNNLMemoryCopy(*in_dnnl_mem, tmp_mem.get()); output = NDArray(tmp_mem); } } @@ -212,13 +210,13 @@ void SgMKLDNNConvOperator::Forward(const OpContext& ctx, // Check input change // TODO(zhennan): Only update cached_* changed. if (initialized_) { - if (mkldnn_param.with_bn) { + if (dnnl_param.with_bn) { if (weight_ver_ != inputs[in_weight].version() || ((!conv_param.no_bias) && bias_ver_ != inputs[in_bias].version())) { initialized_ = false; } } - if (initialized_ && mkldnn_param.quantized) { + if (initialized_ && dnnl_param.quantized) { if (cached_data_min_ != data_min || cached_data_max_ != data_max || cached_sum_min_ != sum_min || cached_sum_max_ != sum_max || weight_ver_ != inputs[in_weight].version() || @@ -242,8 +240,8 @@ void SgMKLDNNConvOperator::Forward(const OpContext& ctx, } // Update weight and bias after bn fusion. - if (mkldnn_param.with_bn) { - MKLDNN_REAL_TYPE_SWITCH(inputs[in_weight].dtype(), DType, { + if (dnnl_param.with_bn) { + DNNL_REAL_TYPE_SWITCH(inputs[in_weight].dtype(), DType, { UpdateConvWeightBias(&cached_weight_, &cached_bias_, conv_param.no_bias, @@ -255,21 +253,21 @@ void SgMKLDNNConvOperator::Forward(const OpContext& ctx, }); } // Quantize weight and bias. 
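The branch below is where the fused convolution quantizes its cached weights; the conversion itself happens in ConvertWeightBias2DNNL (earlier in this patch), which registers a dnnl::reorder whose primitive_attr carries the output scales. For readers new to the renamed API, a minimal, self-contained sketch of that reorder pattern, assuming the oneDNN 2.x interface (primitive_attr::set_output_scales) targeted by this code and hypothetical shapes and scale values:

    // Illustrative sketch only, not part of the patch: quantize f32 weights to s8
    // with a dnnl::reorder whose primitive_attr holds the output scale
    // (assumes oneDNN 2.x, where primitive_attr::set_output_scales is available).
    #include <cstdint>
    #include <vector>
    #include "dnnl.hpp"

    int main() {
      dnnl::engine eng(dnnl::engine::kind::cpu, 0);
      dnnl::stream strm(eng);

      const dnnl::memory::dims wdims = {8, 4};   // hypothetical (out_channels, in_channels)
      std::vector<float> w_f32(8 * 4, 0.5f);     // hypothetical weight values
      std::vector<int8_t> w_s8(8 * 4);

      dnnl::memory::desc src_md(wdims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::oi);
      dnnl::memory::desc dst_md(wdims, dnnl::memory::data_type::s8, dnnl::memory::format_tag::oi);
      dnnl::memory src_mem(src_md, eng, w_f32.data());
      dnnl::memory dst_mem(dst_md, eng, w_s8.data());

      dnnl::primitive_attr attr;
      attr.set_output_scales(0, {127.f});        // mask 0: a single scale for the whole tensor

      auto pd = dnnl::reorder::primitive_desc(src_mem, dst_mem, attr);
      dnnl::reorder(pd).execute(strm, src_mem, dst_mem);  // dst = saturate(round(src * scale))
      strm.wait();
      return 0;
    }

ConvertWeightBias2DNNL follows the same pattern but registers the reorder on the DNNLStream so it runs when the stream is submitted, and switches to a non-zero mask (per-output-channel scales) whenever weight_scales has more than one entry.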
- if (mkldnn_param.quantized) { + if (dnnl_param.quantized) { CHECK(data.dtype() == mshadow::kInt8 || data.dtype() == mshadow::kUint8); if (cached_data_min_ < 0.0f) { CHECK_EQ(data.dtype(), mshadow::kInt8) << "Expect int8 when data_min < 0.0, consider quantize model with int8."; } auto weight_channelwise_scale = false; - if (mkldnn_param.min_calib_range.has_value() && mkldnn_param.max_calib_range.has_value()) { - cached_output_min_ = mkldnn_param.min_calib_range.value(); - cached_output_max_ = mkldnn_param.max_calib_range.value(); + if (dnnl_param.min_calib_range.has_value() && dnnl_param.max_calib_range.has_value()) { + cached_output_min_ = dnnl_param.min_calib_range.value(); + cached_output_max_ = dnnl_param.max_calib_range.value(); post_requantize_ = true; weight_channelwise_scale = true; } data_scale_ = GetQuantizeScale(data.dtype(), cached_data_min_, cached_data_max_); - MKLDNN_REAL_TYPE_SWITCH(cached_weight_.dtype(), DType, { + DNNL_REAL_TYPE_SWITCH(cached_weight_.dtype(), DType, { weight_scales_ = GetWeightScales(cached_weight_, has_bias ? &cached_bias_ : nullptr, data_scale_, @@ -279,7 +277,7 @@ void SgMKLDNNConvOperator::Forward(const OpContext& ctx, size_t channel = cached_weight_.shape()[0]; float sum_in_scale = 1.0; float output_scale; - if (mkldnn_param.with_sum) { + if (dnnl_param.with_sum) { sum_in_scale = GetQuantizeScale(inputs[in_sum].dtype(), cached_sum_min_, cached_sum_max_); } if (post_requantize_) { @@ -317,71 +315,71 @@ void SgMKLDNNConvOperator::Forward(const OpContext& ctx, output_scale = data_scale_ * weight_scales_[0]; full_conv_param.requantize_scales.resize(0); } - if (mkldnn_param.with_sum) { + if (dnnl_param.with_sum) { full_conv_param.sum_scale = output_scale / sum_in_scale; } - if (mkldnn_param.with_act && - full_conv_param.act_param.alg == mkldnn::algorithm::eltwise_bounded_relu) { - if (mkldnn_param.with_sum) { - LOG(ERROR) << "mkldnn doesn't support conv + relu + sum fusion yet."; + if (dnnl_param.with_act && + full_conv_param.act_param.alg == dnnl::algorithm::eltwise_bounded_relu) { + if (dnnl_param.with_sum) { + LOG(ERROR) << "dnnl doesn't support conv + relu + sum fusion yet."; full_conv_param.act_param.alpha *= output_scale; } else { // For conv+relu6 without sum, we don't need post_ops as output_scale can do the cut off. - mkldnn_param.with_act = false; + dnnl_param.with_act = false; } } - if (mkldnn_param.with_postsum_act) { - CHECK(full_conv_param.postsum_act_param.alg == mkldnn::algorithm::eltwise_relu); + if (dnnl_param.with_postsum_act) { + CHECK(full_conv_param.postsum_act_param.alg == dnnl::algorithm::eltwise_relu); } } - fwd_.reset(new MKLDNNConvForward(full_conv_param, - ctx.is_train, - data, - cached_weight_, - has_bias ? &cached_bias_ : nullptr, - output)); - mkldnn::memory::desc bias_md; + fwd_.reset(new DNNLConvForward(full_conv_param, + ctx.is_train, + data, + cached_weight_, + has_bias ? &cached_bias_ : nullptr, + output)); + dnnl::memory::desc bias_md; if (has_bias) bias_md = fwd_->GetPd().bias_desc(); - ConvertWeightBias2MKLDNN(&cached_weight_, - &cached_bias_, - has_bias, - fwd_->GetPd().weights_desc(), - has_bias ? &bias_md : nullptr, - full_conv_param.conv_param.num_group, - data_scale_, - weight_scales_); - args_[MKLDNN_ARG_SRC] = *data.GetMKLDNNData(); - args_[MKLDNN_ARG_WEIGHTS] = *cached_weight_.GetMKLDNNData(); + ConvertWeightBias2DNNL(&cached_weight_, + &cached_bias_, + has_bias, + fwd_->GetPd().weights_desc(), + has_bias ? 
&bias_md : nullptr, + full_conv_param.conv_param.num_group, + data_scale_, + weight_scales_); + args_[DNNL_ARG_SRC] = *data.GetDNNLData(); + args_[DNNL_ARG_WEIGHTS] = *cached_weight_.GetDNNLData(); if (has_bias) - args_[MKLDNN_ARG_BIAS] = *cached_bias_.GetMKLDNNData(); - args_[MKLDNN_ARG_DST] = *output.GetMKLDNNData(); - initialized_ = true; + args_[DNNL_ARG_BIAS] = *cached_bias_.GetDNNLData(); + args_[DNNL_ARG_DST] = *output.GetDNNLData(); + initialized_ = true; } - if (mkldnn_param.with_sum) { - const auto& output_mem = output.GetMKLDNNData(); + if (dnnl_param.with_sum) { + const auto& output_mem = output.GetDNNLData(); const auto& out_mem_desc = output_mem->get_desc(); const auto& dst_mem_desc = fwd_->GetPd().dst_desc(); if (out_mem_desc != dst_mem_desc) { - auto tmp_out_mem = output.GetMKLDNNDataReorder(fwd_->GetPd().dst_desc()); + auto tmp_out_mem = output.GetDNNLDataReorder(fwd_->GetPd().dst_desc()); auto data_md = dst_mem_desc; - data_md.data.data_type = static_cast(out_mem_desc.data.data_type); - mkldnn_mem_ptr new_out_mem(new mkldnn::memory( - data_md, CpuEngine::Get()->get_engine(), output_mem->get_data_handle())); - MKLDNNStream::Get()->RegisterMem(new_out_mem); - MKLDNNMemoryCopy(*tmp_out_mem, new_out_mem.get()); + data_md.data.data_type = static_cast(out_mem_desc.data.data_type); + dnnl_mem_ptr new_out_mem( + new dnnl::memory(data_md, CpuEngine::Get()->get_engine(), output_mem->get_data_handle())); + DNNLStream::Get()->RegisterMem(new_out_mem); + DNNLMemoryCopy(*tmp_out_mem, new_out_mem.get()); output = NDArray(new_out_mem); } } - if (mkldnn_param.quantized) { - auto data_mem = data.GetMKLDNNDataReorder(fwd_->GetPd().src_desc()); - mkldnn::memory* mem = output.CreateMKLDNNData(fwd_->GetPd().dst_desc()); - args_[MKLDNN_ARG_SRC] = *data_mem; - args_[MKLDNN_ARG_DST] = *mem; - MKLDNNStream::Get()->RegisterPrimArgs(fwd_->GetFwd(), args_); - MKLDNNStream::Get()->Submit(); + if (dnnl_param.quantized) { + auto data_mem = data.GetDNNLDataReorder(fwd_->GetPd().src_desc()); + dnnl::memory* mem = output.CreateDNNLData(fwd_->GetPd().dst_desc()); + args_[DNNL_ARG_SRC] = *data_mem; + args_[DNNL_ARG_DST] = *mem; + DNNLStream::Get()->RegisterPrimArgs(fwd_->GetFwd(), args_); + DNNLStream::Get()->Submit(); } else { std::vector new_inputs; if (has_bias) { @@ -389,44 +387,42 @@ void SgMKLDNNConvOperator::Forward(const OpContext& ctx, } else { new_inputs = {data, cached_weight_}; } - MKLDNNConvolutionForwardFullFeature( - full_conv_param, ctx, fwd_.get(), new_inputs, req, {output}); + DNNLConvolutionForwardFullFeature(full_conv_param, ctx, fwd_.get(), new_inputs, req, {output}); } - if (mkldnn_param.quantized) { + if (dnnl_param.quantized) { *outputs[kMin].data().dptr() = cached_output_min_; *outputs[kMax].data().dptr() = cached_output_max_; } - if (mkldnn_param.with_sum) { + if (dnnl_param.with_sum) { auto out = const_cast(outputs[kOut]); - out.UpdateMKLDNNMemDesc(fwd_->GetPd().dst_desc()); + out.UpdateDNNLMemDesc(fwd_->GetPd().dst_desc()); } } -static void SgMKLDNNConvOpForward(const OpStatePtr& state_ptr, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - SgMKLDNNConvOperator& op = state_ptr.get_state(); +static void SgDNNLConvOpForward(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SgDNNLConvOperator& op = state_ptr.get_state(); op.Forward(ctx, inputs, req, outputs); } -static uint32_t SgMKLDNNConvNumInputs(const NodeAttrs& attrs) { - auto 
const& param = nnvm::get(attrs.parsed); +static uint32_t SgDNNLConvNumInputs(const NodeAttrs& attrs) { + auto const& param = nnvm::get(attrs.parsed); auto num_input = DefaultSubgraphOpNumInputs(attrs); - if (param.full_conv_param.mkldnn_param.quantized) + if (param.full_conv_param.dnnl_param.quantized) return num_input + 2 + - (param.full_conv_param.mkldnn_param.with_sum && - !param.full_conv_param.mkldnn_param.dedup_sum + (param.full_conv_param.dnnl_param.with_sum && !param.full_conv_param.dnnl_param.dedup_sum ? 2 : 0); else return num_input; } -static void SgMKLDNNConvParamParser(nnvm::NodeAttrs* attrs) { - MKLDNNConvFusionParam param_; +static void SgDNNLConvParamParser(nnvm::NodeAttrs* attrs) { + DNNLConvFusionParam param_; // For back-compatible, rename // with_relu -> with_act @@ -445,7 +441,7 @@ static void SgMKLDNNConvParamParser(nnvm::NodeAttrs* attrs) { } try { - param_.full_conv_param.mkldnn_param.Init(attrs->dict); + param_.full_conv_param.dnnl_param.Init(attrs->dict); } catch (const dmlc::ParamError& e) { std::ostringstream os; os << e.what(); @@ -465,27 +461,27 @@ static void SgMKLDNNConvParamParser(nnvm::NodeAttrs* attrs) { return; auto& node_name = node->op()->name; if (node_name == "BatchNorm") { - CHECK_EQ(param_.full_conv_param.mkldnn_param.with_bn, true); + CHECK_EQ(param_.full_conv_param.dnnl_param.with_bn, true); CHECK(param_.bn_param.get() == nullptr); param_.bn_param = std::make_shared(nnvm::get(node->attrs.parsed)); } else if (node_name == "Convolution") { param_.full_conv_param.conv_param = nnvm::get(node->attrs.parsed); } else if (node_name == "Activation" || node_name == "LeakyReLU" || node_name == "clip") { - auto& post_act_param = (param_.full_conv_param.mkldnn_param.with_act && !with_act) + auto& post_act_param = (param_.full_conv_param.dnnl_param.with_act && !with_act) ? 
param_.full_conv_param.act_param : param_.full_conv_param.postsum_act_param; - with_act = true; + with_act = true; if (node_name == "Activation") { const auto act_param = nnvm::get(node->attrs.parsed); - post_act_param.alg = GetMKLDNNActAlgo(act_param); + post_act_param.alg = GetDNNLActAlgo(act_param); } else if (node_name == "LeakyReLU") { const auto act_param = nnvm::get(node->attrs.parsed); post_act_param.alpha = act_param.slope; - post_act_param.alg = GetMKLDNNActAlgo(act_param); + post_act_param.alg = GetDNNLActAlgo(act_param); } else { const auto clip_param = nnvm::get(node->attrs.parsed); - post_act_param.alg = mkldnn::algorithm::eltwise_bounded_relu; + post_act_param.alg = dnnl::algorithm::eltwise_bounded_relu; post_act_param.alpha = clip_param.a_max; } } @@ -493,53 +489,53 @@ static void SgMKLDNNConvParamParser(nnvm::NodeAttrs* attrs) { attrs->parsed = std::move(param_); } -static std::vector SgMKLDNNConvListInputNames(const NodeAttrs& attrs) { - auto const& param = nnvm::get(attrs.parsed); +static std::vector SgDNNLConvListInputNames(const NodeAttrs& attrs) { + auto const& param = nnvm::get(attrs.parsed); std::vector input_names; input_names.emplace_back("data"); input_names.emplace_back("weight"); if (!param.full_conv_param.conv_param.no_bias) { input_names.emplace_back("bias"); } - if (param.full_conv_param.mkldnn_param.with_bn) { + if (param.full_conv_param.dnnl_param.with_bn) { input_names.emplace_back("gamma"); input_names.emplace_back("beta"); input_names.emplace_back("mean"); input_names.emplace_back("var"); } - auto& mkldnn_param = param.full_conv_param.mkldnn_param; - if (mkldnn_param.with_sum && !mkldnn_param.dedup_sum) { + auto& dnnl_param = param.full_conv_param.dnnl_param; + if (dnnl_param.with_sum && !dnnl_param.dedup_sum) { input_names.emplace_back("sum"); } - if (param.full_conv_param.mkldnn_param.quantized) { + if (param.full_conv_param.dnnl_param.quantized) { input_names.emplace_back("data_min"); input_names.emplace_back("data_max"); - if (mkldnn_param.with_sum && !mkldnn_param.dedup_sum) { + if (dnnl_param.with_sum && !dnnl_param.dedup_sum) { input_names.emplace_back("sum_min"); input_names.emplace_back("sum_max"); } } - CHECK_EQ(input_names.size(), SgMKLDNNConvNumInputs(attrs)); + CHECK_EQ(input_names.size(), SgDNNLConvNumInputs(attrs)); return input_names; } -static std::vector SgMKLDNNConvListOutputNames(const NodeAttrs& attrs) { - auto const& param = nnvm::get(attrs.parsed); - if (param.full_conv_param.mkldnn_param.quantized) +static std::vector SgDNNLConvListOutputNames(const NodeAttrs& attrs) { + auto const& param = nnvm::get(attrs.parsed); + if (param.full_conv_param.dnnl_param.quantized) return std::vector{"output", "output_min", "output_max"}; else return std::vector{"output"}; } -static OpStatePtr CreateSgMKLDNNConvState(const nnvm::NodeAttrs& attrs, - Context ctx, - const mxnet::ShapeVector& in_shapes, - const std::vector& in_types) { - return OpStatePtr::Create(attrs); +static OpStatePtr CreateSgDNNLConvState(const nnvm::NodeAttrs& attrs, + Context ctx, + const mxnet::ShapeVector& in_shapes, + const std::vector& in_types) { + return OpStatePtr::Create(attrs); } template -static void FilterMinMaxIndice(const MKLDNNConvParam& mkldnn_param, +static void FilterMinMaxIndice(const DNNLConvParam& dnnl_param, std::vector* in_shapes, std::vector* out_shapes, std::vector* base_in_shapes, @@ -547,7 +543,7 @@ static void FilterMinMaxIndice(const MKLDNNConvParam& mkldnn_param, std::unordered_set* minmax_indice) { base_out_shapes->push_back(out_shapes->at(0)); 
size_t last = in_shapes->size() - 1; - if (mkldnn_param.with_sum && !mkldnn_param.dedup_sum) { + if (dnnl_param.with_sum && !dnnl_param.dedup_sum) { minmax_indice->insert(last); minmax_indice->insert(last - 1); minmax_indice->insert(last - 2); @@ -560,16 +556,16 @@ static void FilterMinMaxIndice(const MKLDNNConvParam& mkldnn_param, } } -static bool SgMKLDNNConvInferShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector* in_shapes, - mxnet::ShapeVector* out_shapes) { - auto const& param = nnvm::get(attrs.parsed); - if (param.full_conv_param.mkldnn_param.quantized) { +static bool SgDNNLConvInferShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_shapes, + mxnet::ShapeVector* out_shapes) { + auto const& param = nnvm::get(attrs.parsed); + if (param.full_conv_param.dnnl_param.quantized) { std::unordered_set minmax_indice; mxnet::ShapeVector base_in_shapes; mxnet::ShapeVector base_out_shapes; - FilterMinMaxIndice(param.full_conv_param.mkldnn_param, + FilterMinMaxIndice(param.full_conv_param.dnnl_param, in_shapes, out_shapes, &base_in_shapes, @@ -593,15 +589,15 @@ static bool SgMKLDNNConvInferShape(const nnvm::NodeAttrs& attrs, } } -static bool SgMKLDNNConvInferType(const nnvm::NodeAttrs& attrs, - std::vector* in_types, - std::vector* out_types) { - auto const& param = nnvm::get(attrs.parsed); - if (param.full_conv_param.mkldnn_param.quantized) { +static bool SgDNNLConvInferType(const nnvm::NodeAttrs& attrs, + std::vector* in_types, + std::vector* out_types) { + auto const& param = nnvm::get(attrs.parsed); + if (param.full_conv_param.dnnl_param.quantized) { std::unordered_set minmax_indice; std::vector base_in_types; std::vector base_out_types; - FilterMinMaxIndice(param.full_conv_param.mkldnn_param, + FilterMinMaxIndice(param.full_conv_param.dnnl_param, in_types, out_types, &base_in_types, @@ -609,18 +605,18 @@ static bool SgMKLDNNConvInferType(const nnvm::NodeAttrs& attrs, &minmax_indice); // Override data type to fp32 for default infer type as bn doesn't support // uint8. 
- int orig_data = base_in_types[0]; - base_in_types[0] = mshadow::kFloat32; - int orig_sum = base_in_types[0]; - auto& mkldnn_param = param.full_conv_param.mkldnn_param; - if (param.full_conv_param.mkldnn_param.with_sum && !mkldnn_param.dedup_sum) { + int orig_data = base_in_types[0]; + base_in_types[0] = mshadow::kFloat32; + int orig_sum = base_in_types[0]; + auto& dnnl_param = param.full_conv_param.dnnl_param; + if (param.full_conv_param.dnnl_param.with_sum && !dnnl_param.dedup_sum) { auto sum_index = GetInSumIndex(param); orig_sum = base_in_types[sum_index]; base_in_types[sum_index] = mshadow::kFloat32; } bool result = DefaultSubgraphOpType(attrs, &base_in_types, &base_out_types); base_in_types[0] = orig_data; - if (param.full_conv_param.mkldnn_param.with_sum && !mkldnn_param.dedup_sum) { + if (param.full_conv_param.dnnl_param.with_sum && !dnnl_param.dedup_sum) { auto sum_index = GetInSumIndex(param); base_in_types[sum_index] = orig_sum; } @@ -632,8 +628,8 @@ static bool SgMKLDNNConvInferType(const nnvm::NodeAttrs& attrs, in_types->at(i) = base_in_types[base_idx++]; } } - if (param.full_conv_param.mkldnn_param.min_calib_range.has_value() && - param.full_conv_param.mkldnn_param.max_calib_range.has_value()) { + if (param.full_conv_param.dnnl_param.min_calib_range.has_value() && + param.full_conv_param.dnnl_param.max_calib_range.has_value()) { if (IsOutputUInt8(param)) { TYPE_ASSIGN_CHECK(*out_types, 0, mshadow::kUint8); } else { @@ -651,17 +647,17 @@ static bool SgMKLDNNConvInferType(const nnvm::NodeAttrs& attrs, } } -static bool SgMKLDNNConvOpStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_stypes, - std::vector* out_stypes) { - auto const& param = nnvm::get(attrs.parsed); - if (param.full_conv_param.mkldnn_param.quantized) { +static bool SgDNNLConvOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_stypes, + std::vector* out_stypes) { + auto const& param = nnvm::get(attrs.parsed); + if (param.full_conv_param.dnnl_param.quantized) { std::unordered_set minmax_indice; std::vector base_in_stypes; std::vector base_out_stypes; - FilterMinMaxIndice(param.full_conv_param.mkldnn_param, + FilterMinMaxIndice(param.full_conv_param.dnnl_param, in_stypes, out_stypes, &base_in_stypes, @@ -686,23 +682,22 @@ static bool SgMKLDNNConvOpStorageType(const nnvm::NodeAttrs& attrs, } } -std::vector> SgMKLDNNConvInplaceOption(const NodeAttrs& attrs) { - auto const& param = nnvm::get(attrs.parsed); - if (param.full_conv_param.mkldnn_param.with_sum && - !param.full_conv_param.mkldnn_param.dedup_sum) { +std::vector> SgDNNLConvInplaceOption(const NodeAttrs& attrs) { + auto const& param = nnvm::get(attrs.parsed); + if (param.full_conv_param.dnnl_param.with_sum && !param.full_conv_param.dnnl_param.dedup_sum) { return std::vector>{{GetInSumIndex(param), 0}}; } else { return std::vector>(); } } -nnvm::ObjectPtr SgMKLDNNConvQuantizedOp(const NodeAttrs& attrs) { - auto const& param = nnvm::get(attrs.parsed); +nnvm::ObjectPtr SgDNNLConvQuantizedOp(const NodeAttrs& attrs) { + auto const& param = nnvm::get(attrs.parsed); nnvm::ObjectPtr node = nnvm::Node::Create(); - node->attrs.op = Op::Get("_sg_mkldnn_conv"); + node->attrs.op = Op::Get("_sg_onednn_conv"); const int k_ndims = param.full_conv_param.conv_param.kernel.ndim(); CHECK(k_ndims == 2U || k_ndims == 3U) - << "Quantized Convolution of MKL-DNN supports 2D/3D kernel currently." 
+ << "Quantized Convolution of oneDNN supports 2D/3D kernel currently." << "Please exclude this layer from the quantized model."; node->attrs.name = "quantized_" + attrs.name; node->attrs.dict = attrs.dict; @@ -715,10 +710,10 @@ nnvm::ObjectPtr SgMKLDNNConvQuantizedOp(const NodeAttrs& attrs) { return node; } -bool SgMKLDNNAvoidConvQuantizeInput(const NodeAttrs& attrs, - const size_t index, - const std::string quantize_granularity) { - auto const& param = nnvm::get(attrs.parsed); +bool SgDNNLAvoidConvQuantizeInput(const NodeAttrs& attrs, + const size_t index, + const std::string quantize_granularity) { + auto const& param = nnvm::get(attrs.parsed); std::unordered_set avoid_indice; size_t idx = 0; idx++; // data @@ -726,7 +721,7 @@ bool SgMKLDNNAvoidConvQuantizeInput(const NodeAttrs& attrs, if (!param.full_conv_param.conv_param.no_bias) { avoid_indice.insert(idx++); // bias } - if (param.full_conv_param.mkldnn_param.with_bn) { + if (param.full_conv_param.dnnl_param.with_bn) { avoid_indice.insert(idx++); // gamma avoid_indice.insert(idx++); // beta avoid_indice.insert(idx++); // mean @@ -735,22 +730,22 @@ bool SgMKLDNNAvoidConvQuantizeInput(const NodeAttrs& attrs, return avoid_indice.count(index); } -NNVM_REGISTER_OP(_sg_mkldnn_conv) - .describe(R"code(_sg_mkldnn_conv)code" ADD_FILELINE) - .set_num_inputs(SgMKLDNNConvNumInputs) +NNVM_REGISTER_OP(_sg_onednn_conv) + .describe(R"code(_sg_onednn_conv)code" ADD_FILELINE) + .set_num_inputs(SgDNNLConvNumInputs) .set_num_outputs([](const NodeAttrs& attrs) { - auto const& param = nnvm::get(attrs.parsed); - return param.full_conv_param.mkldnn_param.quantized ? 3 : 1; + auto const& param = nnvm::get(attrs.parsed); + return param.full_conv_param.dnnl_param.quantized ? 3 : 1; }) - .set_attr_parser(SgMKLDNNConvParamParser) - .set_attr("FListInputNames", SgMKLDNNConvListInputNames) - .set_attr("FListOutputNames", SgMKLDNNConvListOutputNames) - .set_attr("FCreateOpState", CreateSgMKLDNNConvState) - .set_attr("FInferShape", SgMKLDNNConvInferShape) - .set_attr("FInferType", SgMKLDNNConvInferType) - .set_attr("FInferStorageType", SgMKLDNNConvOpStorageType) - .set_attr("FStatefulComputeEx", SgMKLDNNConvOpForward) - .set_attr("TIsMKLDNN", true) + .set_attr_parser(SgDNNLConvParamParser) + .set_attr("FListInputNames", SgDNNLConvListInputNames) + .set_attr("FListOutputNames", SgDNNLConvListOutputNames) + .set_attr("FCreateOpState", CreateSgDNNLConvState) + .set_attr("FInferShape", SgDNNLConvInferShape) + .set_attr("FInferType", SgDNNLConvInferType) + .set_attr("FInferStorageType", SgDNNLConvOpStorageType) + .set_attr("FStatefulComputeEx", SgDNNLConvOpForward) + .set_attr("TIsDNNL", true) // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. 
.set_attr("FGradient", MakeZeroGradNodes) @@ -760,12 +755,12 @@ NNVM_REGISTER_OP(_sg_mkldnn_conv) }) .set_attr("FMutateInputs", DefaultSubgraphOpMutableInputs) .set_attr("key_var_num_args", "num_args") - .set_attr("FInplaceOption", SgMKLDNNConvInplaceOption) + .set_attr("FInplaceOption", SgDNNLConvInplaceOption) .set_attr("FQuantizable", [](const NodeAttrs& attrs) { return QuantizeType::kMust; }) - .set_attr("FQuantizedOp", SgMKLDNNConvQuantizedOp) + .set_attr("FQuantizedOp", SgDNNLConvQuantizedOp) .set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; }) - .set_attr("FAvoidQuantizeInput", SgMKLDNNAvoidConvQuantizeInput); + .set_attr("FAvoidQuantizeInput", SgDNNLAvoidConvQuantizeInput); } // namespace op } // namespace mxnet diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv_property.h b/src/operator/subgraph/dnnl/dnnl_conv_property.h similarity index 90% rename from src/operator/subgraph/mkldnn/mkldnn_conv_property.h rename to src/operator/subgraph/dnnl/dnnl_conv_property.h index d738620f903f..3bb08a5eb373 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_conv_property.h +++ b/src/operator/subgraph/dnnl/dnnl_conv_property.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_CONV_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_CONV_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_CONV_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_CONV_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -27,15 +27,14 @@ #include "../../leaky_relu-inl.h" #include "../../nn/activation-inl.h" #include "../../nn/convolution-inl.h" -#include "../../nn/mkldnn/mkldnn_ops-inl.h" +#include "../../nn/dnnl/dnnl_ops-inl.h" #include "../../tensor/matrix_op-inl.h" #include "../common.h" - -#include "mkldnn_subgraph_base-inl.h" +#include "dnnl_subgraph_base-inl.h" namespace mxnet { namespace op { -class SgMKLDNNConvSelector : public SubgraphSelector { +class SgDNNLConvSelector : public SubgraphSelector { public: /*! \brief pattern match status_ */ enum SelectStatus { @@ -56,11 +55,7 @@ class SgMKLDNNConvSelector : public SubgraphSelector { std::vector matched_list_; public: - SgMKLDNNConvSelector(int dis_all, - int dis_conv_bn, - int dis_conv_act, - int dis_conv_sum, - int quantize) + SgDNNLConvSelector(int dis_all, int dis_conv_bn, int dis_conv_act, int dis_conv_sum, int quantize) : disable_all_(dis_all), disable_conv_bn_(dis_conv_bn), disable_conv_act_(dis_conv_act), @@ -70,7 +65,7 @@ class SgMKLDNNConvSelector : public SubgraphSelector { bool Select(const nnvm::Node& n, const std::shared_ptr& node_attr) override { if (n.op() && n.op()->name == "Convolution") { const auto& param = nnvm::get(n.attrs.parsed); - if ((param.kernel.ndim() == 2 || param.kernel.ndim() == 3) && SupportMKLDNNAttr(node_attr)) { + if ((param.kernel.ndim() == 2 || param.kernel.ndim() == 3) && SupportDNNLAttr(node_attr)) { status_ = disable_all_ ? kSuccess : kStart; matched_list_.clear(); matched_list_.push_back(&n); @@ -119,8 +114,8 @@ class SgMKLDNNConvSelector : public SubgraphSelector { default: if ((!disable_conv_act_) && node_name == "Activation") { const ActivationParam& param = nnvm::get(new_node.attrs.parsed); - if ((quantize_ && SupportQuantizedMKLDNNAct(param)) || - (!quantize_ && SupportMKLDNNAct(param))) { + if ((quantize_ && SupportQuantizedDNNLAct(param)) || + (!quantize_ && SupportDNNLAct(param))) { matched_list_.push_back(&new_node); // not support conv+relu+sum yet. 
status_ = kSuccess; @@ -170,16 +165,16 @@ class SgMKLDNNConvSelector : public SubgraphSelector { void Reset() override { CHECK_GE(matched_list_.size(), 1); - auto new_selector = SgMKLDNNConvSelector( + auto new_selector = SgDNNLConvSelector( disable_all_, disable_conv_bn_, disable_conv_act_, disable_conv_sum_, quantize_); new_selector.Select(*matched_list_[0], nullptr); *this = new_selector; } }; -class SgMKLDNNConvProperty : public SubgraphProperty { +class SgDNNLConvProperty : public SubgraphProperty { public: - SgMKLDNNConvProperty() { + SgDNNLConvProperty() { disable_conv_bn_ = dmlc::GetEnv("MXNET_DISABLE_ONEDNN_FUSE_CONV_BN", 0); disable_conv_act_ = dmlc::GetEnv("MXNET_DISABLE_ONEDNN_FUSE_CONV_RELU", 0); disable_conv_sum_ = dmlc::GetEnv("MXNET_DISABLE_ONEDNN_FUSE_CONV_SUM", 0); @@ -187,8 +182,8 @@ class SgMKLDNNConvProperty : public SubgraphProperty { disable_all_ = disable_conv_bn_ && disable_conv_act_ && disable_conv_sum_; } static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN convolution optimization pass"; - auto property = std::make_shared(); + static const std::string& name = "oneDNN convolution optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); if (dmlc::GetEnv("MXNET_DISABLE_ONEDNN_CONV_OPT", 0)) { @@ -204,7 +199,7 @@ class SgMKLDNNConvProperty : public SubgraphProperty { nnvm::Symbol new_sym; new_sym.outputs.emplace_back(last_node); std::ostringstream node_name; - node_name << "sg_mkldnn_"; + node_name << "sg_onednn_"; bool _with_sum = false; DFSVisit(new_sym.outputs, [&](const nnvm::ObjectPtr& node) { if (node->is_variable()) @@ -230,7 +225,7 @@ class SgMKLDNNConvProperty : public SubgraphProperty { }); node_name << std::to_string(subgraph_id); n->attrs.name = node_name.str(); - n->attrs.op = Op::Get("_sg_mkldnn_conv"); + n->attrs.op = Op::Get("_sg_onednn_conv"); CHECK(n->attrs.op); n->attrs.subgraphs.emplace_back(std::make_shared(new_sym)); n->op()->attr_parser(&(n->attrs)); @@ -239,7 +234,7 @@ class SgMKLDNNConvProperty : public SubgraphProperty { SubgraphSelectorPtr CreateSubgraphSelector() const override { bool quantize = HasAttr("quantize") ? GetAttr("quantize") : false; - auto selector = std::make_shared( + auto selector = std::make_shared( disable_all_, disable_conv_bn_, disable_conv_act_, disable_conv_sum_, quantize); return selector; } @@ -299,4 +294,4 @@ class SgMKLDNNConvProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_CONV_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_CONV_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h b/src/operator/subgraph/dnnl/dnnl_elemwisemul_post_quantize_property.h similarity index 94% rename from src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h rename to src/operator/subgraph/dnnl/dnnl_elemwisemul_post_quantize_property.h index dd3042067c69..5e015cbf14e1 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h +++ b/src/operator/subgraph/dnnl/dnnl_elemwisemul_post_quantize_property.h @@ -18,13 +18,13 @@ */ /*! 
- * \file mkldnn_elemwisemul_post_quantize_property.cc - * \brief Partition gragph property for MKLDNN Quantized ElemwiseMul operator + * \file dnnl_elemwisemul_post_quantize_property.cc + * \brief Partition gragph property for oneDNN Quantized ElemwiseMul operator * \author Xinyu Chen */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -34,8 +34,7 @@ #include "../../quantization/requantize-inl.h" #include "../../tensor/elemwise_binary_op-inl.h" #include "../common.h" - -#include "mkldnn_subgraph_base-inl.h" +#include "dnnl_subgraph_base-inl.h" namespace mxnet { namespace op { @@ -161,7 +160,7 @@ class ElemwiseMulPostQuantizeProperty : public SubgraphProperty { } static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN EltwiseMul post-quantization optimization pass"; + static const std::string& name = "oneDNN EltwiseMul post-quantization optimization pass"; auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); @@ -229,4 +228,4 @@ class ElemwiseMulPostQuantizeProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc-inl.h b/src/operator/subgraph/dnnl/dnnl_fc-inl.h similarity index 61% rename from src/operator/subgraph/mkldnn/mkldnn_fc-inl.h rename to src/operator/subgraph/dnnl/dnnl_fc-inl.h index fdb90eedb7cb..ba1beac46fc0 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_fc-inl.h +++ b/src/operator/subgraph/dnnl/dnnl_fc-inl.h @@ -17,22 +17,21 @@ * under the License. 
*/ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_FC_INL_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_FC_INL_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_FC_INL_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_FC_INL_H_ #if MXNET_USE_ONEDNN == 1 #include #include #include -#include "../../nn/mkldnn/mkldnn_fully_connected-inl.h" - -#include "mkldnn.hpp" +#include "../../nn/dnnl/dnnl_fully_connected-inl.h" +#include "dnnl.hpp" namespace mxnet { namespace op { -static inline bool SupportMKLDNNFCEltwiseFusion(const std::string op_name) { +static inline bool SupportDNNLFCEltwiseFusion(const std::string op_name) { if (op_name == "Activation" || op_name == "square" || op_name == "_npi_square" || op_name == "sqrt" || op_name == "_npi_sqrt" || op_name == "exp" || op_name == "_npi_exp" || op_name == "abs" || op_name == "_npi_absolute" || op_name == "clip" || @@ -43,30 +42,29 @@ static inline bool SupportMKLDNNFCEltwiseFusion(const std::string op_name) { } } -static inline mkldnn::algorithm GetMKLDNNEltwiseAlgo(const std::string op_name) { +static inline dnnl::algorithm GetDNNLEltwiseAlgo(const std::string op_name) { if (op_name == "square" || op_name == "_npi_square") - return mkldnn::algorithm::eltwise_square; + return dnnl::algorithm::eltwise_square; else if (op_name == "sqrt" || op_name == "_npi_sqrt") - return mkldnn::algorithm::eltwise_sqrt; + return dnnl::algorithm::eltwise_sqrt; else if (op_name == "exp" || op_name == "_npi_exp") - return mkldnn::algorithm::eltwise_exp; + return dnnl::algorithm::eltwise_exp; else if (op_name == "abs" || op_name == "_npi_absolute") - return mkldnn::algorithm::eltwise_abs; + return dnnl::algorithm::eltwise_abs; else LOG(FATAL) << "Unsupported eltwise fusion op: " << op_name; - return mkldnn::algorithm::undef; + return dnnl::algorithm::undef; } -static inline bool IsOutputUint8(const MKLDNNFCFullParam& full_param) { +static inline bool IsOutputUint8(const DNNLFCFullParam& full_param) { auto alg = full_param.eltwise_param.alg; // TODO(ciyong): some alg doesn't support int8 so far. - if (full_param.mkldnn_param.with_eltwise && - (alg == mkldnn::algorithm::eltwise_relu || alg == mkldnn::algorithm::eltwise_logistic || - alg == mkldnn::algorithm::eltwise_soft_relu || - alg == mkldnn::algorithm::eltwise_bounded_relu || alg == mkldnn::algorithm::eltwise_square || - alg == mkldnn::algorithm::eltwise_sqrt || alg == mkldnn::algorithm::eltwise_exp || - alg == mkldnn::algorithm::eltwise_abs)) { + if (full_param.dnnl_param.with_eltwise && + (alg == dnnl::algorithm::eltwise_relu || alg == dnnl::algorithm::eltwise_logistic || + alg == dnnl::algorithm::eltwise_soft_relu || alg == dnnl::algorithm::eltwise_bounded_relu || + alg == dnnl::algorithm::eltwise_square || alg == dnnl::algorithm::eltwise_sqrt || + alg == dnnl::algorithm::eltwise_exp || alg == dnnl::algorithm::eltwise_abs)) { return true; } @@ -77,4 +75,4 @@ static inline bool IsOutputUint8(const MKLDNNFCFullParam& full_param) { } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_FC_INL_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_FC_INL_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc.cc b/src/operator/subgraph/dnnl/dnnl_fc.cc similarity index 67% rename from src/operator/subgraph/mkldnn/mkldnn_fc.cc rename to src/operator/subgraph/dnnl/dnnl_fc.cc index b90418ec3934..24910e71b9d4 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_fc.cc +++ b/src/operator/subgraph/dnnl/dnnl_fc.cc @@ -18,8 +18,8 @@ */ /*! 
- * \file mkldnn_fc.cc - * \brief MKLDNN (Quantized) FullyConnected operator based on subgraph + * \file dnnl_fc.cc + * \brief DNNL (Quantized) FullyConnected operator based on subgraph * \author Ciyong Chen */ @@ -29,25 +29,23 @@ #include #include -#include "../../nn/mkldnn/mkldnn_act-inl.h" -#include "../../nn/mkldnn/mkldnn_base-inl.h" -#include "../../nn/mkldnn/mkldnn_fully_connected-inl.h" -#include "../../nn/mkldnn/mkldnn_ops-inl.h" +#include "../../nn/dnnl/dnnl_act-inl.h" +#include "../../nn/dnnl/dnnl_base-inl.h" +#include "../../nn/dnnl/dnnl_fully_connected-inl.h" +#include "../../nn/dnnl/dnnl_ops-inl.h" #include "../../quantization/quantization_utils.h" #include "../../tensor/matrix_op-inl.h" #include "../common.h" - -#include "mkldnn_common.h" -#include "mkldnn_fc-inl.h" +#include "dnnl_common.h" +#include "dnnl_fc-inl.h" namespace mxnet { namespace op { -class SgMKLDNNFCOp { +class SgDNNLFCOp { public: - explicit SgMKLDNNFCOp(const nnvm::NodeAttrs& attrs) - : subgraph_sym_(*attrs.subgraphs[0]), - full_param_(nnvm::get(attrs.parsed)) {} + explicit SgDNNLFCOp(const nnvm::NodeAttrs& attrs) + : subgraph_sym_(*attrs.subgraphs[0]), full_param_(nnvm::get(attrs.parsed)) {} void Forward(const OpContext& ctx, const std::vector& inputs, @@ -58,7 +56,7 @@ class SgMKLDNNFCOp { const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - LOG(FATAL) << "Not implemented: subgraph mkldnn fully connected only supports " + LOG(FATAL) << "Not implemented: subgraph dnnl fully connected only supports " "inference computation."; } @@ -67,11 +65,11 @@ class SgMKLDNNFCOp { bool channel_wise_runtime_{false}; bool reorder_data_{false}; nnvm::Symbol subgraph_sym_; - MKLDNNFCFullParam full_param_; - mkldnn_args_map_t args_; - std::shared_ptr fwd_; - std::shared_ptr cached_data_mem_; - std::shared_ptr cached_out_mem_; + DNNLFCFullParam full_param_; + dnnl_args_map_t args_; + std::shared_ptr fwd_; + std::shared_ptr cached_data_mem_; + std::shared_ptr cached_out_mem_; NDArray cached_weight_; NDArray cached_bias_; float cached_min_data_; @@ -90,11 +88,11 @@ class SgMKLDNNFCOp { size_t total_num_outputs_; }; -void SgMKLDNNFCOp::Forward(const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - auto& mkldnn_param = full_param_.mkldnn_param; +void SgDNNLFCOp::Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + auto& dnnl_param = full_param_.dnnl_param; auto& default_param = full_param_.default_param; bool has_bias = !default_param.no_bias; size_t base_num_inputs = has_bias ? 3 : 2; @@ -108,16 +106,16 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, float max_bias = 0.0f; if (!initialized_) { - if (mkldnn_param.channel_wise_quantize.has_value() && mkldnn_param.channel_wise_quantize) { + if (dnnl_param.channel_wise_quantize.has_value() && dnnl_param.channel_wise_quantize) { channel_wise_runtime_ = true; } total_num_inputs_ = base_num_inputs; total_num_outputs_ = base_num_outputs; - if (mkldnn_param.quantized) { + if (dnnl_param.quantized) { total_num_inputs_ = channel_wise_runtime_ ? (base_num_inputs + 2) : (base_num_inputs * 3); total_num_outputs_ = - mkldnn_param.enable_float_output ? base_num_outputs : (base_num_outputs * 3); + dnnl_param.enable_float_output ? 
base_num_outputs : (base_num_outputs * 3); } } CHECK_EQ(in_data.size(), total_num_inputs_); @@ -127,7 +125,7 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, const NDArray& weight = in_data[fullc::kWeight]; const NDArray& output = out_data[fullc::kOut]; - if (mkldnn_param.quantized) { + if (dnnl_param.quantized) { if (!channel_wise_runtime_) { min_weight = in_data[base_num_inputs + quantized_fullc::kWeightMin].data().dptr()[0]; max_weight = in_data[base_num_inputs + quantized_fullc::kWeightMax].data().dptr()[0]; @@ -140,8 +138,7 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, max_data = in_data[base_num_inputs + quantized_fullc::kDataMax].data().dptr()[0]; } - if (initialized_ && mkldnn_param.quantized && - dmlc::GetEnv("MXNET_ONEDNN_QFC_DYNAMIC_PARAMS", 0)) { + if (initialized_ && dnnl_param.quantized && dmlc::GetEnv("MXNET_ONEDNN_QFC_DYNAMIC_PARAMS", 0)) { if (channel_wise_runtime_) { if (cached_min_data_ != min_data || cached_max_data_ != max_data || weight_ver_ != weight.version() || @@ -176,22 +173,22 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, } const mxnet::TShape ishape = data.shape(); const auto data_ndim = ishape.ndim(); - if (data.IsMKLDNNData()) { + if (data.IsDNNLData()) { reorder_data_ = true; data = data.Reorder2Default(); } if (data_ndim != 2) { if (!default_param.flatten) { - data = data.MKLDNNDataReshape( - Shape2(ishape.ProdShape(0, data_ndim - 1), ishape[data_ndim - 1])); + data = + data.DNNLDataReshape(Shape2(ishape.ProdShape(0, data_ndim - 1), ishape[data_ndim - 1])); } else { - data = data.MKLDNNDataReshape(Shape2(ishape[0], ishape.ProdShape(1, data_ndim))); + data = data.DNNLDataReshape(Shape2(ishape[0], ishape.ProdShape(1, data_ndim))); } } // create cached out_md const mxnet::TShape oshape = output.shape(); - mkldnn::memory::dims out_dims(2); + dnnl::memory::dims out_dims(2); if (oshape.ndim() == 2) { out_dims[0] = static_cast(oshape[0]); out_dims[1] = static_cast(oshape[1]); @@ -204,26 +201,26 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, out_dims[1] = static_cast(oshape.ProdShape(1, oshape.ndim())); } } - mkldnn::memory::desc out_md = - mkldnn::memory::desc(out_dims, - get_mkldnn_type(output.dtype()), - static_cast(GetDefaultFormat(2))); - cached_out_mem_ = std::make_shared(out_md, engine); + dnnl::memory::desc out_md = + dnnl::memory::desc(out_dims, + get_dnnl_type(output.dtype()), + static_cast(GetDefaultFormat(2))); + cached_out_mem_ = std::make_shared(out_md, engine); bool support_channelwise_scale = false; - if (mkldnn_param.quantized) { + if (dnnl_param.quantized) { CHECK(data.dtype() == mshadow::kInt8 || data.dtype() == mshadow::kUint8); data_scale_ = GetQuantizeScale(data.dtype(), cached_min_data_, cached_max_data_); bool fuse_requantize = false; // Channelwise scaling is only supported when fusion is enabled (requantize or dequantize). 
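For orientation, the data_scale_ computed just above maps a calibrated float range onto the quantized integer domain. The following is a minimal, self-contained sketch of that mapping; it assumes the usual symmetric int8 range of ±127 and the uint8 range of 0..255, and ExampleQuantizeScale is an illustrative stand-in rather than the GetQuantizeScale helper from quantization_utils.h that this operator actually calls.

#include <algorithm>
#include <cmath>

// Illustrative stand-in: derive the multiplier that maps calibrated float
// values onto an assumed quantized integer range (127 for int8, 255 for uint8).
inline float ExampleQuantizeScale(bool is_uint8, float min_calib, float max_calib) {
  const float quantized_range = is_uint8 ? 255.0f : 127.0f;  // assumed target ranges
  const float real_range      = std::max(std::fabs(min_calib), std::fabs(max_calib));
  return real_range > 0.0f ? quantized_range / real_range : 1.0f;
}

// Usage: with a calibrated activation range of [-2.5, 4.0] and int8 data,
// the scale is 127 / 4.0 = 31.75, so a value x quantizes to round(x * 31.75).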
- if (mkldnn_param.min_calib_range.has_value() && mkldnn_param.max_calib_range.has_value()) { - cached_min_output_ = mkldnn_param.min_calib_range.value(); - cached_max_output_ = mkldnn_param.max_calib_range.value(); + if (dnnl_param.min_calib_range.has_value() && dnnl_param.max_calib_range.has_value()) { + cached_min_output_ = dnnl_param.min_calib_range.value(); + cached_max_output_ = dnnl_param.max_calib_range.value(); support_channelwise_scale = true; fuse_requantize = true; } - if (mkldnn_param.enable_float_output) { + if (dnnl_param.enable_float_output) { support_channelwise_scale = true; } // channel_wise support_channelwise_scale result @@ -255,7 +252,7 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, if (has_bias) { float bias_scale = GetQuantizeScale(mshadow::kInt8, cached_min_bias_, cached_max_bias_); float bias_int32_rescale = data_scale_ * weight_scales_[0] / bias_scale; - // TODO(zhennan): mkldnn has bug to handle INT_MAX in bias, so set the maximum value + // TODO(zhennan): dnnl has bug to handle INT_MAX in bias, so set the maximum value // of bias to INT_MAX / 2. float bias_max_rescale = MaxValue() / 2 / MaxAbs(cached_min_bias_, cached_max_bias_) / bias_scale; @@ -286,10 +283,10 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, } size_t num_channel = cached_weight_.shape()[0]; - if (fuse_requantize || mkldnn_param.enable_float_output) { + if (fuse_requantize || dnnl_param.enable_float_output) { float tmp_scale_ = 1.0f; if (fuse_requantize) { - if (mkldnn_param.with_eltwise) { + if (dnnl_param.with_eltwise) { tmp_scale_ = 1.0 / data_scale_; full_param_.eltwise_param.scale = GetQuantizeScale(output.dtype(), cached_min_output_, cached_max_output_); @@ -338,48 +335,48 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, } } - fwd_.reset(new MKLDNNFullyConnectedForward(full_param_, - ctx.is_train, - data, - cached_weight_, - (has_bias ? &cached_bias_ : nullptr), - out_md)); + fwd_.reset(new DNNLFullyConnectedForward(full_param_, + ctx.is_train, + data, + cached_weight_, + (has_bias ? &cached_bias_ : nullptr), + out_md)); - // convert weight and bias to the format that MKL-DNN requires - if (!mkldnn_param.quantized || support_channelwise_scale) { - mkldnn::memory::desc bias_md; + // convert weight and bias to the format that DNNL requires + if (!dnnl_param.quantized || support_channelwise_scale) { + dnnl::memory::desc bias_md; if (has_bias) bias_md = fwd_->fwd_pd.bias_desc(); - ConvertWeightBias2MKLDNN(&cached_weight_, - &cached_bias_, - has_bias, - fwd_->fwd_pd.weights_desc(), - has_bias ? &bias_md : nullptr, - 1, - data_scale_, - weight_scales_, - false); + ConvertWeightBias2DNNL(&cached_weight_, + &cached_bias_, + has_bias, + fwd_->fwd_pd.weights_desc(), + has_bias ? 
&bias_md : nullptr, + 1, + data_scale_, + weight_scales_, + false); } else { - const auto def_weight_mem = weight.GetMKLDNNData(); + const auto def_weight_mem = weight.GetDNNLData(); if (def_weight_mem->get_desc() != fwd_->fwd_pd.weights_desc()) { cached_weight_ = NDArray(fwd_->fwd_pd.weights_desc()); - auto cached_weight_mem = cached_weight_.GetMKLDNNData(); - std::unordered_map args( - {{MKLDNN_ARG_FROM, *def_weight_mem}, {MKLDNN_ARG_TO, *cached_weight_mem}}); - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*def_weight_mem, *cached_weight_mem), - args); + auto cached_weight_mem = cached_weight_.GetDNNLData(); + std::unordered_map args( + {{DNNL_ARG_FROM, *def_weight_mem}, {DNNL_ARG_TO, *cached_weight_mem}}); + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(*def_weight_mem, *cached_weight_mem), + args); } } - const auto data_mem = data.GetMKLDNNData(); - cached_data_mem_ = std::make_shared(data_mem->get_desc(), engine); + const auto data_mem = data.GetDNNLData(); + cached_data_mem_ = std::make_shared(data_mem->get_desc(), engine); - args_[MKLDNN_ARG_SRC] = *cached_data_mem_; - args_[MKLDNN_ARG_WEIGHTS] = *cached_weight_.GetMKLDNNData(); + args_[DNNL_ARG_SRC] = *cached_data_mem_; + args_[DNNL_ARG_WEIGHTS] = *cached_weight_.GetDNNLData(); if (has_bias) - args_[MKLDNN_ARG_BIAS] = *cached_bias_.GetMKLDNNData(); - args_[MKLDNN_ARG_DST] = *cached_out_mem_; - initialized_ = true; + args_[DNNL_ARG_BIAS] = *cached_bias_.GetDNNLData(); + args_[DNNL_ARG_DST] = *cached_out_mem_; + initialized_ = true; } if (reorder_data_) { @@ -391,10 +388,10 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, MSHADOW_TYPE_SWITCH(output.dtype(), DType, { cached_out_mem_->set_data_handle(reinterpret_cast(output.data().dptr())); }); - MKLDNNStream::Get()->RegisterPrimArgs(fwd_->GetFwd(), args_); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->RegisterPrimArgs(fwd_->GetFwd(), args_); + DNNLStream::Get()->Submit(); - if (mkldnn_param.quantized && !mkldnn_param.enable_float_output) { + if (dnnl_param.quantized && !dnnl_param.enable_float_output) { float* min_output_ptr = out_data[quantized_fullc::kOutMin].data().dptr(); float* max_output_ptr = out_data[quantized_fullc::kOutMax].data().dptr(); *min_output_ptr = cached_min_output_; @@ -402,7 +399,7 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx, } } -static void SgMKLDNNFCParamParser(nnvm::NodeAttrs* attrs) { +static void SgDNNLFCParamParser(nnvm::NodeAttrs* attrs) { // For backward compatible, with_relu->with_eltwise auto legacy = attrs->dict.find("with_relu"); if (legacy != attrs->dict.end()) { @@ -410,9 +407,9 @@ static void SgMKLDNNFCParamParser(nnvm::NodeAttrs* attrs) { attrs->dict.erase(legacy); } - MKLDNNFCFullParam full_param; + DNNLFCFullParam full_param; try { - full_param.mkldnn_param.Init(attrs->dict); + full_param.dnnl_param.Init(attrs->dict); } catch (const dmlc::ParamError& e) { std::ostringstream os; os << e.what(); @@ -431,33 +428,33 @@ static void SgMKLDNNFCParamParser(nnvm::NodeAttrs* attrs) { auto& op_name = node->op()->name; if (op_name == "FullyConnected") { full_param.default_param = nnvm::get(node->attrs.parsed); - } else if (SupportMKLDNNFCEltwiseFusion(op_name)) { + } else if (SupportDNNLFCEltwiseFusion(op_name)) { if (op_name == "Activation") { const ActivationParam act_param = nnvm::get(node->attrs.parsed); - full_param.eltwise_param.alg = GetMKLDNNActAlgo(act_param); + full_param.eltwise_param.alg = GetDNNLActAlgo(act_param); } else if (op_name == "LeakyReLU") { const auto act_param = nnvm::get(node->attrs.parsed); 
full_param.eltwise_param.alpha = act_param.slope; - full_param.eltwise_param.alg = GetMKLDNNActAlgo(act_param); + full_param.eltwise_param.alg = GetDNNLActAlgo(act_param); } else if (op_name == "clip") { const ClipParam clip_param = nnvm::get(node->attrs.parsed); - full_param.eltwise_param.alg = mkldnn::algorithm::eltwise_bounded_relu; + full_param.eltwise_param.alg = dnnl::algorithm::eltwise_bounded_relu; full_param.eltwise_param.alpha = clip_param.a_max; } else { - full_param.eltwise_param.alg = GetMKLDNNEltwiseAlgo(op_name); + full_param.eltwise_param.alg = GetDNNLEltwiseAlgo(op_name); } } }); attrs->parsed = std::move(full_param); } -static std::vector SgMKLDNNFCListInputNames(const NodeAttrs& attrs) { - auto const& full_param = nnvm::get(attrs.parsed); +static std::vector SgDNNLFCListInputNames(const NodeAttrs& attrs) { + auto const& full_param = nnvm::get(attrs.parsed); std::vector input_names = DefaultSubgraphOpListInputs(attrs); - if (full_param.mkldnn_param.quantized) { + if (full_param.dnnl_param.quantized) { bool channel_wise = false; - if (full_param.mkldnn_param.channel_wise_quantize.has_value() && - full_param.mkldnn_param.channel_wise_quantize) { + if (full_param.dnnl_param.channel_wise_quantize.has_value() && + full_param.dnnl_param.channel_wise_quantize) { channel_wise = true; } input_names.emplace_back("min_data"); @@ -474,10 +471,10 @@ static std::vector SgMKLDNNFCListInputNames(const NodeAttrs& attrs) return input_names; } -static std::vector SgMKLDNNFCListOutputNames(const NodeAttrs& attrs) { - auto const& full_param = nnvm::get(attrs.parsed); - if (full_param.mkldnn_param.quantized) { - if (full_param.mkldnn_param.enable_float_output) +static std::vector SgDNNLFCListOutputNames(const NodeAttrs& attrs) { + auto const& full_param = nnvm::get(attrs.parsed); + if (full_param.dnnl_param.quantized) { + if (full_param.dnnl_param.enable_float_output) return std::vector{"output"}; else return std::vector{"output", "min_output", "max_output"}; @@ -500,11 +497,11 @@ static inline void FillBaseInputOutputInfo(const FullyConnectedParam& param, } } -static bool SgMKLDNNFCInferShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector* in_shapes, - mxnet::ShapeVector* out_shapes) { - auto const& full_param = nnvm::get(attrs.parsed); - if (full_param.mkldnn_param.quantized) { +static bool SgDNNLFCInferShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_shapes, + mxnet::ShapeVector* out_shapes) { + auto const& full_param = nnvm::get(attrs.parsed); + if (full_param.dnnl_param.quantized) { mxnet::ShapeVector base_in_shapes; mxnet::ShapeVector base_out_shapes; FillBaseInputOutputInfo( @@ -519,7 +516,7 @@ static bool SgMKLDNNFCInferShape(const nnvm::NodeAttrs& attrs, } out_shapes->at(0) = base_out_shapes[0]; - if (!full_param.mkldnn_param.enable_float_output) { + if (!full_param.dnnl_param.enable_float_output) { SHAPE_ASSIGN_CHECK(*out_shapes, 1, Shape1(1)); SHAPE_ASSIGN_CHECK(*out_shapes, 2, Shape1(1)); } @@ -529,14 +526,14 @@ static bool SgMKLDNNFCInferShape(const nnvm::NodeAttrs& attrs, } } -static bool SgMKLDNNFCInferType(const nnvm::NodeAttrs& attrs, - std::vector* in_types, - std::vector* out_types) { - auto const& full_param = nnvm::get(attrs.parsed); - if (full_param.mkldnn_param.quantized) { +static bool SgDNNLFCInferType(const nnvm::NodeAttrs& attrs, + std::vector* in_types, + std::vector* out_types) { + auto const& full_param = nnvm::get(attrs.parsed); + if (full_param.dnnl_param.quantized) { bool channel_wise = false; - if 
(full_param.mkldnn_param.channel_wise_quantize.has_value() && - full_param.mkldnn_param.channel_wise_quantize) { + if (full_param.dnnl_param.channel_wise_quantize.has_value() && + full_param.dnnl_param.channel_wise_quantize) { channel_wise = true; } size_t base_num_inputs = full_param.default_param.no_bias ? 2 : 3; @@ -555,11 +552,11 @@ static bool SgMKLDNNFCInferType(const nnvm::NodeAttrs& attrs, } } - if (full_param.mkldnn_param.enable_float_output) { + if (full_param.dnnl_param.enable_float_output) { TYPE_ASSIGN_CHECK(*out_types, 0, mshadow::kFloat32); } else { - if (full_param.mkldnn_param.min_calib_range.has_value() && - full_param.mkldnn_param.max_calib_range.has_value()) { + if (full_param.dnnl_param.min_calib_range.has_value() && + full_param.dnnl_param.max_calib_range.has_value()) { if (IsOutputUint8(full_param)) { TYPE_ASSIGN_CHECK(*out_types, 0, mshadow::kUint8); } else { @@ -577,13 +574,13 @@ static bool SgMKLDNNFCInferType(const nnvm::NodeAttrs& attrs, } } -static bool SgMKLDNNFCStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - auto const& full_param = nnvm::get(attrs.parsed); - if (full_param.mkldnn_param.quantized) { +static bool SgDNNLFCStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + auto const& full_param = nnvm::get(attrs.parsed); + if (full_param.dnnl_param.quantized) { std::vector base_in_attrs; std::vector base_out_attrs; FillBaseInputOutputInfo( @@ -599,7 +596,7 @@ static bool SgMKLDNNFCStorageType(const nnvm::NodeAttrs& attrs, } out_attrs->at(0) = base_out_attrs[0]; - if (!full_param.mkldnn_param.enable_float_output) { + if (!full_param.dnnl_param.enable_float_output) { type_assign(&out_attrs->at(1), mxnet::kDefaultStorage); type_assign(&out_attrs->at(2), mxnet::kDefaultStorage); } @@ -609,25 +606,25 @@ static bool SgMKLDNNFCStorageType(const nnvm::NodeAttrs& attrs, } } -static OpStatePtr CreateSgMKLDNNFCState(const nnvm::NodeAttrs& attrs, - Context ctx, - const mxnet::ShapeVector& in_shapes, - const std::vector& in_types) { - return OpStatePtr::Create(attrs); +static OpStatePtr CreateSgDNNLFCState(const nnvm::NodeAttrs& attrs, + Context ctx, + const mxnet::ShapeVector& in_shapes, + const std::vector& in_types) { + return OpStatePtr::Create(attrs); } -static void SgMKLDNNFCForward(const OpStatePtr& state_pointer, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - SgMKLDNNFCOp& op = state_pointer.get_state(); +static void SgDNNLFCForward(const OpStatePtr& state_pointer, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SgDNNLFCOp& op = state_pointer.get_state(); op.Forward(ctx, inputs, req, outputs); } -nnvm::ObjectPtr SgMKLDNNFCQuantizedOp(const NodeAttrs& attrs) { +nnvm::ObjectPtr SgDNNLFCQuantizedOp(const NodeAttrs& attrs) { nnvm::ObjectPtr node = nnvm::Node::Create(); - node->attrs.op = Op::Get("_sg_mkldnn_fully_connected"); + node->attrs.op = Op::Get("_sg_onednn_fully_connected"); node->attrs.name = "quantized_" + attrs.name; node->attrs.dict = attrs.dict; node->attrs.dict["quantized"] = "True"; @@ -639,10 +636,10 @@ nnvm::ObjectPtr SgMKLDNNFCQuantizedOp(const NodeAttrs& attrs) { return node; } -static bool SgMKLDNNAvoidFCQuantizeInput(const NodeAttrs& attrs, - const size_t index_to_check, - const std::string quantize_granularity) { - auto 
const& full_param = nnvm::get(attrs.parsed); +static bool SgDNNLAvoidFCQuantizeInput(const NodeAttrs& attrs, + const size_t index_to_check, + const std::string quantize_granularity) { + auto const& full_param = nnvm::get(attrs.parsed); std::unordered_set avoid_indexes; if (quantize_granularity == "channel-wise") { avoid_indexes.insert(fullc::kWeight); // weight @@ -654,14 +651,14 @@ static bool SgMKLDNNAvoidFCQuantizeInput(const NodeAttrs& attrs, return avoid_indexes.count(index_to_check); } -NNVM_REGISTER_OP(_sg_mkldnn_fully_connected) - .describe(R"code(_sg_mkldnn_fully_connected)code" ADD_FILELINE) +NNVM_REGISTER_OP(_sg_onednn_fully_connected) + .describe(R"code(_sg_onednn_fully_connected)code" ADD_FILELINE) .set_num_inputs([](const NodeAttrs& attrs) { - auto const& full_param = nnvm::get(attrs.parsed); + auto const& full_param = nnvm::get(attrs.parsed); auto num_inputs = full_param.default_param.no_bias ? 2 : 3; - if (full_param.mkldnn_param.quantized) { - if (full_param.mkldnn_param.channel_wise_quantize.has_value() && - full_param.mkldnn_param.channel_wise_quantize) { + if (full_param.dnnl_param.quantized) { + if (full_param.dnnl_param.channel_wise_quantize.has_value() && + full_param.dnnl_param.channel_wise_quantize) { return num_inputs + 2; // min_data, max_data } else { return num_inputs * 3; @@ -671,20 +668,19 @@ NNVM_REGISTER_OP(_sg_mkldnn_fully_connected) } }) .set_num_outputs([](const NodeAttrs& attrs) { - auto const& full_param = nnvm::get(attrs.parsed); - return (full_param.mkldnn_param.quantized && !full_param.mkldnn_param.enable_float_output) - ? 3 - : 1; + auto const& full_param = nnvm::get(attrs.parsed); + return (full_param.dnnl_param.quantized && !full_param.dnnl_param.enable_float_output) ? 3 + : 1; }) - .set_attr_parser(SgMKLDNNFCParamParser) - .set_attr("FListInputNames", SgMKLDNNFCListInputNames) - .set_attr("FListOutputNames", SgMKLDNNFCListOutputNames) - .set_attr("FInferShape", SgMKLDNNFCInferShape) - .set_attr("FInferType", SgMKLDNNFCInferType) - .set_attr("FInferStorageType", SgMKLDNNFCStorageType) - .set_attr("FCreateOpState", CreateSgMKLDNNFCState) - .set_attr("FStatefulComputeEx", SgMKLDNNFCForward) - .set_attr("TIsMKLDNN", true) + .set_attr_parser(SgDNNLFCParamParser) + .set_attr("FListInputNames", SgDNNLFCListInputNames) + .set_attr("FListOutputNames", SgDNNLFCListOutputNames) + .set_attr("FInferShape", SgDNNLFCInferShape) + .set_attr("FInferType", SgDNNLFCInferType) + .set_attr("FInferStorageType", SgDNNLFCStorageType) + .set_attr("FCreateOpState", CreateSgDNNLFCState) + .set_attr("FStatefulComputeEx", SgDNNLFCForward) + .set_attr("TIsDNNL", true) // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. 
.set_attr("FGradient", MakeZeroGradNodes) @@ -696,9 +692,9 @@ NNVM_REGISTER_OP(_sg_mkldnn_fully_connected) .set_attr("key_var_num_args", "num_args") .set_attr("FQuantizable", [](const NodeAttrs& attrs) { return QuantizeType::kMust; }) - .set_attr("FQuantizedOp", SgMKLDNNFCQuantizedOp) + .set_attr("FQuantizedOp", SgDNNLFCQuantizedOp) .set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; }) - .set_attr("FAvoidQuantizeInput", SgMKLDNNAvoidFCQuantizeInput); + .set_attr("FAvoidQuantizeInput", SgDNNLAvoidFCQuantizeInput); } // namespace op } // namespace mxnet diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc_post_quantize_property.h b/src/operator/subgraph/dnnl/dnnl_fc_post_quantize_property.h similarity index 87% rename from src/operator/subgraph/mkldnn/mkldnn_fc_post_quantize_property.h rename to src/operator/subgraph/dnnl/dnnl_fc_post_quantize_property.h index 9ae099565b0f..b1ae5373ece9 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_fc_post_quantize_property.h +++ b/src/operator/subgraph/dnnl/dnnl_fc_post_quantize_property.h @@ -18,13 +18,13 @@ */ /*! - * \file mkldnn_fc_post_quantize_property.cc - * \brief Partition gragph property for MKLDNN Quantized FullyConnected operator + * \file dnnl_fc_post_quantize_property.cc + * \brief Partition gragph property for oneDNN Quantized FullyConnected operator * \author Ciyong Chen */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_FC_POST_QUANTIZE_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_FC_POST_QUANTIZE_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_FC_POST_QUANTIZE_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_FC_POST_QUANTIZE_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -34,15 +34,14 @@ #include "../../nn/fully_connected-inl.h" #include "../../quantization/requantize-inl.h" #include "../common.h" - -#include "mkldnn_subgraph_base-inl.h" +#include "dnnl_subgraph_base-inl.h" namespace mxnet { namespace op { -#define QUANTIZED_FC_NAME "_sg_mkldnn_fully_connected" +#define QUANTIZED_FC_NAME "_sg_onednn_fully_connected" -class SgMKLDNNFCPostQuantizeSelector : public SubgraphSelectorV2 { +class SgDNNLFCPostQuantizeSelector : public SubgraphSelectorV2 { public: /*! 
\brief pattern match status */ enum SelectStatus { @@ -59,7 +58,7 @@ class SgMKLDNNFCPostQuantizeSelector : public SubgraphSelectorV2 { std::vector matched_list; public: - explicit SgMKLDNNFCPostQuantizeSelector(const bool dis_all, const bool dis_float_output) + explicit SgDNNLFCPostQuantizeSelector(const bool dis_all, const bool dis_float_output) : disable_all(dis_all), disable_float_output(dis_float_output) {} bool Select(const BiDirectedNode& n) override { @@ -146,22 +145,22 @@ class SgMKLDNNFCPostQuantizeSelector : public SubgraphSelectorV2 { void Reset() override { CHECK_GE(matched_list.size(), 1); - auto new_selector = SgMKLDNNFCPostQuantizeSelector(disable_all, disable_float_output); + auto new_selector = SgDNNLFCPostQuantizeSelector(disable_all, disable_float_output); new_selector.Select(*matched_list[0]); *this = new_selector; } }; -class SgMKLDNNFCPostQuantizeProperty : public SubgraphProperty { +class SgDNNLFCPostQuantizeProperty : public SubgraphProperty { public: - SgMKLDNNFCPostQuantizeProperty() { + SgDNNLFCPostQuantizeProperty() { disable_fuse_all = dmlc::GetEnv("MXNET_DISABLE_ONEDNN_QFC_FUSE_ALL", false); disable_float_output = dmlc::GetEnv("MXNET_DISABLE_ONEDNN_QFC_FLOAT_OUTPUT", false); } static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN FullyConected post-quantization optimization pass"; - auto property = std::make_shared(); + static const std::string& name = "oneDNN FullyConected post-quantization optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); return property; @@ -207,7 +206,7 @@ class SgMKLDNNFCPostQuantizeProperty : public SubgraphProperty { SubgraphSelectorV2Ptr CreateSubgraphSelectorV2() const override { auto selector = - std::make_shared(disable_fuse_all, disable_float_output); + std::make_shared(disable_fuse_all, disable_float_output); return selector; } @@ -228,4 +227,4 @@ class SgMKLDNNFCPostQuantizeProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_FC_POST_QUANTIZE_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_FC_POST_QUANTIZE_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc_property.h b/src/operator/subgraph/dnnl/dnnl_fc_property.h similarity index 85% rename from src/operator/subgraph/mkldnn/mkldnn_fc_property.h rename to src/operator/subgraph/dnnl/dnnl_fc_property.h index e780350570a3..9884dc7168ee 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_fc_property.h +++ b/src/operator/subgraph/dnnl/dnnl_fc_property.h @@ -18,13 +18,13 @@ */ /*! 
- * \file mkldnn_fc_property.cc + * \file dnnl_fc_property.cc * \brief Partition gragph property for FullyConnected operator * \author Ciyong Chen */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_FC_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_FC_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_FC_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_FC_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -32,14 +32,13 @@ #include "../../tensor/matrix_op-inl.h" #include "../common.h" - -#include "mkldnn_fc-inl.h" -#include "mkldnn_subgraph_base-inl.h" +#include "dnnl_fc-inl.h" +#include "dnnl_subgraph_base-inl.h" namespace mxnet { namespace op { -class SgMKLDNNFCSelector : public SubgraphSelector { +class SgDNNLFCSelector : public SubgraphSelector { public: /* pattern match status */ enum SelectStatus { @@ -55,11 +54,11 @@ class SgMKLDNNFCSelector : public SubgraphSelector { std::vector matched_list_; public: - explicit SgMKLDNNFCSelector(const bool dis_fc_eltwise, bool quantized) + explicit SgDNNLFCSelector(const bool dis_fc_eltwise, bool quantized) : disable_fc_eltwise_(dis_fc_eltwise), quantized_(quantized) {} bool Select(const nnvm::Node& n, const std::shared_ptr& node_attr) override { - if (n.op() == Op::Get("FullyConnected") && SupportMKLDNNAttr(node_attr)) { + if (n.op() == Op::Get("FullyConnected") && SupportDNNLAttr(node_attr)) { status_ = disable_fc_eltwise_ ? kSuccess : kStart; matched_list_.clear(); matched_list_.push_back(&n); @@ -94,8 +93,8 @@ class SgMKLDNNFCSelector : public SubgraphSelector { // Currently, For INT8 FC fusion, only supports relu/bounded_relu(clip)/abs. if (new_node.op() == Op::Get("Activation")) { const ActivationParam& param = nnvm::get(new_node.attrs.parsed); - if ((quantized_ && SupportQuantizedMKLDNNAct(param)) || - (!quantized_ && SupportMKLDNNAct(param))) { + if ((quantized_ && SupportQuantizedDNNLAct(param)) || + (!quantized_ && SupportDNNLAct(param))) { matched_list_.push_back(&new_node); status_ = kSuccess; return true; @@ -156,21 +155,21 @@ class SgMKLDNNFCSelector : public SubgraphSelector { void Reset() override { CHECK_GE(matched_list_.size(), 1); - auto new_selector = SgMKLDNNFCSelector(disable_fc_eltwise_, quantized_); + auto new_selector = SgDNNLFCSelector(disable_fc_eltwise_, quantized_); new_selector.Select(*matched_list_[0], nullptr); *this = new_selector; } }; -class SgMKLDNNFCProperty : public SubgraphProperty { +class SgDNNLFCProperty : public SubgraphProperty { public: - SgMKLDNNFCProperty() { + SgDNNLFCProperty() { disable_fc_eltwise_ = dmlc::GetEnv("MXNET_DISABLE_ONEDNN_FUSE_FC_ELTWISE", false); } static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN FullyConnected optimization pass"; - auto property = std::make_shared(); + static const std::string& name = "oneDNN FullyConnected optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); if (dmlc::GetEnv("MXNET_DISABLE_ONEDNN_FC_OPT", 0)) { @@ -187,21 +186,21 @@ class SgMKLDNNFCProperty : public SubgraphProperty { nnvm::Symbol new_sym; new_sym.outputs.emplace_back(last_node); std::ostringstream node_name; - node_name << "sg_mkldnn_"; + node_name << "sg_onednn_"; DFSVisit(new_sym.outputs, [&](const nnvm::ObjectPtr& node) { if (node->is_variable()) return; auto& sub_name = node->op()->name; if (sub_name == "FullyConnected") { node_name << "fully_connected_"; - } else if (SupportMKLDNNFCEltwiseFusion(sub_name)) { + } else if 
(SupportDNNLFCEltwiseFusion(sub_name)) { node_name << "eltwise_"; n->attrs.dict["with_eltwise"] = "True"; } }); node_name << std::to_string(subgraph_id); n->attrs.name = node_name.str(); - n->attrs.op = Op::Get("_sg_mkldnn_fully_connected"); + n->attrs.op = Op::Get("_sg_onednn_fully_connected"); CHECK(n->attrs.op); n->attrs.subgraphs.emplace_back(std::make_shared(new_sym)); n->op()->attr_parser(&(n->attrs)); @@ -210,7 +209,7 @@ class SgMKLDNNFCProperty : public SubgraphProperty { SubgraphSelectorPtr CreateSubgraphSelector() const override { bool quantized = HasAttr("quantize") ? GetAttr("quantize") : false; - auto selector = std::make_shared(disable_fc_eltwise_, quantized); + auto selector = std::make_shared(disable_fc_eltwise_, quantized); return selector; } @@ -231,4 +230,4 @@ class SgMKLDNNFCProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_FC_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_FC_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_post_quantize_align_scale_property.h b/src/operator/subgraph/dnnl/dnnl_post_quantize_align_scale_property.h similarity index 87% rename from src/operator/subgraph/mkldnn/mkldnn_post_quantize_align_scale_property.h rename to src/operator/subgraph/dnnl/dnnl_post_quantize_align_scale_property.h index 7d5286f949ca..a4cc724dd898 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_post_quantize_align_scale_property.h +++ b/src/operator/subgraph/dnnl/dnnl_post_quantize_align_scale_property.h @@ -17,21 +17,20 @@ * under the License. */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_POST_QUANTIZE_ALIGN_SCALE_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_POST_QUANTIZE_ALIGN_SCALE_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_POST_QUANTIZE_ALIGN_SCALE_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_POST_QUANTIZE_ALIGN_SCALE_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include #include #include "../common.h" - -#include "mkldnn_subgraph_base-inl.h" +#include "dnnl_subgraph_base-inl.h" namespace mxnet { namespace op { -class SgMKLDNNConcatPostQuantizeSelector : public SubgraphSelectorV2 { +class SgDNNLConcatPostQuantizeSelector : public SubgraphSelectorV2 { public: bool Select(const BiDirectedNode& sn) override { const auto& n = *sn.node; @@ -105,7 +104,7 @@ class SgMKLDNNConcatPostQuantizeSelector : public SubgraphSelectorV2 { } void Reset() override { - auto new_selector = SgMKLDNNConcatPostQuantizeSelector(); + auto new_selector = SgDNNLConcatPostQuantizeSelector(); new_selector.Select(head_); *this = new_selector; } @@ -117,13 +116,13 @@ class SgMKLDNNConcatPostQuantizeSelector : public SubgraphSelectorV2 { std::unordered_set visit_list_; }; -class SgMKLDNNPostQuantizeAlignScaleProperty : public SubgraphProperty { +class SgDNNLPostQuantizeAlignScaleProperty : public SubgraphProperty { public: - SgMKLDNNPostQuantizeAlignScaleProperty() : SubgraphProperty(kAdjust) {} + SgDNNLPostQuantizeAlignScaleProperty() : SubgraphProperty(kAdjust) {} static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN post-quantization scale alignment optimization pass"; - auto property = std::make_shared(); + static const std::string& name = "oneDNN post-quantization scale alignment optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); return property; @@ -169,7 +168,7 @@ class SgMKLDNNPostQuantizeAlignScaleProperty : public SubgraphProperty 
{ } SubgraphSelectorV2Ptr CreateSubgraphSelectorV2() const override { - auto selector = std::make_shared(); + auto selector = std::make_shared(); return selector; } }; @@ -178,4 +177,4 @@ class SgMKLDNNPostQuantizeAlignScaleProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_POST_QUANTIZE_ALIGN_SCALE_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_POST_QUANTIZE_ALIGN_SCALE_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_post_quantize_property.h b/src/operator/subgraph/dnnl/dnnl_post_quantize_property.h similarity index 82% rename from src/operator/subgraph/mkldnn/mkldnn_post_quantize_property.h rename to src/operator/subgraph/dnnl/dnnl_post_quantize_property.h index 5184297b7581..6da43869afd9 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_post_quantize_property.h +++ b/src/operator/subgraph/dnnl/dnnl_post_quantize_property.h @@ -16,25 +16,24 @@ * specific language governing permissions and limitations * under the License. */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_POST_QUANTIZE_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_POST_QUANTIZE_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_POST_QUANTIZE_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_POST_QUANTIZE_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include #include #include -#include "../../nn/mkldnn/mkldnn_convolution-inl.h" +#include "../../nn/dnnl/dnnl_convolution-inl.h" #include "../../quantization/requantize-inl.h" #include "../common.h" - -#include "mkldnn_conv-inl.h" -#include "mkldnn_subgraph_base-inl.h" +#include "dnnl_conv-inl.h" +#include "dnnl_subgraph_base-inl.h" namespace mxnet { namespace op { -class SgMKLDNNPostQuantizeSelector : public SubgraphSelector { +class SgDNNLPostQuantizeSelector : public SubgraphSelector { public: /*! 
\brief pattern match status */ enum SelectStatus { @@ -49,17 +48,17 @@ class SgMKLDNNPostQuantizeSelector : public SubgraphSelector { std::set support_requantize_fusion_op_name; public: - SgMKLDNNPostQuantizeSelector() { - support_requantize_fusion_op_name.insert("_sg_mkldnn_conv"); + SgDNNLPostQuantizeSelector() { + support_requantize_fusion_op_name.insert("_sg_onednn_conv"); support_requantize_fusion_op_name.insert("_contrib_quantized_elemwise_add"); support_requantize_fusion_op_name.insert("_contrib_quantized_npi_add"); } bool Select(const nnvm::Node& n) override { if (n.op() && support_requantize_fusion_op_name.count(n.op()->name)) { - if (n.op()->name == "_sg_mkldnn_conv") { - auto const& param = nnvm::get(n.attrs.parsed); - if (param.full_conv_param.mkldnn_param.quantized) { + if (n.op()->name == "_sg_onednn_conv") { + auto const& param = nnvm::get(n.attrs.parsed); + if (param.full_conv_param.dnnl_param.quantized) { status = kStart; matched_list.clear(); matched_list.push_back(&n); @@ -112,22 +111,22 @@ class SgMKLDNNPostQuantizeSelector : public SubgraphSelector { void Reset() override { CHECK_GE(matched_list.size(), 1); - auto new_selector = SgMKLDNNPostQuantizeSelector(); + auto new_selector = SgDNNLPostQuantizeSelector(); new_selector.Select(*matched_list[0]); *this = new_selector; } }; -class SgMKLDNNPostQuantizeProperty : public SubgraphProperty { +class SgDNNLPostQuantizeProperty : public SubgraphProperty { public: - SgMKLDNNPostQuantizeProperty() { - support_requantize_fusion_op_name.insert("_sg_mkldnn_conv"); + SgDNNLPostQuantizeProperty() { + support_requantize_fusion_op_name.insert("_sg_onednn_conv"); support_requantize_fusion_op_name.insert("_contrib_quantized_elemwise_add"); support_requantize_fusion_op_name.insert("_contrib_quantized_npi_add"); } static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN post-quantization optimization pass"; - auto property = std::make_shared(); + static const std::string& name = "oneDNN post-quantization optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); return property; @@ -160,7 +159,7 @@ class SgMKLDNNPostQuantizeProperty : public SubgraphProperty { } SubgraphSelectorPtr CreateSubgraphSelector() const override { - auto selector = std::make_shared(); + auto selector = std::make_shared(); return selector; } @@ -179,4 +178,4 @@ class SgMKLDNNPostQuantizeProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_POST_QUANTIZE_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_POST_QUANTIZE_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_subgraph_base-inl.h b/src/operator/subgraph/dnnl/dnnl_subgraph_base-inl.h similarity index 82% rename from src/operator/subgraph/mkldnn/mkldnn_subgraph_base-inl.h rename to src/operator/subgraph/dnnl/dnnl_subgraph_base-inl.h index 910fed68a95c..0cb8a11d643f 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_subgraph_base-inl.h +++ b/src/operator/subgraph/dnnl/dnnl_subgraph_base-inl.h @@ -16,8 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_SUBGRAPH_BASE_INL_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_SUBGRAPH_BASE_INL_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_SUBGRAPH_BASE_INL_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_SUBGRAPH_BASE_INL_H_ #if MXNET_USE_ONEDNN == 1 #include "../subgraph_property.h" @@ -25,7 +25,7 @@ namespace mxnet { namespace op { -static inline bool SupportMKLDNNAttr(const std::shared_ptr& node_attr) { +static inline bool SupportDNNLAttr(const std::shared_ptr& node_attr) { if (node_attr) { int ndim = node_attr->ishape[0].ndim(); return (node_attr->dispatch_mode == DispatchMode::kFComputeEx) && @@ -41,4 +41,4 @@ static inline bool SupportMKLDNNAttr(const std::shared_ptr& node_attr) } // namespace mxnet #endif // MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_SUBGRAPH_BASE_INL_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_SUBGRAPH_BASE_INL_H_ diff --git a/src/operator/subgraph/dnnl/dnnl_subgraph_property.cc b/src/operator/subgraph/dnnl/dnnl_subgraph_property.cc new file mode 100644 index 000000000000..de2ac27dad9e --- /dev/null +++ b/src/operator/subgraph/dnnl/dnnl_subgraph_property.cc @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#if MXNET_USE_ONEDNN == 1 + +#include "dnnl_bn_relu_property.h" +#include "dnnl_conv_property.h" +#include "dnnl_elemwisemul_post_quantize_property.h" +#include "dnnl_fc_post_quantize_property.h" +#include "dnnl_fc_property.h" +#include "dnnl_post_quantize_align_scale_property.h" +#include "dnnl_post_quantize_property.h" +#include "dnnl_transformer_post_quantize_property.h" +#include "dnnl_transformer_qk_property.h" +#include "dnnl_transformer_valatt_property.h" + +namespace mxnet { +namespace op { + +MXNET_REGISTER_SUBGRAPH_BACKEND(ONEDNN) + .set_attr("enable", DNNLEnvSet()) + .set_attr("context", Context::CPU()); + +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN, SgDNNLConvProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN, SgDNNLFCProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN, SgDNNLBNReLUProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN, SgDNNLTransformerQKProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN, SgDNNLTransformerValAttProperty); + +MXNET_REGISTER_SUBGRAPH_BACKEND(ONEDNN_QUANTIZE).set_attr("context", Context::CPU()); + +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN_QUANTIZE, SgDNNLConvProperty).set_attr("quantize", true); + +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN_QUANTIZE, SgDNNLFCProperty).set_attr("quantize", true); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN_QUANTIZE, SgDNNLTransformerQKProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN_QUANTIZE, SgDNNLTransformerValAttProperty); + +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN_QUANTIZE, SgDNNLPostQuantizeProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN_QUANTIZE, SgDNNLFCPostQuantizeProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN_QUANTIZE, ElemwiseMulPostQuantizeProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN_QUANTIZE, SgDNNLPostQuantizeAlignScaleProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(ONEDNN_QUANTIZE, SgDNNLTransformerPostQuantizeProperty) + .set_attr("quantize", true); + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_ONEDNN == 1 diff --git a/src/operator/subgraph/mkldnn/mkldnn_transformer-inl.h b/src/operator/subgraph/dnnl/dnnl_transformer-inl.h similarity index 87% rename from src/operator/subgraph/mkldnn/mkldnn_transformer-inl.h rename to src/operator/subgraph/dnnl/dnnl_transformer-inl.h index 8622bbe46906..b711e5364957 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_transformer-inl.h +++ b/src/operator/subgraph/dnnl/dnnl_transformer-inl.h @@ -17,8 +17,8 @@ * under the License. 
*/ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_INL_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_INL_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_INL_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_INL_H_ #include "../../mshadow_op.h" #include "../../mxnet_op.h" @@ -26,13 +26,13 @@ namespace mxnet { namespace op { -struct MKLDNNSelfAttParam : public dmlc::Parameter { +struct DNNLSelfAttParam : public dmlc::Parameter { int heads; bool quantized; bool enable_float_output; dmlc::optional min_calib_range; // min float value calculated from calibration dataset dmlc::optional max_calib_range; // max float value calculated from calibration dataset - DMLC_DECLARE_PARAMETER(MKLDNNSelfAttParam) { + DMLC_DECLARE_PARAMETER(DNNLSelfAttParam) { DMLC_DECLARE_FIELD(heads).describe("Set number of heads."); DMLC_DECLARE_FIELD(quantized).set_default(false).describe( "Whether it's a quantized self attention matmul operator."); @@ -56,4 +56,4 @@ struct MKLDNNSelfAttParam : public dmlc::Parameter { } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_INL_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_INL_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_transformer.cc b/src/operator/subgraph/dnnl/dnnl_transformer.cc similarity index 76% rename from src/operator/subgraph/mkldnn/mkldnn_transformer.cc rename to src/operator/subgraph/dnnl/dnnl_transformer.cc index 191006780844..965aff4df301 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_transformer.cc +++ b/src/operator/subgraph/dnnl/dnnl_transformer.cc @@ -23,12 +23,11 @@ #include #include -#include "./mkldnn_transformer-inl.h" - #include "../../contrib/transformer-inl.h" #include "../../quantization/quantization_utils.h" #include "../../tensor/elemwise_unary_op.h" #include "../common.h" +#include "./dnnl_transformer-inl.h" // 3 tensors within one (queries key values) = #define QKV_NUM 3 @@ -36,12 +35,12 @@ namespace mxnet { namespace op { -DMLC_REGISTER_PARAMETER(MKLDNNSelfAttParam); +DMLC_REGISTER_PARAMETER(DNNLSelfAttParam); -static bool SgMKLDNNSelfAttShape(const NodeAttrs& attrs, - mxnet::ShapeVector* in_shape, - mxnet::ShapeVector* out_shape) { - const auto& params = nnvm::get(attrs.parsed); +static bool SgDNNLSelfAttShape(const NodeAttrs& attrs, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + const auto& params = nnvm::get(attrs.parsed); auto qkv_shape = in_shape->at(0); CHECK_EQ(qkv_shape.ndim(), 3U) << "Input queries_keys_values should be 3D in batch-seq_length-proj_dim, " @@ -72,10 +71,10 @@ static bool SgMKLDNNSelfAttShape(const NodeAttrs& attrs, return true; } -static bool SgMKLDNNSelfAttQKInferType(const nnvm::NodeAttrs& attrs, - std::vector* in_types, - std::vector* out_types) { - const auto& params = nnvm::get(attrs.parsed); +static bool SgDNNLSelfAttQKInferType(const nnvm::NodeAttrs& attrs, + std::vector* in_types, + std::vector* out_types) { + const auto& params = nnvm::get(attrs.parsed); if (params.quantized) { CHECK_EQ(in_types->size(), 3U); @@ -109,10 +108,10 @@ static bool SgMKLDNNSelfAttQKInferType(const nnvm::NodeAttrs& attrs, return true; } -class SgMKLDNNSelfAttQKOp { +class SgDNNLSelfAttQKOp { public: - explicit SgMKLDNNSelfAttQKOp(const nnvm::NodeAttrs& attrs) - : param_(nnvm::get(attrs.parsed)) {} + explicit SgDNNLSelfAttQKOp(const nnvm::NodeAttrs& attrs) + : param_(nnvm::get(attrs.parsed)) {} void Forward(const OpContext& ctx, const std::vector& inputs, @@ -123,7 +122,7 @@ class SgMKLDNNSelfAttQKOp 
{ const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - LOG(FATAL) << "Not implemented: subgraph mkldnn self attention qk only supports " + LOG(FATAL) << "Not implemented: subgraph oneDNN self attention qk only supports " "inference computation."; } @@ -138,8 +137,8 @@ class SgMKLDNNSelfAttQKOp { private: bool initialized_{false}; - MKLDNNSelfAttParam param_; - mkldnn_args_map_t args_; + DNNLSelfAttParam param_; + dnnl_args_map_t args_; std::shared_ptr fwd_; std::shared_ptr cached_query_mem_; std::shared_ptr cached_key_mem_; @@ -151,43 +150,43 @@ class SgMKLDNNSelfAttQKOp { float data_scale_{0.0f}; }; -static OpStatePtr CreateSgMKLDNNSelfAttQKState(const nnvm::NodeAttrs& attrs, - Context ctx, - const mxnet::ShapeVector& in_shapes, - const std::vector& in_types) { - return OpStatePtr::Create(attrs); +static OpStatePtr CreateSgDNNLSelfAttQKState(const nnvm::NodeAttrs& attrs, + Context ctx, + const mxnet::ShapeVector& in_shapes, + const std::vector& in_types) { + return OpStatePtr::Create(attrs); } -static void SgMKLDNNSelfAttQKForward(const OpStatePtr& state_pointer, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - SgMKLDNNSelfAttQKOp& op = state_pointer.get_state(); +static void SgDNNLSelfAttQKForward(const OpStatePtr& state_pointer, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SgDNNLSelfAttQKOp& op = state_pointer.get_state(); if (!op.IsInitialized()) { op.Initialize(ctx, inputs, req, outputs); } op.Forward(ctx, inputs, req, outputs); } -static bool SgMKLDNNSelfAttStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); +static bool SgDNNLSelfAttStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } -void SgMKLDNNSelfAttQKOp::Initialize(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mkldnn; +void SgDNNLSelfAttQKOp::Initialize(const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace dnnl; const auto qkv_tensor = inputs[0]; const auto out_tensor = outputs[0]; - const auto qkv_dtype = get_mkldnn_type(qkv_tensor.dtype()); + const auto qkv_dtype = get_dnnl_type(qkv_tensor.dtype()); const memory::dim heads = param_.heads; const memory::dim sequences = inputs[0].shape()[0]; @@ -262,10 +261,10 @@ void SgMKLDNNSelfAttQKOp::Initialize(const OpContext& ctx, initialized_ = true; } -void SgMKLDNNSelfAttQKOp::Forward(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SgDNNLSelfAttQKOp::Forward(const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const size_t output_lin_dim = inputs[0].shape()[2]; const size_t embed_dim = output_lin_dim / QKV_NUM; @@ -280,8 +279,8 @@ void SgMKLDNNSelfAttQKOp::Forward(const OpContext& ctx, cached_out_mem_->set_data_handle(outputs[0].data().dptr()); }); - MKLDNNStream::Get()->RegisterPrimArgs(*fwd_, args_); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->RegisterPrimArgs(*fwd_, args_); + 
DNNLStream::Get()->Submit(); if (param_.quantized && !param_.enable_float_output) { float* output_min = outputs[1].data().dptr(); @@ -292,10 +291,10 @@ void SgMKLDNNSelfAttQKOp::Forward(const OpContext& ctx, } } -nnvm::ObjectPtr SgMKLDNNSelfAttQKQuantizedOp(const NodeAttrs& attrs) { +nnvm::ObjectPtr SgDNNLSelfAttQKQuantizedOp(const NodeAttrs& attrs) { nnvm::ObjectPtr node = nnvm::Node::Create(); - auto const& param = nnvm::get(attrs.parsed); - node->attrs.op = Op::Get("_sg_mkldnn_selfatt_qk"); + auto const& param = nnvm::get(attrs.parsed); + node->attrs.op = Op::Get("_sg_onednn_selfatt_qk"); node->attrs.name = "quantized_" + attrs.name; node->attrs.dict = attrs.dict; node->attrs.dict["heads"] = std::to_string(param.heads); @@ -306,10 +305,10 @@ nnvm::ObjectPtr SgMKLDNNSelfAttQKQuantizedOp(const NodeAttrs& attrs) { return node; } -NNVM_REGISTER_OP(_sg_mkldnn_selfatt_qk) - .describe(R"code(_sg_mkldnn_selfatt_qk)code" ADD_FILELINE) +NNVM_REGISTER_OP(_sg_onednn_selfatt_qk) + .describe(R"code(_sg_onednn_selfatt_qk)code" ADD_FILELINE) .set_num_inputs([](const NodeAttrs& attrs) { - auto const& param = nnvm::get(attrs.parsed); + auto const& param = nnvm::get(attrs.parsed); if (param.quantized) { return 3; } else { @@ -317,18 +316,18 @@ NNVM_REGISTER_OP(_sg_mkldnn_selfatt_qk) } }) .set_num_outputs([](const NodeAttrs& attrs) { - auto const& param = nnvm::get(attrs.parsed); + auto const& param = nnvm::get(attrs.parsed); if (param.quantized && !param.enable_float_output) { return 3; } else { return 1; } }) - .set_attr_parser(ParamParser) + .set_attr_parser(ParamParser) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { auto const& param = - nnvm::get(attrs.parsed); + nnvm::get(attrs.parsed); std::vector input_names{"queries_keys_values"}; if (param.quantized) { input_names.emplace_back("min_qkv"); @@ -339,7 +338,7 @@ NNVM_REGISTER_OP(_sg_mkldnn_selfatt_qk) .set_attr("FListOutputNames", [](const NodeAttrs& attrs) { auto const& param = - nnvm::get(attrs.parsed); + nnvm::get(attrs.parsed); std::vector output_names{"output"}; if (param.quantized && !param.enable_float_output) { output_names.emplace_back("min_output"); @@ -347,28 +346,28 @@ NNVM_REGISTER_OP(_sg_mkldnn_selfatt_qk) } return output_names; }) - .set_attr("FInferShape", SgMKLDNNSelfAttShape) - .set_attr("FInferType", SgMKLDNNSelfAttQKInferType) - .set_attr("FInferStorageType", SgMKLDNNSelfAttStorageType) - .set_attr("FCreateOpState", CreateSgMKLDNNSelfAttQKState) - .set_attr("FStatefulComputeEx", SgMKLDNNSelfAttQKForward) - .set_attr("TIsMKLDNN", true) + .set_attr("FInferShape", SgDNNLSelfAttShape) + .set_attr("FInferType", SgDNNLSelfAttQKInferType) + .set_attr("FInferStorageType", SgDNNLSelfAttStorageType) + .set_attr("FCreateOpState", CreateSgDNNLSelfAttQKState) + .set_attr("FStatefulComputeEx", SgDNNLSelfAttQKForward) + .set_attr("TIsDNNL", true) .set_attr("FGradient", MakeZeroGradNodes) .set_attr("FQuantizable", [](const NodeAttrs& attrs) { return QuantizeType::kMust; }) - .set_attr("FQuantizedOp", SgMKLDNNSelfAttQKQuantizedOp) + .set_attr("FQuantizedOp", SgDNNLSelfAttQKQuantizedOp) .set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; }) .add_argument("queries_keys_values", "NDArray-or-Symbol", "Interleaved queries, keys and values") - .add_arguments(MKLDNNSelfAttParam::__FIELDS__()); + .add_arguments(DNNLSelfAttParam::__FIELDS__()); -/**********************************_sg_mkldnn_selfatt_valatt**********************************/ 
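Before the value-times-attention half of the pair is defined below, the QK operator registered above produces, per head, the dot products between every query and every key read from the single interleaved queries_keys_values input. The sketch below spells that product out with plain loops; the (batch, seq_len, 3 * embed_dim) layout with embed_dim split evenly across heads is an assumption for illustration, and any scaling or quantization handled by the real oneDNN primitive is omitted.

#include <cstddef>
#include <vector>

// Naive reference for the per-head Q·K^T scores, under the assumed interleaved
// layout described above. Output is indexed as (batch * heads, seq_len, seq_len).
std::vector<float> NaiveSelfAttQK(const std::vector<float>& qkv,
                                  std::size_t batch, std::size_t seq_len,
                                  std::size_t embed_dim, std::size_t heads) {
  const std::size_t head_dim = embed_dim / heads;
  const std::size_t row      = 3 * embed_dim;  // stride of one token: q | k | v
  std::vector<float> scores(batch * heads * seq_len * seq_len, 0.0f);
  for (std::size_t b = 0; b < batch; ++b)
    for (std::size_t h = 0; h < heads; ++h)
      for (std::size_t i = 0; i < seq_len; ++i)
        for (std::size_t j = 0; j < seq_len; ++j) {
          float acc = 0.0f;
          for (std::size_t k = 0; k < head_dim; ++k) {
            const float q  = qkv[(b * seq_len + i) * row + h * head_dim + k];
            const float kj = qkv[(b * seq_len + j) * row + embed_dim + h * head_dim + k];
            acc += q * kj;
          }
          scores[((b * heads + h) * seq_len + i) * seq_len + j] = acc;
        }
  return scores;
}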
+/**********************************_sg_onednn_selfatt_valatt**********************************/ -static bool SgMKLDNNSelfAttValShape(const NodeAttrs& attrs, - mxnet::ShapeVector* in_shape, - mxnet::ShapeVector* out_shape) { - const auto& params = nnvm::get(attrs.parsed); +static bool SgDNNLSelfAttValShape(const NodeAttrs& attrs, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + const auto& params = nnvm::get(attrs.parsed); auto att_shape = in_shape->at(0); auto qkv_shape = in_shape->at(1); @@ -418,10 +417,10 @@ static bool SgMKLDNNSelfAttValShape(const NodeAttrs& attrs, return true; } -static bool SgMKLDNNSelfAttValInferType(const nnvm::NodeAttrs& attrs, - std::vector* in_types, - std::vector* out_types) { - const auto& params = nnvm::get(attrs.parsed); +static bool SgDNNLSelfAttValInferType(const nnvm::NodeAttrs& attrs, + std::vector* in_types, + std::vector* out_types) { + const auto& params = nnvm::get(attrs.parsed); if (params.quantized) { CHECK_EQ(in_types->size(), 6U) << "Input:[attention, queries_keys_values, min_att, max_att, " @@ -462,10 +461,10 @@ static bool SgMKLDNNSelfAttValInferType(const nnvm::NodeAttrs& attrs, return true; } -nnvm::ObjectPtr SgMKLDNNSelfAttValAttQuantizedOp(const NodeAttrs& attrs) { +nnvm::ObjectPtr SgDNNLSelfAttValAttQuantizedOp(const NodeAttrs& attrs) { nnvm::ObjectPtr node = nnvm::Node::Create(); - auto const& param = nnvm::get(attrs.parsed); - node->attrs.op = Op::Get("_sg_mkldnn_selfatt_valatt"); + auto const& param = nnvm::get(attrs.parsed); + node->attrs.op = Op::Get("_sg_onednn_selfatt_valatt"); node->attrs.name = "quantized_" + attrs.name; node->attrs.dict = attrs.dict; node->attrs.dict["heads"] = std::to_string(param.heads); @@ -476,10 +475,10 @@ nnvm::ObjectPtr SgMKLDNNSelfAttValAttQuantizedOp(const NodeAttrs& attrs) { return node; } -class MKLDNNSelfAttValAttOp { +class DNNLSelfAttValAttOp { public: - explicit MKLDNNSelfAttValAttOp(const nnvm::NodeAttrs& attrs) - : param_(nnvm::get(attrs.parsed)) {} + explicit DNNLSelfAttValAttOp(const nnvm::NodeAttrs& attrs) + : param_(nnvm::get(attrs.parsed)) {} void Forward(const OpContext& ctx, const std::vector& inputs, @@ -490,7 +489,7 @@ class MKLDNNSelfAttValAttOp { const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - LOG(FATAL) << "Not implemented: subgraph mkldnn self attention val only supports " + LOG(FATAL) << "Not implemented: subgraph oneDNN self attention val only supports " "inference computation."; } @@ -505,9 +504,9 @@ class MKLDNNSelfAttValAttOp { private: bool initialized_{false}; - MKLDNNSelfAttParam param_; - mkldnn_args_map_t args_; - mkldnn_args_map_t reorder_args; + DNNLSelfAttParam param_; + dnnl_args_map_t args_; + dnnl_args_map_t reorder_args; std::shared_ptr fwd_; std::shared_ptr reorder_; std::shared_ptr cached_att_mem_; @@ -525,37 +524,37 @@ class MKLDNNSelfAttValAttOp { float att_scale_{0.0f}; }; -static OpStatePtr CreateMKLDNNSelfAttValAttState(const nnvm::NodeAttrs& attrs, - Context ctx, - const mxnet::ShapeVector& in_shapes, - const std::vector& in_types) { - return OpStatePtr::Create(attrs); +static OpStatePtr CreateDNNLSelfAttValAttState(const nnvm::NodeAttrs& attrs, + Context ctx, + const mxnet::ShapeVector& in_shapes, + const std::vector& in_types) { + return OpStatePtr::Create(attrs); } -static void MKLDNNSelfAttValAttForward(const OpStatePtr& state_pointer, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - MKLDNNSelfAttValAttOp& op = state_pointer.get_state(); 
+static void DNNLSelfAttValAttForward(const OpStatePtr& state_pointer, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + DNNLSelfAttValAttOp& op = state_pointer.get_state(); if (!op.IsInitialized()) { op.Initialize(ctx, inputs, req, outputs); } op.Forward(ctx, inputs, req, outputs); } -void MKLDNNSelfAttValAttOp::Initialize(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mkldnn; +void DNNLSelfAttValAttOp::Initialize(const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace dnnl; const auto attn_tensor = inputs[0]; const auto qkv_tensor = inputs[1]; const auto out_tensor = outputs[0]; - const auto qkv_dtype = get_mkldnn_type(qkv_tensor.dtype()); - const auto attn_dtype = get_mkldnn_type(attn_tensor.dtype()); + const auto qkv_dtype = get_dnnl_type(qkv_tensor.dtype()); + const auto attn_dtype = get_dnnl_type(attn_tensor.dtype()); const memory::dim heads = param_.heads; const memory::dim sequences = qkv_tensor.shape()[0]; @@ -586,8 +585,8 @@ void MKLDNNSelfAttValAttOp::Initialize(const OpContext& ctx, // transpose = transposed tmp - output memory::desc result_md, tmp_md, transpose_md; - float oscale = 1.0f; - auto result_mkldnn_dtype = memory::data_type::f32; + float oscale = 1.0f; + auto result_dnnl_dtype = memory::data_type::f32; if (param_.quantized) { min_att_ = inputs[2].data().dptr()[0]; max_att_ = inputs[3].data().dptr()[0]; @@ -602,28 +601,28 @@ void MKLDNNSelfAttValAttOp::Initialize(const OpContext& ctx, max_output_ = param_.max_calib_range.value(); oscale = GetQuantizeScale(out_tensor.dtype(), min_output_, max_output_) / (att_scale_ * qkv_scale_); - result_mkldnn_dtype = memory::data_type::s8; + result_dnnl_dtype = memory::data_type::s8; } else if (param_.enable_float_output) { - oscale = 1.0f / (att_scale_ * qkv_scale_); - result_mkldnn_dtype = memory::data_type::f32; + oscale = 1.0f / (att_scale_ * qkv_scale_); + result_dnnl_dtype = memory::data_type::f32; } else { mshadow::Stream* s = ctx.get_stream(); mxnet_op::Kernel::Launch( s, 1, &min_output_, &max_output_, &min_att_, &max_att_, &min_qkv_, &max_qkv_); - result_mkldnn_dtype = memory::data_type::s32; + result_dnnl_dtype = memory::data_type::s32; } } else { - result_mkldnn_dtype = memory::data_type::f32; + result_dnnl_dtype = memory::data_type::f32; } - result_md = memory::desc(out_dims, result_mkldnn_dtype, memory::format_tag::abcd); - tmp_md = memory::desc(transpose_dims, result_mkldnn_dtype, memory::format_tag::abcde); - transpose_md = memory::desc(transpose_dims, result_mkldnn_dtype, memory::format_tag::acbde); + result_md = memory::desc(out_dims, result_dnnl_dtype, memory::format_tag::abcd); + tmp_md = memory::desc(transpose_dims, result_dnnl_dtype, memory::format_tag::abcde); + transpose_md = memory::desc(transpose_dims, result_dnnl_dtype, memory::format_tag::acbde); // multiply by 2 as we need to skip query and key const size_t value_offset = inputs[1].shape()[2] / QKV_NUM * 2; auto att_buffer = inputs[0]; - if (att_buffer.IsMKLDNNData()) + if (att_buffer.IsDNNLData()) att_buffer = att_buffer.Reorder2Default(); MSHADOW_TYPE_SWITCH(att_buffer.dtype(), DType, { @@ -661,15 +660,15 @@ void MKLDNNSelfAttValAttOp::Initialize(const OpContext& ctx, initialized_ = true; } -void MKLDNNSelfAttValAttOp::Forward(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void 
+                                  const std::vector<NDArray>& inputs,
+                                  const std::vector<OpReqType>& req,
+                                  const std::vector<NDArray>& outputs) {
   // multiply by 2 as we need to skip queries and keys
   const size_t value_offset = inputs[1].shape()[2] / QKV_NUM * 2;

   auto att_buffer = inputs[0];
-  if (att_buffer.IsMKLDNNData())
+  if (att_buffer.IsDNNLData())
     att_buffer = att_buffer.Reorder2Default();

   MSHADOW_TYPE_SWITCH(att_buffer.dtype(), DType, {
@@ -687,9 +686,9 @@ void MKLDNNSelfAttValAttOp::Forward(const OpContext& ctx,
     cached_transposed_mem_->set_data_handle(outputs[0].data().dptr());
   });

-  MKLDNNStream::Get()->RegisterPrimArgs(*fwd_, args_);
-  MKLDNNStream::Get()->RegisterPrimArgs(*reorder_, reorder_args);
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->RegisterPrimArgs(*fwd_, args_);
+  DNNLStream::Get()->RegisterPrimArgs(*reorder_, reorder_args);
+  DNNLStream::Get()->Submit();

   if (param_.quantized && !param_.enable_float_output) {
     float* output_min = outputs[1].data().dptr<float>();
@@ -700,10 +699,10 @@ void MKLDNNSelfAttValAttOp::Forward(const OpContext& ctx,
   }
 }

-NNVM_REGISTER_OP(_sg_mkldnn_selfatt_valatt)
-    .describe(R"code(_sg_mkldnn_selfatt_valatt)code" ADD_FILELINE)
+NNVM_REGISTER_OP(_sg_onednn_selfatt_valatt)
+    .describe(R"code(_sg_onednn_selfatt_valatt)code" ADD_FILELINE)
     .set_num_inputs([](const NodeAttrs& attrs) {
-      auto const& param = nnvm::get<MKLDNNSelfAttParam>(attrs.parsed);
+      auto const& param = nnvm::get<DNNLSelfAttParam>(attrs.parsed);
       if (param.quantized) {
         return 6;
       } else {
@@ -711,18 +710,18 @@ NNVM_REGISTER_OP(_sg_mkldnn_selfatt_valatt)
       }
     })
     .set_num_outputs([](const NodeAttrs& attrs) {
-      auto const& param = nnvm::get<MKLDNNSelfAttParam>(attrs.parsed);
+      auto const& param = nnvm::get<DNNLSelfAttParam>(attrs.parsed);
       if (param.quantized && !param.enable_float_output) {
         return 3;
       } else {
         return 1;
       }
     })
-    .set_attr_parser(ParamParser<MKLDNNSelfAttParam>)
+    .set_attr_parser(ParamParser<DNNLSelfAttParam>)
     .set_attr<nnvm::FListInputNames>(
         "FListInputNames",
         [](const NodeAttrs& attrs) {
-          auto const& param = nnvm::get<MKLDNNSelfAttParam>(attrs.parsed);
+          auto const& param = nnvm::get<DNNLSelfAttParam>(attrs.parsed);
           std::vector<std::string> input_names{"attention", "queries_keys_values"};
           if (param.quantized) {
             input_names.emplace_back("min_attention");
@@ -736,7 +735,7 @@ NNVM_REGISTER_OP(_sg_mkldnn_selfatt_valatt)
     .set_attr<nnvm::FListOutputNames>("FListOutputNames",
                                       [](const NodeAttrs& attrs) {
                                         auto const& param =
-                                            nnvm::get<MKLDNNSelfAttParam>(attrs.parsed);
+                                            nnvm::get<DNNLSelfAttParam>(attrs.parsed);
                                         std::vector<std::string> output_names{"output"};
                                         if (param.quantized && !param.enable_float_output) {
                                           output_names.emplace_back("min_output");
@@ -744,22 +743,22 @@
                                         }
                                         return output_names;
                                       })
-    .set_attr<mxnet::FInferShape>("FInferShape", SgMKLDNNSelfAttValShape)
-    .set_attr<nnvm::FInferType>("FInferType", SgMKLDNNSelfAttValInferType)
-    .set_attr<FInferStorageType>("FInferStorageType", SgMKLDNNSelfAttStorageType)
-    .set_attr<FCreateOpState>("FCreateOpState", CreateMKLDNNSelfAttValAttState)
-    .set_attr<FStatefulComputeEx>("FStatefulComputeEx<cpu>", MKLDNNSelfAttValAttForward)
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<mxnet::FInferShape>("FInferShape", SgDNNLSelfAttValShape)
+    .set_attr<nnvm::FInferType>("FInferType", SgDNNLSelfAttValInferType)
+    .set_attr<FInferStorageType>("FInferStorageType", SgDNNLSelfAttStorageType)
+    .set_attr<FCreateOpState>("FCreateOpState", CreateDNNLSelfAttValAttState)
+    .set_attr<FStatefulComputeEx>("FStatefulComputeEx<cpu>", DNNLSelfAttValAttForward)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
    .set_attr<FQuantizable>("FQuantizable",
                             [](const NodeAttrs& attrs) { return QuantizeType::kMust; })
-    .set_attr<FQuantizedOp>("FQuantizedOp", SgMKLDNNSelfAttValAttQuantizedOp)
+    .set_attr<FQuantizedOp>("FQuantizedOp", SgDNNLSelfAttValAttQuantizedOp)
    .set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
    .add_argument("attention", "NDArray-or-Symbol", "Attention
maps") .add_argument("queries_keys_values", "NDArray-or-Symbol", "Queries, keys and values interleaved") - .add_arguments(MKLDNNSelfAttParam::__FIELDS__()); + .add_arguments(DNNLSelfAttParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/subgraph/mkldnn/mkldnn_transformer_post_quantize_property.h b/src/operator/subgraph/dnnl/dnnl_transformer_post_quantize_property.h similarity index 79% rename from src/operator/subgraph/mkldnn/mkldnn_transformer_post_quantize_property.h rename to src/operator/subgraph/dnnl/dnnl_transformer_post_quantize_property.h index 745784492fba..7528de54083d 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_transformer_post_quantize_property.h +++ b/src/operator/subgraph/dnnl/dnnl_transformer_post_quantize_property.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_POST_QUANTIZE_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_POST_QUANTIZE_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_POST_QUANTIZE_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_POST_QUANTIZE_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -26,13 +26,12 @@ #include "../../quantization/requantize-inl.h" #include "../common.h" - -#include "mkldnn_subgraph_base-inl.h" +#include "dnnl_subgraph_base-inl.h" namespace mxnet { namespace op { -class SgMKLDNNTransformerPostQuantizeSelector : public SubgraphSelector { +class SgDNNLTransformerPostQuantizeSelector : public SubgraphSelector { public: /*! \brief pattern match status */ enum SelectStatus { @@ -49,12 +48,12 @@ class SgMKLDNNTransformerPostQuantizeSelector : public SubgraphSelector { std::vector matched_list; public: - explicit SgMKLDNNTransformerPostQuantizeSelector(const bool dis_all, const bool dis_float_output) + explicit SgDNNLTransformerPostQuantizeSelector(const bool dis_all, const bool dis_float_output) : disable_all(dis_all), disable_float_output(dis_float_output) {} bool Select(const nnvm::Node& n) override { - if ((!disable_all) && (n.op() == Op::Get("_sg_mkldnn_selfatt_qk") || - n.op() == Op::Get("_sg_mkldnn_selfatt_valatt"))) { + if ((!disable_all) && (n.op() == Op::Get("_sg_onednn_selfatt_qk") || + n.op() == Op::Get("_sg_onednn_selfatt_valatt"))) { status = disable_all ? 
kSuccess : kStart; matched_list.clear(); matched_list.push_back(&n); @@ -122,22 +121,22 @@ class SgMKLDNNTransformerPostQuantizeSelector : public SubgraphSelector { void Reset() override { CHECK_GE(matched_list.size(), 1); - auto new_selector = SgMKLDNNTransformerPostQuantizeSelector(disable_all, disable_float_output); + auto new_selector = SgDNNLTransformerPostQuantizeSelector(disable_all, disable_float_output); new_selector.Select(*matched_list[0]); *this = new_selector; } }; -class SgMKLDNNTransformerPostQuantizeProperty : public SubgraphProperty { +class SgDNNLTransformerPostQuantizeProperty : public SubgraphProperty { public: - SgMKLDNNTransformerPostQuantizeProperty() { - disable_fuse_all = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_QTRANSFORMER_FUSE_ALL", false); - disable_float_output = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_QTRANSFORMER_FLOAT_OUTPUT", false); + SgDNNLTransformerPostQuantizeProperty() { + disable_fuse_all = dmlc::GetEnv("MXNET_DISABLE_DNNL_QTRANSFORMER_FUSE_ALL", false); + disable_float_output = dmlc::GetEnv("MXNET_DISABLE_DNNL_QTRANSFORMER_FLOAT_OUTPUT", false); } static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN Transformer post-quantization optimization pass"; - auto property = std::make_shared(); + static const std::string& name = "DNNL Transformer post-quantization optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); return property; @@ -152,8 +151,8 @@ class SgMKLDNNTransformerPostQuantizeProperty : public SubgraphProperty { DFSVisit(sym.outputs, [&](const nnvm::ObjectPtr& node) { if (node->is_variable()) return; - if (node->op() == Op::Get("_sg_mkldnn_selfatt_qk") || - node->op() == Op::Get("_sg_mkldnn_selfatt_valatt")) { + if (node->op() == Op::Get("_sg_onednn_selfatt_qk") || + node->op() == Op::Get("_sg_onednn_selfatt_valatt")) { interleaved_node = node; } else if (node->op() == Op::Get("_contrib_requantize")) { requantize_node = node; @@ -184,8 +183,8 @@ class SgMKLDNNTransformerPostQuantizeProperty : public SubgraphProperty { } SubgraphSelectorPtr CreateSubgraphSelector() const override { - auto selector = std::make_shared(disable_fuse_all, - disable_float_output); + auto selector = std::make_shared(disable_fuse_all, + disable_float_output); return selector; } @@ -198,4 +197,4 @@ class SgMKLDNNTransformerPostQuantizeProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_POST_QUANTIZE_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_POST_QUANTIZE_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_transformer_qk_property.h b/src/operator/subgraph/dnnl/dnnl_transformer_qk_property.h similarity index 87% rename from src/operator/subgraph/mkldnn/mkldnn_transformer_qk_property.h rename to src/operator/subgraph/dnnl/dnnl_transformer_qk_property.h index c52f55f34162..e0844f7a7e5f 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_transformer_qk_property.h +++ b/src/operator/subgraph/dnnl/dnnl_transformer_qk_property.h @@ -17,8 +17,8 @@ * under the License. 
*/ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_QK_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_QK_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_QK_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_QK_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -29,10 +29,9 @@ #include "../../numpy/np_matrix_op-inl.h" #include "../../tensor/matrix_op-inl.h" #include "../common.h" - -#include "mkldnn_common.h" -#include "mkldnn_subgraph_base-inl.h" -#include "mkldnn_transformer-inl.h" +#include "dnnl_common.h" +#include "dnnl_subgraph_base-inl.h" +#include "dnnl_transformer-inl.h" /* custom_op @@ -51,7 +50,7 @@ namespace mxnet { namespace op { -class SgMKLDNNTransformerQKSelector : public SubgraphSelector { +class SgDNNLTransformerQKSelector : public SubgraphSelector { enum SelectStatus { kFail = 0, kStart, @@ -153,22 +152,22 @@ class SgMKLDNNTransformerQKSelector : public SubgraphSelector { void Reset() override { CHECK_GE(matched_list_.size(), 1); - auto new_selector = SgMKLDNNTransformerQKSelector(); + auto new_selector = SgDNNLTransformerQKSelector(); new_selector.Select(*matched_list_[0], nullptr); *this = new_selector; } }; -class SgMKLDNNTransformerQKProperty : public SubgraphProperty { +class SgDNNLTransformerQKProperty : public SubgraphProperty { public: - SgMKLDNNTransformerQKProperty() {} + SgDNNLTransformerQKProperty() {} static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN Transformer optimization pass"; - auto property = std::make_shared(); + static const std::string& name = "oneDNN Transformer optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); - if (dmlc::GetEnv("MXNET_DISABLE_MKLDNN_TRANSFORMER_OPT", 0)) { + if (dmlc::GetEnv("MXNET_DISABLE_ONEDNN_TRANSFORMER_OPT", 0)) { property->SetAttr("disable", true); } return property; @@ -192,17 +191,17 @@ class SgMKLDNNTransformerQKProperty : public SubgraphProperty { } }); - node_name << "_sg_mkldnn_selfatt_qk_" << subgraph_id; + node_name << "_sg_onednn_selfatt_qk_" << subgraph_id; n->attrs.name = node_name.str(); - n->attrs.op = Op::Get("_sg_mkldnn_selfatt_qk"); + n->attrs.op = Op::Get("_sg_onednn_selfatt_qk"); CHECK(n->attrs.op); n->op()->attr_parser(&(n->attrs)); return n; } SubgraphSelectorPtr CreateSubgraphSelector() const override { - auto selector = std::make_shared(); + auto selector = std::make_shared(); return selector; } @@ -229,4 +228,4 @@ class SgMKLDNNTransformerQKProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_QK_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_QK_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_transformer_valatt_property.h b/src/operator/subgraph/dnnl/dnnl_transformer_valatt_property.h similarity index 89% rename from src/operator/subgraph/mkldnn/mkldnn_transformer_valatt_property.h rename to src/operator/subgraph/dnnl/dnnl_transformer_valatt_property.h index f36110a33ee6..a62c10fdb0d1 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_transformer_valatt_property.h +++ b/src/operator/subgraph/dnnl/dnnl_transformer_valatt_property.h @@ -17,8 +17,8 @@ * under the License. 
*/ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_VALATT_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_VALATT_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_VALATT_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_VALATT_PROPERTY_H_ #if MXNET_USE_ONEDNN == 1 #include @@ -30,10 +30,9 @@ #include "../../swapaxis-inl.h" #include "../../tensor/matrix_op-inl.h" #include "../common.h" - -#include "mkldnn_common.h" -#include "mkldnn_subgraph_base-inl.h" -#include "mkldnn_transformer-inl.h" +#include "dnnl_common.h" +#include "dnnl_subgraph_base-inl.h" +#include "dnnl_transformer-inl.h" /* custom_op @@ -83,7 +82,7 @@ bool CheckSplitConditions(const BiDirectedNode& bi_node) { return true; } -class SgMKLDNNTransformerValAttSelector : public SubgraphSelectorV2 { +class SgDNNLTransformerValAttSelector : public SubgraphSelectorV2 { enum InStatus { kFail = 0, kStart, kSecondStart, kIgnoreSecond, kSwapAx, kReshape, kSuccess }; /* (custom_op) /---> kSecondStart ---\ @@ -227,22 +226,22 @@ class SgMKLDNNTransformerValAttSelector : public SubgraphSelectorV2 { void Reset() override { CHECK_GE(matched_list_.size(), 1); - auto new_selector = SgMKLDNNTransformerValAttSelector(); + auto new_selector = SgDNNLTransformerValAttSelector(); new_selector.Select(*matched_list_[0], nullptr); *this = new_selector; } }; -class SgMKLDNNTransformerValAttProperty : public SubgraphProperty { +class SgDNNLTransformerValAttProperty : public SubgraphProperty { public: - SgMKLDNNTransformerValAttProperty() {} + SgDNNLTransformerValAttProperty() {} static SubgraphPropertyPtr Create() { - static const std::string& name = "MKLDNN Transformer optimization pass"; - auto property = std::make_shared(); + static const std::string& name = "oneDNN Transformer optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); - if (dmlc::GetEnv("MXNET_DISABLE_MKLDNN_TRANSFORMER_OPT", 0)) { + if (dmlc::GetEnv("MXNET_DISABLE_ONEDNN_TRANSFORMER_OPT", 0)) { property->SetAttr("disable", true); } return property; @@ -265,16 +264,16 @@ class SgMKLDNNTransformerValAttProperty : public SubgraphProperty { n->attrs.dict["heads"] = std::to_string(reshape_param.newshape[2]); } }); - node_name << "_sg_mkldnn_selfatt_valatt_" << subgraph_id; + node_name << "_sg_onednn_selfatt_valatt_" << subgraph_id; n->attrs.name = node_name.str(); - n->attrs.op = Op::Get("_sg_mkldnn_selfatt_valatt"); + n->attrs.op = Op::Get("_sg_onednn_selfatt_valatt"); CHECK(n->attrs.op); n->op()->attr_parser(&(n->attrs)); return n; } SubgraphSelectorV2Ptr CreateSubgraphSelectorV2() const override { - auto selector = std::make_shared(); + auto selector = std::make_shared(); return selector; } }; @@ -283,4 +282,4 @@ class SgMKLDNNTransformerValAttProperty : public SubgraphProperty { } // namespace mxnet #endif // if MXNET_USE_ONEDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_VALATT_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_DNNL_DNNL_TRANSFORMER_VALATT_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc deleted file mode 100644 index b59e487df348..000000000000 --- a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#if MXNET_USE_ONEDNN == 1
-
-#include "mkldnn_bn_relu_property.h"
-#include "mkldnn_conv_property.h"
-#include "mkldnn_elemwisemul_post_quantize_property.h"
-#include "mkldnn_fc_post_quantize_property.h"
-#include "mkldnn_fc_property.h"
-#include "mkldnn_post_quantize_align_scale_property.h"
-#include "mkldnn_post_quantize_property.h"
-#include "mkldnn_transformer_post_quantize_property.h"
-#include "mkldnn_transformer_qk_property.h"
-#include "mkldnn_transformer_valatt_property.h"
-
-namespace mxnet {
-namespace op {
-
-MXNET_REGISTER_SUBGRAPH_BACKEND(MKLDNN)
-    .set_attr("enable", MKLDNNEnvSet())
-    .set_attr("context", Context::CPU());
-
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNConvProperty);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNFCProperty);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNBNReLUProperty);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNTransformerQKProperty);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNTransformerValAttProperty);
-
-MXNET_REGISTER_SUBGRAPH_BACKEND(MKLDNN_QUANTIZE).set_attr("context", Context::CPU());
-
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNConvProperty).set_attr("quantize", true);
-
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNFCProperty).set_attr("quantize", true);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNTransformerQKProperty);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNTransformerValAttProperty);
-
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeProperty);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNFCPostQuantizeProperty);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, ElemwiseMulPostQuantizeProperty);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeAlignScaleProperty);
-MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNTransformerPostQuantizeProperty)
-    .set_attr("quantize", true);
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_USE_ONEDNN == 1
diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h
index 68eb7964ccda..777eebfc52f2 100644
--- a/src/operator/subgraph/partitioner/custom_subgraph_property.h
+++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h
@@ -219,8 +219,8 @@ class CustomSubgraphProperty : public SubgraphProperty {
       const NDArray& in_arg = *(in_args_ptr[i]);

 #if MXNET_USE_ONEDNN == 1
-      // reorder data if in MKLDNN format
-      if (in_arg.IsMKLDNNData()) {
+      // reorder data if in DNNL format
+      if (in_arg.IsDNNLData()) {
         in_arg.Reorder2DefaultAsync();
         in_arg.WaitToRead();
       }
@@ -253,8 +253,8 @@ class CustomSubgraphProperty : public SubgraphProperty {
       const auto& in_aux = *(in_aux_ptr[i]);

 #if MXNET_USE_ONEDNN == 1
-      // reorder data if in MKLDNN format
-      if
(in_aux.IsMKLDNNData()) { + // reorder data if in DNNL format + if (in_aux.IsDNNLData()) { in_aux.Reorder2DefaultAsync(); in_aux.WaitToRead(); } diff --git a/src/operator/tensor/amp_cast.cc b/src/operator/tensor/amp_cast.cc index c2109de209b3..aee5f537d9bc 100644 --- a/src/operator/tensor/amp_cast.cc +++ b/src/operator/tensor/amp_cast.cc @@ -43,25 +43,25 @@ static void AMPCastExCPU(const nnvm::NodeAttrs& attrs, } auto data = inputs[0]; if (data.dtype() != mshadow::kFloat16 && outputs[0].dtype() != mshadow::kFloat16) { - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); - if (data.IsView() && data.IsMKLDNNData()) + dnnl::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + if (data.IsView() && data.IsDNNLData()) data = data.Reorder2Default(); - const auto i_mem = data.GetMKLDNNData(); + const auto i_mem = data.GetDNNLData(); const size_t i_ndim = data.shape().ndim(); - mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); + dnnl::memory::dims i_dims = dnnl::memory::dims(i_ndim); for (size_t i = 0; i < i_ndim; i++) { i_dims[i] = static_cast(data.shape()[i]); } const auto o_desc = - mkldnn::memory::desc(i_dims, - get_mkldnn_type(outputs[0].dtype()), - static_cast(GetDefaultFormat(i_ndim))); - const auto out_mem = CreateMKLDNNMem(outputs[0], o_desc, req[0]); - mkldnn_args_map_t reorder_args; - reorder_args[MKLDNN_ARG_SRC] = *i_mem; - reorder_args[MKLDNN_ARG_DST] = *out_mem.second; - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*i_mem, *out_mem.second), reorder_args); - MKLDNNStream::Get()->Submit(); + dnnl::memory::desc(i_dims, + get_dnnl_type(outputs[0].dtype()), + static_cast(GetDefaultFormat(i_ndim))); + const auto out_mem = CreateDNNLMem(outputs[0], o_desc, req[0]); + dnnl_args_map_t reorder_args; + reorder_args[DNNL_ARG_SRC] = *i_mem; + reorder_args[DNNL_ARG_DST] = *out_mem.second; + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(*i_mem, *out_mem.second), reorder_args); + DNNLStream::Get()->Submit(); return; } FallBackCompute(AMPCastCompute, attrs, ctx, inputs, req, outputs); @@ -74,7 +74,7 @@ inline static bool AMPCastStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 1); CHECK_EQ(out_attrs->size(), 1); - auto ret = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + auto ret = DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); return ret; } @@ -86,31 +86,31 @@ static void AMPMultiCastExCPU(const nnvm::NodeAttrs& attrs, const AMPMultiCastParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), param.num_outputs); CHECK_EQ(outputs.size(), param.num_outputs); - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + dnnl::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); for (int i = 0; i < param.num_outputs; ++i) { if (req[i] == kWriteInplace) { continue; } auto data = inputs[i]; - if (data.IsView() && data.IsMKLDNNData()) + if (data.IsView() && data.IsDNNLData()) data = data.Reorder2Default(); - const auto i_mem = data.GetMKLDNNData(); + const auto i_mem = data.GetDNNLData(); const size_t i_ndim = data.shape().ndim(); - mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); + dnnl::memory::dims i_dims = dnnl::memory::dims(i_ndim); for (size_t j = 0; j < i_ndim; j++) { i_dims[j] = static_cast(data.shape()[j]); } const auto o_desc = - mkldnn::memory::desc(i_dims, - get_mkldnn_type(outputs[i].dtype()), - static_cast(GetDefaultFormat(i_ndim))); - const auto out_mem = CreateMKLDNNMem(outputs[i], o_desc, req[i]); - 
mkldnn_args_map_t reorder_args; - reorder_args[MKLDNN_ARG_SRC] = *i_mem; - reorder_args[MKLDNN_ARG_DST] = *out_mem.second; - MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*i_mem, *out_mem.second), reorder_args); + dnnl::memory::desc(i_dims, + get_dnnl_type(outputs[i].dtype()), + static_cast(GetDefaultFormat(i_ndim))); + const auto out_mem = CreateDNNLMem(outputs[i], o_desc, req[i]); + dnnl_args_map_t reorder_args; + reorder_args[DNNL_ARG_SRC] = *i_mem; + reorder_args[DNNL_ARG_DST] = *out_mem.second; + DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(*i_mem, *out_mem.second), reorder_args); } - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } inline static bool AMPMultiCastStorageType(const nnvm::NodeAttrs& attrs, @@ -121,7 +121,7 @@ inline static bool AMPMultiCastStorageType(const nnvm::NodeAttrs& attrs, const AMPMultiCastParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), param.num_outputs); CHECK_EQ(out_attrs->size(), param.num_outputs); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } #endif // MXNET_USE_ONEDNN == 1 @@ -145,7 +145,7 @@ It casts only between low precision float/FP32 and does not do anything for othe }) .set_attr("FCompute", AMPCastCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FInferStorageType", AMPCastStorageType) .set_attr("FComputeEx", AMPCastExCPU) #endif @@ -164,7 +164,7 @@ NNVM_REGISTER_OP(_backward_amp_cast) return std::vector{true}; }) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FInferStorageType", AMPCastStorageType) .set_attr("FComputeEx", AMPCastExCPU) #endif @@ -217,7 +217,7 @@ It casts only between low precision float/FP32 and does not do anything for othe }) .set_attr("FCompute", AMPMultiCastCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FInferStorageType", AMPMultiCastStorageType) .set_attr("FComputeEx", AMPMultiCastExCPU) #endif @@ -264,7 +264,7 @@ NNVM_REGISTER_OP(_backward_amp_multicast) return std::vector(num_args, true); }) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FInferStorageType", AMPMultiCastStorageType) .set_attr("FComputeEx", AMPMultiCastExCPU) #endif diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index 293b255bea0f..7c6f83a2425a 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -35,7 +35,7 @@ #include "./cast_storage-inl.cuh" #endif // __CUDACC__ #if MXNET_USE_ONEDNN == 1 -#include "../nn/mkldnn/mkldnn_base-inl.h" +#include "../nn/dnnl/dnnl_base-inl.h" #endif namespace mxnet { @@ -397,16 +397,16 @@ void CastStorageComputeImpl(const OpContext& ctx, const NDArray& input, const ND #if MXNET_USE_ONEDNN == 1 } else if (src_stype == kDefaultStorage && dst_stype == kDefaultStorage) { CHECK_EQ(output.ctx().dev_type, input.ctx().dev_type); - // If one of them uses the MKLDNN layout. - if (input.IsMKLDNNData() || output.IsMKLDNNData()) { + // If one of them uses the DNNL layout. + if (input.IsDNNLData() || output.IsDNNLData()) { NDArray tmp_input = input; - // If the input data is MKLDNN and is a view, we need to reorder the input + // If the input data is DNNL and is a view, we need to reorder the input // data first. 
- if (input.IsMKLDNNData() && input.IsView()) + if (input.IsDNNLData() && input.IsView()) tmp_input = input.Reorder2Default(); - const mkldnn::memory* in_mem = tmp_input.GetMKLDNNData(); + const dnnl::memory* in_mem = tmp_input.GetDNNLData(); const_cast(output).CopyFrom(*in_mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); } else { mxnet_op::copy(ctx.get_stream(), output.data(), input.data()); } @@ -445,8 +445,8 @@ inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs, // dns -> dns DispatchMode mode = DispatchMode::kFCompute; #if MXNET_USE_ONEDNN == 1 - // If we use MKLDNN and the arrays are in CPU memory, the array may store - // MKLDNN layout, we should convert its layout explicitly. + // If we use DNNL and the arrays are in CPU memory, the array may store + // DNNL layout, we should convert its layout explicitly. if (dev_mask == kCPU) mode = DispatchMode::kFComputeEx; #endif diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc index f9203d51a453..9a19d0c6e754 100644 --- a/src/operator/tensor/dot.cc +++ b/src/operator/tensor/dot.cc @@ -24,8 +24,8 @@ #include "./dot-inl.h" #if MXNET_USE_ONEDNN == 1 -#include "./../nn/mkldnn/mkldnn_base-inl.h" -#include "./../nn/mkldnn/mkldnn_ops-inl.h" +#include "./../nn/dnnl/dnnl_base-inl.h" +#include "./../nn/dnnl/dnnl_ops-inl.h" #endif // MXNET_USE_ONEDNN namespace mxnet { @@ -121,10 +121,10 @@ static void BatchDotComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (SupportMKLDNNBatchDot(inputs, outputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNRun(MKLDNNBatchDotForward, attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(BatchDotForward_, attrs, ctx, inputs, req, outputs); + if (SupportDNNLBatchDot(inputs, outputs[0])) { + DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + DNNLRun(DNNLBatchDotForward, attrs, ctx, inputs, req, outputs); + DNNL_OPCHECK_RUN(BatchDotForward_, attrs, ctx, inputs, req, outputs); return; } FallBackCompute(BatchDotForward_, attrs, ctx, inputs, req, outputs); @@ -138,7 +138,7 @@ static bool BatchDotStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 2); CHECK_EQ(out_attrs->size(), 1); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } #endif @@ -172,7 +172,7 @@ which is computed by:: .set_attr("THasDeterministicOutput", true) .set_attr("FCompute", BatchDotForward_) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FInferStorageType", BatchDotStorageType) .set_attr("FComputeEx", BatchDotComputeExCPU) #endif diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index ab6e15c95ae7..e1b881c2b767 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -21,15 +21,15 @@ * \file elemwise_binary_op_basic.cc * \brief CPU Implementation of basic elementwise binary broadcast operators */ -#include "./elemwise_unary_op.h" +#include "../nn/dnnl/dnnl_base-inl.h" +#include "../nn/dnnl/dnnl_ops-inl.h" #include "./elemwise_binary_op-inl.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" -#include "../nn/mkldnn/mkldnn_base-inl.h" +#include "./elemwise_unary_op.h" namespace mxnet { namespace op { -bool SupportMKLDNNSum(const NDArray& input) { +bool SupportDNNLSum(const 
NDArray& input) { int ndim = input.shape().ndim(); return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && (ndim >= 1 && ndim <= 4) && input.storage_type() == kDefaultStorage; @@ -43,8 +43,8 @@ static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); #if MXNET_USE_ONEDNN == 1 - if (SupportMKLDNNSum(inputs[0]) && SupportMKLDNNSum(inputs[1])) { - MKLDNNRun(MKLDNNSumForward, attrs, ctx, inputs, req, outputs); + if (SupportDNNLSum(inputs[0]) && SupportDNNLSum(inputs[1])) { + DNNLRun(DNNLSumForward, attrs, ctx, inputs, req, outputs); return; } else if (inputs[0].storage_type() == kDefaultStorage && inputs[1].storage_type() == kDefaultStorage) { @@ -66,7 +66,7 @@ static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, bool ret = ElemwiseBinaryOp::PreferDenseStorageType( attrs, dev_mask, dispatch_mode, in_attrs, out_attrs); #if MXNET_USE_ONEDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet()) { + if (dev_mask == mshadow::cpu::kDevMask && !DNNLEnvSet()) { *dispatch_mode = DispatchMode::kFComputeFallback; } else if (dev_mask == mshadow::cpu::kDevMask && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) && @@ -81,7 +81,7 @@ MXNET_OPERATOR_REGISTER_BINARY(elemwise_add) .set_attr("FInferStorageType", ElemwiseAddStorageType) .set_attr("FCompute", ElemwiseBinaryOp::Compute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) #endif .set_attr("FComputeEx", ElemwiseAddEx) .set_attr("THasDeterministicOutput", true) @@ -123,9 +123,9 @@ static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 2U); #if MXNET_USE_ONEDNN == 1 - if (inputs[0].IsMKLDNNData()) { - MKLDNNRun(MKLDNNCopy, attrs, ctx, inputs[0], req[0], outputs[0]); - MKLDNNRun(MKLDNNCopy, attrs, ctx, inputs[0], req[1], outputs[1]); + if (inputs[0].IsDNNLData()) { + DNNLRun(DNNLCopy, attrs, ctx, inputs[0], req[0], outputs[0]); + DNNLRun(DNNLCopy, attrs, ctx, inputs[0], req[1], outputs[1]); return; } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { FallBackCompute( @@ -152,7 +152,7 @@ static inline bool ElemwiseAddBackwardStorageType(const nnvm::NodeAttrs& attrs, bool ret = ElemwiseStorageType<1, 2, true, true, true>( attrs, dev_mask, dispatch_mode, in_attrs, out_attrs); #if MXNET_USE_ONEDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet()) { + if (dev_mask == mshadow::cpu::kDevMask && !DNNLEnvSet()) { *dispatch_mode = DispatchMode::kFComputeFallback; } else if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; @@ -174,7 +174,7 @@ NNVM_REGISTER_OP(_backward_add) [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) #endif .set_attr( "FCompute", diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index b6b7035a45f8..67842cd25f4a 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -22,10 +22,11 @@ * \brief CPU implementation of elementwise sum operator */ #include "./elemwise_sum.h" -#include "../../ndarray/ndarray_function.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" -#include "../nn/mkldnn/mkldnn_base-inl.h" + #include "../../common/utils.h" +#include "../../ndarray/ndarray_function.h" +#include "../nn/dnnl/dnnl_base-inl.h" +#include "../nn/dnnl/dnnl_ops-inl.h" namespace mxnet { namespace op { @@ -91,9 +92,9 @@ 
bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, } #if MXNET_USE_ONEDNN == 1 -static inline bool IsMKLDNNData(const std::vector& arrs) { +static inline bool IsDNNLData(const std::vector& arrs) { for (auto& arr : arrs) { - if (!arr.IsMKLDNNData()) + if (!arr.IsDNNLData()) return false; } return true; @@ -111,8 +112,8 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, if (req[0] == kNullOp) return; #if MXNET_USE_ONEDNN == 1 - if (IsMKLDNNData(inputs)) { - MKLDNNRun(MKLDNNSumForward, attrs, ctx, inputs, req, outputs); + if (IsDNNLData(inputs)) { + DNNLRun(DNNLSumForward, attrs, ctx, inputs, req, outputs); } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { FallBackCompute(ElementWiseSumCompute, attrs, ctx, inputs, req, outputs); } @@ -181,7 +182,7 @@ The storage type of ``add_n`` output depends on storage types of inputs }) .set_attr("THasDeterministicOutput", true) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) #endif .set_attr("FInferShape", ElementWiseSumShape) .set_attr("FInferType", ElementWiseSumType) diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 11ce54defb9e..38949f1769ed 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -399,7 +399,7 @@ class UnaryOp : public OpBase { }); } break; case kWriteInplace: -// cannot check if ptrs are the same for MKLDNN because we may have +// cannot check if ptrs are the same for DNNL because we may have // created copies of input when reordering. WriteInPlace will still write to original array #if MXNET_USE_ONEDNN == 0 CHECK_EQ(inputs[0].dptr_, outputs[0].dptr_); diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 0e8f61a1d930..3cc930b0d8ea 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -22,9 +22,10 @@ * \brief CPU Implementation of elementwise unary function. 
*/ #include -#include "elemwise_unary_op.h" + +#include "../nn/dnnl/dnnl_ops-inl.h" #include "./elemwise_binary_op-inl.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" +#include "elemwise_unary_op.h" namespace mxnet { namespace op { @@ -311,8 +312,8 @@ static void CopyEx(const nnvm::NodeAttrs& attrs, #if MXNET_USE_ONEDNN == 1 const auto in_stype = inputs[0].storage_type(); const auto out_stype = outputs[0].storage_type(); - if (inputs[0].IsMKLDNNData()) { - MKLDNNRun(MKLDNNCopy, attrs, ctx, inputs[0], req[0], outputs[0]); + if (inputs[0].IsDNNLData()) { + DNNLRun(DNNLCopy, attrs, ctx, inputs[0], req[0], outputs[0]); return; } else if (in_stype == kDefaultStorage && out_stype == kDefaultStorage) { if (req[0] != kNullOp && req[0] != kWriteInplace) @@ -354,7 +355,7 @@ MXNET_OPERATOR_REGISTER_UNARY(_copy) [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) #endif // MXNET_USE_ONEDNN == 1 .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs) { @@ -374,7 +375,7 @@ NNVM_REGISTER_OP(_backward_copy) .set_attr("FCompute", UnaryOp::IdentityCompute) .set_attr("FComputeEx", CopyEx) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; diff --git a/src/operator/tensor/elemwise_unary_op_logexp.cc b/src/operator/tensor/elemwise_unary_op_logexp.cc index 4ce9c8ad5324..65bc76731249 100644 --- a/src/operator/tensor/elemwise_unary_op_logexp.cc +++ b/src/operator/tensor/elemwise_unary_op_logexp.cc @@ -22,10 +22,11 @@ * \brief CPU Implementation of elementwise log and exp function. */ #include -#include "elemwise_unary_op.h" -#include "./elemwise_binary_op-inl.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" + #include "../../nnvm/node_op_util.h" +#include "../nn/dnnl/dnnl_ops-inl.h" +#include "./elemwise_binary_op-inl.h" +#include "elemwise_unary_op.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_unary_op_pow.cc b/src/operator/tensor/elemwise_unary_op_pow.cc index bc935e362cb7..b4e35c4c2607 100644 --- a/src/operator/tensor/elemwise_unary_op_pow.cc +++ b/src/operator/tensor/elemwise_unary_op_pow.cc @@ -22,10 +22,11 @@ * \brief CPU Implementation of elementwise power (x^k for fixed k) function. 
*/ #include -#include "elemwise_unary_op.h" -#include "./elemwise_binary_op-inl.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" + #include "../../nnvm/node_op_util.h" +#include "../nn/dnnl/dnnl_ops-inl.h" +#include "./elemwise_binary_op-inl.h" +#include "elemwise_unary_op.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 9a6ccc6ec8a6..04e49d646e45 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -676,8 +676,8 @@ inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs, return shape_is_known(in_attrs->at(0)) && shape_is_known(out_attrs->at(0)); } -// Currently MKLDNN only supports step = 1 or step has no value -inline bool SupportMKLDNNSlice(const SliceParam& param) { +// Currently DNNL only supports step = 1 or step has no value +inline bool SupportDNNLSlice(const SliceParam& param) { if (param.step.ndim() == 0U) return true; for (int i = 0; i < param.step.ndim(); ++i) { @@ -710,7 +710,7 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs, if (in_stype == kDefaultStorage) { #if MXNET_USE_ONEDNN == 1 - if (dev_mask == Context::kCPU && MKLDNNEnvSet() && SupportMKLDNNSlice(param)) { + if (dev_mask == Context::kCPU && DNNLEnvSet() && SupportDNNLSlice(param)) { dispatched = storage_type_assign(&out_stype, kDefaultStorage, dispatch_mode, dispatch_ex); } #endif diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index fe9b62767dcf..91d5cdcc4a0d 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -25,10 +25,10 @@ #include "./matrix_op-inl.h" #include "./elemwise_unary_op.h" #if MXNET_USE_ONEDNN == 1 -#include "../nn/mkldnn/mkldnn_base-inl.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" -#include "../nn/mkldnn/mkldnn_reshape-inl.h" -#include "../nn/mkldnn/mkldnn_slice-inl.h" +#include "../nn/dnnl/dnnl_base-inl.h" +#include "../nn/dnnl/dnnl_ops-inl.h" +#include "../nn/dnnl/dnnl_reshape-inl.h" +#include "../nn/dnnl/dnnl_slice-inl.h" #endif namespace mxnet { @@ -121,12 +121,12 @@ void ReshapeComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); - // If inputs are supposed to be in MKLDNN format and - // MKLDNN support the data type or the shape. Then convert + // If inputs are supposed to be in DNNL format and + // DNNL support the data type or the shape. 
Then convert // it to the output format and shape - if (SupportMKLDNNReshape(inputs[0], outputs[0])) { - MKLDNNRun(MKLDNNReshapeForward, attrs, ctx, inputs[0], req[0], outputs[0]); + if (SupportDNNLReshape(inputs[0], outputs[0])) { + DNNLRun(DNNLReshapeForward, attrs, ctx, inputs[0], req[0], outputs[0]); } else { FallBackCompute(UnaryOp::IdentityCompute, attrs, ctx, inputs, req, outputs); } @@ -139,7 +139,7 @@ bool ReshapeStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } #endif @@ -202,7 +202,7 @@ If the argument `reverse` is set to 1, then the special values are inferred from .set_attr("FGradient", ElemwiseGradUseNone{"_backward_reshape"}) .set_attr("FCompute", UnaryOp::IdentityCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", ReshapeComputeExCPU) .set_attr("FInferStorageType", ReshapeStorageType) .set_attr("FResourceRequest", @@ -230,11 +230,11 @@ static void FlattenEx(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); - // If inputs are supposed to be in MKLDNN format and - // MKLDNN support the data type or the shape. Then convert + // If inputs are supposed to be in DNNL format and + // DNNL support the data type or the shape. Then convert // it to the output format and shape - if (SupportMKLDNNReshape(inputs[0], outputs[0])) { - MKLDNNRun(MKLDNNReshapeForward, attrs, ctx, inputs[0], req[0], outputs[0]); + if (SupportDNNLReshape(inputs[0], outputs[0])) { + DNNLRun(DNNLReshapeForward, attrs, ctx, inputs[0], req[0], outputs[0]); } else { FallBackCompute(UnaryOp::IdentityCompute, attrs, ctx, inputs, req, outputs); } @@ -247,7 +247,7 @@ static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 1); CHECK_EQ(out_attrs->size(), 1); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } #endif @@ -281,7 +281,7 @@ Example:: .set_attr("FGradient", ElemwiseGradUseNone{"_backward_copy"}) .set_attr("FCompute", UnaryOp::IdentityCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", FlattenEx) .set_attr("FInferStorageType", FlattenStorageType) .set_attr("FResourceRequest", @@ -315,8 +315,8 @@ static void TransposeComputeExCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); - if (SupportMKLDNNTranspose(param, inputs[0]) && req[0] == kWriteTo) { - MKLDNNRun(MKLDNNTransposeForward, attrs, ctx, inputs[0], req[0], outputs[0]); + if (SupportDNNLTranspose(param, inputs[0]) && req[0] == kWriteTo) { + DNNLRun(DNNLTransposeForward, attrs, ctx, inputs[0], req[0], outputs[0]); return; } FallBackCompute(Transpose, attrs, ctx, inputs, req, outputs); @@ -329,7 +329,7 @@ inline static bool TransposeStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } #endif @@ -382,7 +382,7 @@ Examples:: return 
std::vector{ResourceRequest::kTempSpace}; }) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", TransposeComputeExCPU) .set_attr("FInferStorageType", TransposeStorageType) #endif @@ -400,11 +400,11 @@ static void ExpandDimEx(const nnvm::NodeAttrs& attrs, // skip zero-size tensor if (inputs[0].shape().Size() == 0U) return; - // If inputs are supposed to be in MKLDNN format and - // MKLDNN support the data type or the shape. Then convert + // If inputs are supposed to be in DNNL format and + // DNNL support the data type or the shape. Then convert // it to the output format and shape - if (SupportMKLDNNReshape(inputs[0], outputs[0])) { - MKLDNNRun(MKLDNNReshapeForward, attrs, ctx, inputs[0], req[0], outputs[0]); + if (SupportDNNLReshape(inputs[0], outputs[0])) { + DNNLRun(DNNLReshapeForward, attrs, ctx, inputs[0], req[0], outputs[0]); } else { FallBackCompute(UnaryOp::IdentityCompute, attrs, ctx, inputs, req, outputs); } @@ -417,7 +417,7 @@ inline static bool ExpandDimStorageType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); + return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); } #endif @@ -443,7 +443,7 @@ will return a new array with shape ``(2,1,3,4)``. .set_attr("FGradient", ElemwiseGradUseNone{"_backward_reshape"}) .set_attr("FCompute", UnaryOp::IdentityCompute) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) .set_attr("FComputeEx", ExpandDimEx) .set_attr("FInferStorageType", ExpandDimStorageType) .set_attr("FResourceRequest", @@ -467,8 +467,8 @@ void SliceExCPU(const nnvm::NodeAttrs& attrs, SliceCsrImpl(param, ctx, inputs[0], req[0], outputs[0]); #if MXNET_USE_ONEDNN == 1 } else if (in_stype == kDefaultStorage) { - if (SupportMKLDNN(inputs[0])) { - MKLDNNRun(MKLDNNSlice, attrs, ctx, inputs[0], req[0], outputs[0]); + if (SupportDNNL(inputs[0])) { + DNNLRun(DNNLSlice, attrs, ctx, inputs[0], req[0], outputs[0]); } else { FallBackCompute(SliceOpForward, attrs, ctx, inputs, req, outputs); } @@ -537,7 +537,7 @@ Example:: .set_attr("FCompute", SliceOpForward) .set_attr("FComputeEx", SliceExCPU) #if MXNET_USE_ONEDNN == 1 - .set_attr("TIsMKLDNN", true) + .set_attr("TIsDNNL", true) #endif .add_argument("data", "NDArray-or-Symbol", "Source input") .add_arguments(SliceParam::__FIELDS__()); diff --git a/src/serialization/cnpy.cc b/src/serialization/cnpy.cc index 0c1917b9a8d5..0534b3ae7459 100644 --- a/src/serialization/cnpy.cc +++ b/src/serialization/cnpy.cc @@ -270,7 +270,7 @@ void save_array(const std::string& fname, const NDArray& array_) { array = array_; array.WaitToRead(); #if MXNET_USE_ONEDNN == 1 - if (array.IsMKLDNNData()) { + if (array.IsDNNLData()) { array = array.Reorder2Default(); } #endif @@ -476,7 +476,7 @@ void save_array(mz_zip_archive* archive, const std::string& array_name, const ND array = array_; array.WaitToRead(); #if MXNET_USE_ONEDNN == 1 - if (array.IsMKLDNNData()) { + if (array.IsDNNLData()) { array = array.Reorder2Default(); } #endif diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h index 60839dd5c0f2..0431f95ae4bc 100644 --- a/src/storage/cpu_device_storage.h +++ b/src/storage/cpu_device_storage.h @@ -50,9 +50,9 @@ class CPUDeviceStorage { * \brief Alignment of allocation. 
*/ #if MXNET_USE_ONEDNN == 1 || MXNET_USE_INTGEMM == 1 - // MKLDNN requires special alignment. 64 is used by the MKLDNN library in + // DNNL requires special alignment. 64 is used by the DNNL library in // memory allocation. - static constexpr size_t alignment_ = kMKLDNNAlign; + static constexpr size_t alignment_ = kDNNLAlign; #else static constexpr size_t alignment_ = 16; #endif diff --git a/src/storage/storage_manager_helpers.h b/src/storage/storage_manager_helpers.h index 7c2f60dab63d..835af3b6c11e 100644 --- a/src/storage/storage_manager_helpers.h +++ b/src/storage/storage_manager_helpers.h @@ -131,9 +131,9 @@ class ContextHelperCPU : public ContextHelper { private: #if MXNET_USE_ONEDNN == 1 || MXNET_USE_INTGEMM == 1 - // MKLDNN requires special alignment. 64 is used by the MKLDNN library in + // DNNL requires special alignment. 64 is used by the DNNL library in // memory allocation. - static constexpr size_t alignment_ = kMKLDNNAlign; + static constexpr size_t alignment_ = kDNNLAlign; #else static constexpr size_t alignment_ = 16; #endif diff --git a/tests/cpp/include/test_mkldnn.h b/tests/cpp/include/test_dnnl.h similarity index 65% rename from tests/cpp/include/test_mkldnn.h rename to tests/cpp/include/test_dnnl.h index 89a1c1bbfbb9..359a0f26d82d 100644 --- a/tests/cpp/include/test_mkldnn.h +++ b/tests/cpp/include/test_dnnl.h @@ -18,13 +18,13 @@ */ /*! - * \file test_mkldnn.h - * \brief helper functions to test mkldnn. + * \file test_dnnl.h + * \brief helper functions to test dnnl. * \author Alex Zai */ -#ifndef TEST_MKLDNN_H_ -#define TEST_MKLDNN_H_ +#ifndef TEST_DNNL_H_ +#define TEST_DNNL_H_ #if MXNET_USE_ONEDNN == 1 @@ -33,24 +33,24 @@ #include #include "../../../3rdparty/googletest/googletest/include/gtest/gtest.h" -#include "../../../3rdparty/onednn/include/mkldnn_types.h" -#include "../../../src/operator/nn/mkldnn/mkldnn_base-inl.h" +#include "../../../3rdparty/onednn/include/dnnl_types.h" +#include "../../../src/operator/nn/dnnl/dnnl_base-inl.h" using namespace mxnet; -inline static mkldnn::memory::desc GetMemDesc(const mxnet::TShape s, - const int dtype, - const mkldnn::memory::format_tag format_tag) { - mkldnn::memory::dims dims(s.ndim()); +inline static dnnl::memory::desc GetMemDesc(const mxnet::TShape s, + const int dtype, + const dnnl::memory::format_tag format_tag) { + dnnl::memory::dims dims(s.ndim()); for (size_t i = 0; i < dims.size(); i++) dims[i] = s[i]; - mkldnn::memory::desc desc{dims, get_mkldnn_type(dtype), format_tag}; + dnnl::memory::desc desc{dims, get_dnnl_type(dtype), format_tag}; return desc; } -inline static mkldnn::memory::desc GetExpandedMemDesc(mkldnn::memory::desc md, - const float scale, - const int dim = 0) { +inline static dnnl::memory::desc GetExpandedMemDesc(dnnl::memory::desc md, + const float scale, + const int dim = 0) { CHECK(dim < md.data.ndims) << "dimension cannot be larger than total dimensions of input"; mxnet::TShape s(md.data.ndims, -1); for (size_t i = 0; i < md.data.ndims; i++) @@ -58,12 +58,12 @@ inline static mkldnn::memory::desc GetExpandedMemDesc(mkldnn::memory::desc md, s[dim] = static_cast(s[dim] * scale); return GetMemDesc(s, mshadow::DataType::kFlag, - static_cast(GetDefaultFormat(md))); + static_cast(GetDefaultFormat(md))); } struct TestArrayShapes { std::vector shapes; - std::vector mds; + std::vector mds; }; // Init arrays with the default layout. @@ -81,16 +81,16 @@ inline static void InitDefaultArray(NDArray* arr, bool is_rand = false, int max } // Init arrays with the specified layout. 
-inline static void InitMKLDNNArray(NDArray* arr, - const mkldnn::memory::desc& desc, - bool is_rand = false, - int max = 50) { +inline static void InitDNNLArray(NDArray* arr, + const dnnl::memory::desc& desc, + bool is_rand = false, + int max = 50) { InitDefaultArray(arr, is_rand, max); - arr->MKLDNNDataReorderAsync(desc); + arr->DNNLDataReorderAsync(desc); arr->WaitToRead(); } -inline static bool IsSameShape(const mkldnn::memory::desc& desc, const mxnet::TShape& shape) { +inline static bool IsSameShape(const dnnl::memory::desc& desc, const mxnet::TShape& shape) { if (desc.data.ndims != shape.ndim()) return false; for (size_t i = 0; i < shape.ndim(); i++) @@ -99,94 +99,88 @@ inline static bool IsSameShape(const mkldnn::memory::desc& desc, const mxnet::TS return true; } -// This function gets special MKLDNN formats without knowing the specific +// This function gets special DNNL formats without knowing the specific // hardware configuration. Certainly, it potentially misses some format if // it's specific for certain array shapes. It covers at least one special format // for each of the formats: nchw, oihw, goihw. // To test the logic of the code in NDArray, these formats should be enough. -inline static std::vector GetMKLDNNFormat(size_t num_dims, int dtype) { +inline static std::vector GetDNNLFormat(size_t num_dims, int dtype) { if (num_dims == 4) { - mkldnn::memory::dims data_dims{1, 3, 224, 224}; - mkldnn::memory::desc data_md{ - data_dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any}; - mkldnn::memory::dims weight_dims{96, 3, 11, 11}; - mkldnn::memory::desc weight_md{ - weight_dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any}; - mkldnn::memory::dims output_dims{1, 96, 54, 54}; - mkldnn::memory::desc out_md{ - output_dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any}; - mkldnn::memory::dims strides{4, 4}; - mkldnn::memory::dims padding{0, 0}; - - mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - out_md, - strides, - padding, - padding); - mkldnn::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine()); + dnnl::memory::dims data_dims{1, 3, 224, 224}; + dnnl::memory::desc data_md{data_dims, get_dnnl_type(dtype), dnnl::memory::format_tag::any}; + dnnl::memory::dims weight_dims{96, 3, 11, 11}; + dnnl::memory::desc weight_md{weight_dims, get_dnnl_type(dtype), dnnl::memory::format_tag::any}; + dnnl::memory::dims output_dims{1, 96, 54, 54}; + dnnl::memory::desc out_md{output_dims, get_dnnl_type(dtype), dnnl::memory::format_tag::any}; + dnnl::memory::dims strides{4, 4}; + dnnl::memory::dims padding{0, 0}; + + dnnl::convolution_forward::desc desc(dnnl::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, + data_md, + weight_md, + out_md, + strides, + padding, + padding); + dnnl::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine()); while (pd.dst_desc().get_size() != GetMemDescSize(out_md) || pd.src_desc().get_size() != GetMemDescSize(data_md) || pd.weights_desc().get_size() != GetMemDescSize(weight_md)) { CHECK(pd.next_impl()) << "No implementation"; } - std::vector ret(1); - ret[0] = static_cast(GetDefaultFormat(pd.dst_desc())); + std::vector ret(1); + ret[0] = static_cast(GetDefaultFormat(pd.dst_desc())); printf("format: %d \n", static_cast(ret[0])); return ret; } else if (num_dims == 5) { - mkldnn::memory::dims data_dims{1, 32, 112, 112}; - mkldnn::memory::desc data_md{ - data_dims, 
get_mkldnn_type(dtype), mkldnn::memory::format_tag::any}; - mkldnn::memory::dims weight_dims{32, 1, 1, 3, 3}; - mkldnn::memory::desc weight_md{ - weight_dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any}; - mkldnn::memory::dims output_dims{1, 32, 112, 112}; - mkldnn::memory::desc out_md{ - output_dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any}; - mkldnn::memory::dims strides{1, 1}; - mkldnn::memory::dims padding{1, 1}; - - mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::convolution_direct, - data_md, - weight_md, - out_md, - strides, - padding, - padding); - mkldnn::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine()); + dnnl::memory::dims data_dims{1, 32, 112, 112}; + dnnl::memory::desc data_md{data_dims, get_dnnl_type(dtype), dnnl::memory::format_tag::any}; + dnnl::memory::dims weight_dims{32, 1, 1, 3, 3}; + dnnl::memory::desc weight_md{weight_dims, get_dnnl_type(dtype), dnnl::memory::format_tag::any}; + dnnl::memory::dims output_dims{1, 32, 112, 112}; + dnnl::memory::desc out_md{output_dims, get_dnnl_type(dtype), dnnl::memory::format_tag::any}; + dnnl::memory::dims strides{1, 1}; + dnnl::memory::dims padding{1, 1}; + + dnnl::convolution_forward::desc desc(dnnl::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, + data_md, + weight_md, + out_md, + strides, + padding, + padding); + dnnl::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine()); while (pd.dst_desc().get_size() != GetMemDescSize(out_md) || pd.src_desc().get_size() != GetMemDescSize(data_md) || pd.weights_desc().get_size() != GetMemDescSize(weight_md)) { CHECK(pd.next_impl()) << "No implementation"; } - std::vector ret(1); - ret[0] = static_cast(GetDefaultFormat(pd.weights_desc())); + std::vector ret(1); + ret[0] = static_cast(GetDefaultFormat(pd.weights_desc())); printf("format: %d\n", static_cast(ret[0])); return ret; } else { - return std::vector(); + return std::vector(); } } inline static TestArrayShapes GetTestArrayShapes(bool spatial_data_format = false) { int dtype = mshadow::DataType::kFlag; mxnet::ShapeVector shapes; - std::vector mds; + std::vector mds; { // 1D mxnet::TShape s(1, -1); s[0] = 279936; shapes.push_back(s); - mds.push_back(GetMemDesc(s, dtype, mkldnn::memory::format_tag::x)); + mds.push_back(GetMemDesc(s, dtype, dnnl::memory::format_tag::x)); s[0] = 34848; shapes.push_back(s); - mds.push_back(GetMemDesc(s, dtype, mkldnn::memory::format_tag::x)); + mds.push_back(GetMemDesc(s, dtype, dnnl::memory::format_tag::x)); } { // 2D @@ -194,11 +188,11 @@ inline static TestArrayShapes GetTestArrayShapes(bool spatial_data_format = fals s[0] = 96; s[1] = 2916; shapes.push_back(s); - mds.push_back(GetMemDesc(s, dtype, mkldnn::memory::format_tag::nc)); + mds.push_back(GetMemDesc(s, dtype, dnnl::memory::format_tag::nc)); s[0] = 96; s[1] = 363; shapes.push_back(s); - mds.push_back(GetMemDesc(s, dtype, mkldnn::memory::format_tag::nc)); + mds.push_back(GetMemDesc(s, dtype, dnnl::memory::format_tag::nc)); } { // 4D @@ -208,7 +202,7 @@ inline static TestArrayShapes GetTestArrayShapes(bool spatial_data_format = fals s1[2] = 54; s1[3] = 54; shapes.push_back(s1); - mds.push_back(GetMemDesc(s1, dtype, mkldnn::memory::format_tag::nchw)); + mds.push_back(GetMemDesc(s1, dtype, dnnl::memory::format_tag::nchw)); mxnet::TShape s2(4, -1); s2[0] = 96; @@ -216,9 +210,9 @@ inline static TestArrayShapes GetTestArrayShapes(bool spatial_data_format = fals s2[2] = 11; s2[3] = 11; shapes.push_back(s2); - 
mds.push_back(GetMemDesc(s2, dtype, mkldnn::memory::format_tag::oihw)); + mds.push_back(GetMemDesc(s2, dtype, dnnl::memory::format_tag::oihw)); - std::vector formats = GetMKLDNNFormat(4, dtype); + std::vector formats = GetDNNLFormat(4, dtype); if (!spatial_data_format) { mds.push_back(GetMemDesc(s1, dtype, formats[0])); } @@ -232,9 +226,9 @@ inline static TestArrayShapes GetTestArrayShapes(bool spatial_data_format = fals s[3] = 11; s[4] = 11; shapes.push_back(s); - mds.push_back(GetMemDesc(s, dtype, mkldnn::memory::format_tag::goihw)); + mds.push_back(GetMemDesc(s, dtype, dnnl::memory::format_tag::goihw)); - std::vector formats = GetMKLDNNFormat(5, dtype); + std::vector formats = GetDNNLFormat(5, dtype); if (!spatial_data_format) { mds.push_back(GetMemDesc(s, dtype, formats[0])); } @@ -264,27 +258,27 @@ struct OpAttrs { }; enum ArrayTypes { - Normal = 1, - MKLDNN = 2, - MKLDNNDiffShape = 4, - MKLDNNDiffDim = 8, - NormalReshaped = 16, - MKLDNNReshaped = 32, - MKLDNNReshapedDiffShape = 64, - MKLDNNReshapedDiffDim = 128, - NormalReused = 256, - MKLDNNReused = 512, - MKLDNNReusedDiffDim = 1024, - NormalReshapedReused = 2048, - NormalReusedDiffDtype = 4096, - All = 8191, + Normal = 1, + DNNL = 2, + DNNLDiffShape = 4, + DNNLDiffDim = 8, + NormalReshaped = 16, + DNNLReshaped = 32, + DNNLReshapedDiffShape = 64, + DNNLReshapedDiffDim = 128, + NormalReused = 256, + DNNLReused = 512, + DNNLReusedDiffDim = 1024, + NormalReshapedReused = 2048, + NormalReusedDiffDtype = 4096, + All = 8191, }; inline NDArray CreateKernelNDArray(mxnet::TShape kernel, int num_filters, mxnet::TShape input, bool is_deconv = false) { - CHECK_EQ(kernel.ndim(), 2) << "mkldnn only supports 2d filters on 4d inputs"; + CHECK_EQ(kernel.ndim(), 2) << "dnnl only supports 2d filters on 4d inputs"; mxnet::TShape target_shape(4, -1); target_shape[0] = is_deconv ? input[1] : num_filters; target_shape[1] = is_deconv ? num_filters : input[1]; @@ -292,16 +286,16 @@ inline NDArray CreateKernelNDArray(mxnet::TShape kernel, target_shape[3] = kernel[1]; int dtype = mshadow::DataType::kFlag; NDArray arr(target_shape, Context()); - auto pd = GetMemDesc(target_shape, dtype, mkldnn::memory::format_tag::nchw); - InitMKLDNNArray(&arr, pd); + auto pd = GetMemDesc(target_shape, dtype, dnnl::memory::format_tag::nchw); + InitDNNLArray(&arr, pd); return arr; } inline NDArray CreateBiasNDArray(mxnet::TShape target_shape) { int dtype = mshadow::DataType::kFlag; NDArray arr(target_shape, Context()); - auto pd = GetMemDesc(target_shape, dtype, mkldnn::memory::format_tag::x); - InitMKLDNNArray(&arr, pd); + auto pd = GetMemDesc(target_shape, dtype, dnnl::memory::format_tag::x); + InitDNNLArray(&arr, pd); return arr; } @@ -336,21 +330,21 @@ inline void PrintVerifyMsg(const NDArrayAttrs& arr1, const NDArrayAttrs& arr2) { /* * We want to get a few types of NDArrays for testing: * 1. Normal NDArray - * 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator) - * 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different - * dimensions from the NDArray (result of MKLDNNDataReorderAsync). However, this + * 2. Normal NDArray with DNNL layout (output from an DNNL operator) + * 3. Normal NDArray with DNNL layout whose DNNL memory may have different + * dimensions from the NDArray (result of DNNLDataReorderAsync). However, this * type of NDArrays only exists for weight arrays. I don't think we should * pass them to all operators. 
- * In the inference mode, the MKLDNN memory in the weight array will be + * In the inference mode, the DNNL memory in the weight array will be * reordered to 5 dimensions. * 4. Reshaped/sliced NDArray - * 5. Reshaped/sliced NDArray with MKLDNN layout (reshape/slice from Normal NDArray - * with MKLDNN layout) - * 6. Reshaped/sliced NDArray with MKLDNN layout whose MKLDNN memory may have - * different dimensions from the NDArray (result of MKLDNNDataReorderAsync). + * 5. Reshaped/sliced NDArray with DNNL layout (reshape/slice from Normal NDArray + * with DNNL layout) + * 6. Reshaped/sliced NDArray with DNNL layout whose DNNL memory may have + * different dimensions from the NDArray (result of DNNLDataReorderAsync). * However, this type of NDArrays only exists for weight arrays. I don't think * we should pass them to all operators. - * In the inference mode, the MKLDNN memory in the weight array will be + * In the inference mode, the DNNL memory in the weight array will be * reordered to 5 dimensions. * * num_inputs / dim arguments used to scale shape (used for concat backwards to enlarge input @@ -361,9 +355,9 @@ inline std::vector GetTestInputArrays(int types = A std::vector scale = {1}, bool spatial_data_format = false, int max = 50) { - TestArrayShapes tas = GetTestArrayShapes(spatial_data_format); - std::vector shapes = tas.shapes; - std::vector mds = tas.mds; + TestArrayShapes tas = GetTestArrayShapes(spatial_data_format); + std::vector shapes = tas.shapes; + std::vector mds = tas.mds; std::vector in_arrs; std::string desc_str; @@ -405,40 +399,40 @@ inline std::vector GetTestInputArrays(int types = A // Type 2, 3. arr = NDArray(shape, Context()); - if (shape.ndim() == md.data.ndims && IsSameShape(md, shape) && types & ArrayTypes::MKLDNN) { - desc_str = "MKLDNN NDArray"; - InitMKLDNNArray(&arr, md, rand, max); + if (shape.ndim() == md.data.ndims && IsSameShape(md, shape) && types & ArrayTypes::DNNL) { + desc_str = "DNNL NDArray"; + InitDNNLArray(&arr, md, rand, max); in_arrs.emplace_back(arr, desc_str); } else if (shape.ndim() == md.data.ndims && !IsSameShape(md, shape) && - types & ArrayTypes::MKLDNNDiffShape) { - desc_str = "MKLDNN NDArray with different shape"; - InitMKLDNNArray(&arr, md, rand, max); + types & ArrayTypes::DNNLDiffShape) { + desc_str = "DNNL NDArray with different shape"; + InitDNNLArray(&arr, md, rand, max); in_arrs.emplace_back(arr, desc_str); - } else if (shape.ndim() != md.data.ndims && types & ArrayTypes::MKLDNNDiffDim) { + } else if (shape.ndim() != md.data.ndims && types & ArrayTypes::DNNLDiffDim) { std::stringstream ss; - ss << "MKLDNN NDArray with different dim " << shape.ndim() << "/" << md.data.ndims; + ss << "DNNL NDArray with different dim " << shape.ndim() << "/" << md.data.ndims; desc_str = ss.str(); - InitMKLDNNArray(&arr, md, rand, max); + InitDNNLArray(&arr, md, rand, max); in_arrs.emplace_back(arr, desc_str); } // Type 5, 6. 
arr = NDArray(shape, Context()); if (shape.ndim() == md.data.ndims && IsSameShape(md, shape) && - types & ArrayTypes::MKLDNNReshaped) { - desc_str = "Reshaped MKLDNN NDArray"; - InitMKLDNNArray(&arr, md, rand, max); + types & ArrayTypes::DNNLReshaped) { + desc_str = "Reshaped DNNL NDArray"; + InitDNNLArray(&arr, md, rand, max); in_arrs.emplace_back(arr.Slice(slice_amount, arr.shape()[0] - slice_amount), desc_str); } else if (shape.ndim() == md.data.ndims && !IsSameShape(md, shape) && - types & ArrayTypes::MKLDNNReshapedDiffShape) { - desc_str = "Reshaped MKLDNN NDArray with different shape"; - InitMKLDNNArray(&arr, md, rand, max); + types & ArrayTypes::DNNLReshapedDiffShape) { + desc_str = "Reshaped DNNL NDArray with different shape"; + InitDNNLArray(&arr, md, rand, max); in_arrs.emplace_back(arr.Slice(slice_amount, arr.shape()[0] - slice_amount), desc_str); - } else if (shape.ndim() != md.data.ndims && types & ArrayTypes::MKLDNNReshapedDiffDim) { + } else if (shape.ndim() != md.data.ndims && types & ArrayTypes::DNNLReshapedDiffDim) { std::stringstream ss; - ss << "MKLDNN NDArray with different dim " << shape.ndim() << "/" << md.data.ndims; + ss << "DNNL NDArray with different dim " << shape.ndim() << "/" << md.data.ndims; desc_str = ss.str(); - InitMKLDNNArray(&arr, md, rand, max); + InitDNNLArray(&arr, md, rand, max); in_arrs.emplace_back(arr.Slice(slice_amount, arr.shape()[0] - slice_amount), desc_str); } } @@ -449,25 +443,25 @@ inline std::vector GetTestInputArrays(int types = A /* * We want to get a few types of NDArrays for testing: * 1. Normal NDArray - * 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator) - * 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different - * dimensions from the NDArray (result of MKLDNNDataReorderAsync). However, this + * 2. Normal NDArray with DNNL layout (output from an DNNL operator) + * 3. Normal NDArray with DNNL layout whose DNNL memory may have different + * dimensions from the NDArray (result of DNNLDataReorderAsync). However, this * type of NDArrays only exists for weight arrays. I don't think we should * pass them to all operators. - * In the inference mode, the MKLDNN memory in the weight array will be + * In the inference mode, the DNNL memory in the weight array will be * reordered to 5 dimensions. * 4. Reshaped/sliced NDArray * 5. Reused NDArray (this is created by the MXNet executor). This type of * NDArrays can only be used as output arrays. * 6. Reused NDArray converted from an array with a different data type. * 7. Reused reshaped/sliced NDArray. - * 8. Reused NDArray with MKLDNN layout. - * 9. Reused NDArray with MKLDNN layout of different dimensions. + * 8. Reused NDArray with DNNL layout. + * 9. Reused NDArray with DNNL layout of different dimensions. * * Optional num_inputs / dim args can be passed to modify input shape (used for Concat test) */ inline std::vector GetTestOutputArrays(const mxnet::TShape& shp, - const std::vector& mds, + const std::vector& mds, std::vector scale = {1}, bool rand = true, int types = ArrayTypes::All, @@ -538,17 +532,17 @@ inline std::vector GetTestOutputArrays(const mxnet::TShape& shp, // Type 2, 3. 
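// A minimal standalone sketch (not taken from this patch) of how the ArrayTypes
// flags declared earlier in this header are meant to be combined and filtered:
// each value is a distinct power of two, so callers request several array kinds
// with bitwise OR and the generators above test membership with bitwise AND.
// Only a subset of the flags is reproduced here; the Wants() helper is a
// hypothetical name used for illustration.
#include <cstdio>

enum ArrayTypes {
  Normal         = 1,
  DNNL           = 2,
  DNNLDiffShape  = 4,
  NormalReshaped = 16,
  DNNLReshaped   = 32,
};

static bool Wants(int requested, ArrayTypes kind) {
  return (requested & kind) != 0;
}

int main() {
  int types = ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::DNNLReshaped;
  std::printf("plain arrays requested: %d\n", Wants(types, ArrayTypes::Normal));          // 1
  std::printf("DNNL-layout arrays requested: %d\n", Wants(types, ArrayTypes::DNNL));      // 1
  std::printf("reshaped plain arrays requested: %d\n", Wants(types, ArrayTypes::NormalReshaped));  // 0
  return 0;
}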
arr = NDArray(shape, Context()); - desc_str = "MKLDNN NDArray"; + desc_str = "DNNL NDArray"; if (shape.ndim() != md.data.ndims) { std::stringstream ss; - ss << "MKLDNN NDArray with different memory layout " << shape.ndim() << "/" << md.data.ndims; + ss << "DNNL NDArray with different memory layout " << shape.ndim() << "/" << md.data.ndims; desc_str = ss.str(); } - if ((types & ArrayTypes::MKLDNN && shape.ndim() == md.data.ndims) || - (types & ArrayTypes::MKLDNNDiffDim && shape.ndim() != md.data.ndims)) { + if ((types & ArrayTypes::DNNL && shape.ndim() == md.data.ndims) || + (types & ArrayTypes::DNNLDiffDim && shape.ndim() != md.data.ndims)) { in_arrs.emplace_back(arr, desc_str); - InitMKLDNNArray(&in_arrs.back().arr, md, rand, max); + InitDNNLArray(&in_arrs.back().arr, md, rand, max); } // Type 8, 9. @@ -557,17 +551,17 @@ inline std::vector GetTestOutputArrays(const mxnet::TShape& shp, s[0] = shape.Size(); NDArray arr = NDArray(s, Context()); arr = arr.AsArray(shape, arr.dtype()); - InitMKLDNNArray(&arr, md, rand, max); - desc_str = "Reused MKLDNN NDArray"; + InitDNNLArray(&arr, md, rand, max); + desc_str = "Reused DNNL NDArray"; if (shape.ndim() != md.data.ndims) { std::stringstream ss; - ss << "Reused MKLDNN NDArray with different memory layout " << shape.ndim() << "/" + ss << "Reused DNNL NDArray with different memory layout " << shape.ndim() << "/" << md.data.ndims; desc_str = ss.str(); } - if ((types & ArrayTypes::MKLDNNReused && shape.ndim() == md.data.ndims) || - (types & ArrayTypes::MKLDNNReusedDiffDim && shape.ndim() != md.data.ndims)) { + if ((types & ArrayTypes::DNNLReused && shape.ndim() == md.data.ndims) || + (types & ArrayTypes::DNNLReusedDiffDim && shape.ndim() != md.data.ndims)) { in_arrs.emplace_back(arr, desc_str); } } @@ -646,4 +640,4 @@ inline void VerifySumResult(const std::vector& in_arrs, } #endif // MXNET_USE_ONEDNN == 1 -#endif // TEST_MKLDNN_H_ +#endif // TEST_DNNL_H_ diff --git a/tests/cpp/operator/mkldnn_operator_test.cc b/tests/cpp/operator/dnnl_operator_test.cc similarity index 93% rename from tests/cpp/operator/mkldnn_operator_test.cc rename to tests/cpp/operator/dnnl_operator_test.cc index daae1bbc8ae7..7e2233c9b449 100644 --- a/tests/cpp/operator/mkldnn_operator_test.cc +++ b/tests/cpp/operator/dnnl_operator_test.cc @@ -18,14 +18,14 @@ */ /*! - * \file mkldnn_test.cc - * \brief test functions for mkldnn operators. + * \file dnnl_test.cc + * \brief test functions for dnnl operators. 
* \author Alex Zai */ #if MXNET_USE_ONEDNN == 1 -#include +#include #include #include @@ -33,13 +33,12 @@ #include "../../src/operator/nn/convolution-inl.h" #include "../../src/operator/nn/deconvolution-inl.h" -#include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h" -#include "../../src/operator/nn/mkldnn/mkldnn_ops-inl.h" -#include "../../src/operator/nn/mkldnn/mkldnn_pooling-inl.h" +#include "../../src/operator/nn/dnnl/dnnl_base-inl.h" +#include "../../src/operator/nn/dnnl/dnnl_ops-inl.h" +#include "../../src/operator/nn/dnnl/dnnl_pooling-inl.h" #include "../../src/operator/nn/pooling-inl.h" -#include "../include/test_mkldnn.h" +#include "../include/test_dnnl.h" #include "../include/test_util.h" - #include "gtest/gtest.h" #include "mxnet/imperative.h" @@ -197,10 +196,10 @@ OpAttrs GetLRNOp() { attrs.attrs.op->attr_parser(&attrs.attrs); attrs.accept_dims.insert(4); attrs.requests.insert(OpReqType::kWriteTo); - attrs.input_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped; - attrs.output_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped; + attrs.input_types = + ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::NormalReshaped | ArrayTypes::DNNLReshaped; + attrs.output_types = + ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::NormalReshaped | ArrayTypes::DNNLReshaped; return attrs; } @@ -225,10 +224,10 @@ OpAttrs GetSoftmaxOp() { attrs.accept_dims.insert({1, 2, 3, 4, 5}); attrs.requests.insert(OpReqType::kWriteTo); attrs.requests.insert(OpReqType::kWriteInplace); - attrs.input_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped; - attrs.output_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped; + attrs.input_types = + ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::NormalReshaped | ArrayTypes::DNNLReshaped; + attrs.output_types = + ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::NormalReshaped | ArrayTypes::DNNLReshaped; return attrs; } @@ -240,10 +239,10 @@ OpAttrs GetFullyConnectedOp() { attrs.num_outputs = 1; attrs.attrs.op->attr_parser(&attrs.attrs); attrs.requests.insert(OpReqType::kWriteTo); - attrs.input_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped; - attrs.output_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped; + attrs.input_types = + ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::NormalReshaped | ArrayTypes::DNNLReshaped; + attrs.output_types = + ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::NormalReshaped | ArrayTypes::DNNLReshaped; return attrs; } @@ -268,12 +267,12 @@ OpAttrs GetConvOp(int kernel, int num_filters, int dim, int stride, int pad) { attrs.attrs.dict.insert({"stride", CreateShapeString(stride, dim)}); attrs.attrs.dict.insert({"pad", CreateShapeString(pad, dim)}); attrs.attrs.op->attr_parser(&attrs.attrs); - attrs.input_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped | ArrayTypes::NormalReused | - ArrayTypes::MKLDNNReused | ArrayTypes::NormalReshapedReused; - attrs.output_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped | ArrayTypes::NormalReused | - ArrayTypes::MKLDNNReused | ArrayTypes::NormalReshapedReused | + attrs.input_types = ArrayTypes::Normal | ArrayTypes::DNNL | 
ArrayTypes::NormalReshaped | + ArrayTypes::DNNLReshaped | ArrayTypes::NormalReused | ArrayTypes::DNNLReused | + ArrayTypes::NormalReshapedReused; + attrs.output_types = ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::NormalReshaped | + ArrayTypes::DNNLReshaped | ArrayTypes::NormalReused | + ArrayTypes::DNNLReused | ArrayTypes::NormalReshapedReused | ArrayTypes::NormalReusedDiffDtype; return attrs; } @@ -301,12 +300,12 @@ OpAttrs GetDeconvOp(int kernel, int num_filters, int dim, int stride, int pad) { attrs.attrs.dict.insert({"stride", CreateShapeString(stride, dim)}); attrs.attrs.dict.insert({"pad", CreateShapeString(pad, dim)}); attrs.attrs.op->attr_parser(&attrs.attrs); - attrs.input_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped | ArrayTypes::NormalReused | - ArrayTypes::MKLDNNReused | ArrayTypes::NormalReshapedReused; - attrs.output_types = ArrayTypes::Normal | ArrayTypes::MKLDNN | ArrayTypes::NormalReshaped | - ArrayTypes::MKLDNNReshaped | ArrayTypes::NormalReused | - ArrayTypes::MKLDNNReused | ArrayTypes::NormalReshapedReused | + attrs.input_types = ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::NormalReshaped | + ArrayTypes::DNNLReshaped | ArrayTypes::NormalReused | ArrayTypes::DNNLReused | + ArrayTypes::NormalReshapedReused; + attrs.output_types = ArrayTypes::Normal | ArrayTypes::DNNL | ArrayTypes::NormalReshaped | + ArrayTypes::DNNLReshaped | ArrayTypes::NormalReused | + ArrayTypes::DNNLReused | ArrayTypes::NormalReshapedReused | ArrayTypes::NormalReusedDiffDtype; return attrs; } @@ -332,8 +331,8 @@ OpAttrs GetBNOp() { attrs.accept_dims.insert(4); attrs.requests.insert(OpReqType::kWriteTo); attrs.attrs.op->attr_parser(&attrs.attrs); - attrs.input_types = ArrayTypes::Normal | ArrayTypes::MKLDNN; - attrs.output_types = ArrayTypes::Normal | ArrayTypes::MKLDNN; + attrs.input_types = ArrayTypes::Normal | ArrayTypes::DNNL; + attrs.output_types = ArrayTypes::Normal | ArrayTypes::DNNL; return attrs; } @@ -448,8 +447,8 @@ void TestOp(const OpAttrs& attrs, VerifyFunc verify_fn) { std::vector> out_arrs(attrs.num_outputs); std::vector dispatches = attrs.dispatches; - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; + TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; if (attrs.requests.find(OpReqType::kWriteTo) != attrs.requests.end()) { std::vector in_arrs = GetTestInputArrays(); @@ -536,8 +535,8 @@ void TestConcatOp(const OpAttrs& attrs, VerifyFunc verify_fn, bool backwards = f std::vector req(attrs.num_outputs); std::vector dispatches = attrs.dispatches; - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; + TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; std::vector in_arrs = GetTestInputArrays(); @@ -643,8 +642,8 @@ void TestOpEx(const OpAttrs& forward_attrs, const OpAttrs& backwards_attrs) { std::vector ex_outputs(forward_attrs.num_outputs); std::vector req(forward_attrs.num_outputs); - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; + TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; std::vector in_arrs = GetTestInputArrays(forward_attrs.input_types, true); std::vector> out_arrs(forward_attrs.num_outputs); @@ -821,8 +820,8 @@ void TestOpExBN(const OpAttrs& forward_attrs, const OpAttrs& backwards_attrs) { std::vector ex_outputs(forward_attrs.num_outputs); std::vector req(forward_attrs.num_outputs); - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; + 
TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; std::vector in_arrs = GetTestInputArrays(forward_attrs.input_types, false); std::vector> out_arrs(forward_attrs.num_outputs); @@ -913,8 +912,8 @@ void TestFullyConnectedOp(const OpAttrs& forward_attrs, const OpAttrs& backwards std::vector req(forward_attrs.num_outputs); std::vector back_req(backwards_attrs.num_outputs); - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; + TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; std::vector in_arrs = GetTestInputArrays(forward_attrs.input_types, true, {1}, false, 1); @@ -1044,8 +1043,8 @@ void TestConvOp(const OpAttrs& forward_attrs, std::vector back_req(backwards_attrs.num_outputs); std::vector dispatches = forward_attrs.dispatches; - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; + TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; P param; param.Init(forward_attrs.attrs.dict); @@ -1187,8 +1186,8 @@ void TestPoolingOp(const OpAttrs& forward_attrs, const OpAttrs& backwards_attrs) std::vector back_req(backwards_attrs.num_outputs); std::vector dispatches = forward_attrs.dispatches; - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; + TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; mxnet::op::PoolingParam param; param.Init(forward_attrs.attrs.dict); @@ -1207,9 +1206,9 @@ void TestPoolingOp(const OpAttrs& forward_attrs, const OpAttrs& backwards_attrs) mxnet::TShape input_shape = in_arr.arr.shape(); if (input_shape.ndim() != kernel.ndim() + 2) continue; - // cannot pool if ndarray and mkldnn memory have different ndim + // cannot pool if ndarray and dnnl memory have different ndim if (in_arr.arr.IsView() || - in_arr.arr.GetMKLDNNData()->get_desc().data.ndims != in_arr.arr.shape().ndim()) + in_arr.arr.GetDNNLData()->get_desc().data.ndims != in_arr.arr.shape().ndim()) continue; std::vector scale_vector(in_arr.arr.shape().ndim()); for (int i = 0; i < in_arr.arr.shape().ndim(); i++) { @@ -1268,7 +1267,7 @@ void TestPoolingOp(const OpAttrs& forward_attrs, const OpAttrs& backwards_attrs) } // needs copies of inputs since they be reused in next iteration - // cannot use Copy method since we need to maintain MKLDNN format + // cannot use Copy method since we need to maintain DNNL format auto tmp_output = GetTestInputArrays()[i1]; auto tmp_output2 = GetTestInputArrays()[i1]; backwards_outputs[0] = &tmp_output.arr; @@ -1379,7 +1378,7 @@ TEST(IMPERATIVE, PoolingOp) { } TEST(IMPERATIVE, ConvOp) { - int dim = 2; // MKLDNN conv only supports 2d kernels + int dim = 2; // DNNL conv only supports 2d kernels for (size_t num_filters = 2; num_filters < 3; ++num_filters) { for (size_t kernel = 1; kernel < 4; ++kernel) { for (size_t stride = 1; stride < 3; ++stride) { @@ -1396,7 +1395,7 @@ TEST(IMPERATIVE, ConvOp) { } TEST(IMPERATIVE, DeconvOp) { - int dim = 2; // MKLDNN deconv only supports 2d kernels + int dim = 2; // DNNL deconv only supports 2d kernels for (size_t num_filters = 2; num_filters < 3; ++num_filters) { for (size_t kernel = 1; kernel < 3; ++kernel) { for (size_t stride = 1; stride < 3; ++stride) { diff --git a/tests/cpp/operator/mkldnn_test.cc b/tests/cpp/operator/dnnl_test.cc similarity index 64% rename from tests/cpp/operator/mkldnn_test.cc rename to tests/cpp/operator/dnnl_test.cc index 6f03a668e7e0..84b1a5af2c43 100644 --- a/tests/cpp/operator/mkldnn_test.cc +++ b/tests/cpp/operator/dnnl_test.cc @@ -18,23 +18,22 @@ */ /*! 
- * \file mkldnn_test.cc - * \brief test functions in mkldnn. + * \file dnnl_test.cc + * \brief test functions in dnnl. * \author Da Zheng */ #if MXNET_USE_ONEDNN == 1 -#include +#include #include #include #include -#include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h" -#include "../../src/operator/nn/mkldnn/mkldnn_ops-inl.h" -#include "../include/test_mkldnn.h" - +#include "../../src/operator/nn/dnnl/dnnl_base-inl.h" +#include "../../src/operator/nn/dnnl/dnnl_ops-inl.h" +#include "../include/test_dnnl.h" #include "gtest/gtest.h" #include "mxnet/imperative.h" @@ -54,7 +53,7 @@ bool test_mem_align(void* mem, size_t size, size_t alignment, size_t space) { } #endif -TEST(MKLDNN_UTIL_FUNC, AlignMem) { +TEST(DNNL_UTIL_FUNC, AlignMem) { #if __GNUC__ >= 5 size_t alignment = 4096; void* mem; @@ -90,8 +89,8 @@ TEST(MKLDNN_UTIL_FUNC, AlignMem) { #endif } -static void VerifyDefMem(const mkldnn::memory& mem) { - mkldnn::memory::desc desc = mem.get_desc(); +static void VerifyDefMem(const dnnl::memory& mem) { + dnnl::memory::desc desc = mem.get_desc(); mshadow::default_real_t* data = static_cast(mem.get_data_handle()); size_t size = desc.get_size() / sizeof(mshadow::default_real_t); size_t num_same = 0; @@ -100,39 +99,39 @@ static void VerifyDefMem(const mkldnn::memory& mem) { EXPECT_EQ(num_same, size); } -TEST(MKLDNN_UTIL_FUNC, MemFormat) { +TEST(DNNL_UTIL_FUNC, MemFormat) { // Check whether the number of format is correct. - CHECK_EQ(mkldnn_format_tag_last, 385); - CHECK_EQ(mkldnn_nchw, 5); - CHECK_EQ(mkldnn_oihw, 5); + CHECK_EQ(dnnl_format_tag_last, 385); + CHECK_EQ(dnnl_nchw, 5); + CHECK_EQ(dnnl_oihw, 5); } -static void VerifyMem(const mkldnn::memory& mem) { - mkldnn::memory::desc desc = mem.get_desc(); - mkldnn::memory::dims dims(desc.data.ndims); +static void VerifyMem(const dnnl::memory& mem) { + dnnl::memory::desc desc = mem.get_desc(); + dnnl::memory::dims dims(desc.data.ndims); for (size_t i = 0; i < dims.size(); i++) dims[i] = desc.data.dims[i]; - mkldnn::memory::desc new_desc{dims, - static_cast(desc.data.data_type), - static_cast(GetDefaultFormat(desc))}; + dnnl::memory::desc new_desc{dims, + static_cast(desc.data.data_type), + static_cast(GetDefaultFormat(desc))}; if (desc == new_desc) { VerifyDefMem(mem); } else { - mkldnn::memory* src_mem = const_cast(&mem); - mkldnn::memory new_mem(new_desc, CpuEngine::Get()->get_engine()); + dnnl::memory* src_mem = const_cast(&mem); + dnnl::memory new_mem(new_desc, CpuEngine::Get()->get_engine()); - mkldnn::stream s(CpuEngine::Get()->get_engine()); - mkldnn::reorder(*src_mem, new_mem).execute(s, *src_mem, new_mem); + dnnl::stream s(CpuEngine::Get()->get_engine()); + dnnl::reorder(*src_mem, new_mem).execute(s, *src_mem, new_mem); VerifyDefMem(new_mem); } } -TEST(MKLDNN_NDArray, GetDataReorder) { - TestArrayShapes tas = GetTestArrayShapes(); - mxnet::ShapeVector shapes = tas.shapes; - std::vector mds = tas.mds; +TEST(DNNL_NDArray, GetDataReorder) { + TestArrayShapes tas = GetTestArrayShapes(); + mxnet::ShapeVector shapes = tas.shapes; + std::vector mds = tas.mds; // Reorder from the default to any other layout. 
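// A standalone sketch (not taken from this patch) of the plain dnnl::reorder that
// TEST(DNNL_NDArray, GetDataReorder) exercises through NDArray::GetDNNLDataReorder():
// the same logical tensor is copied from the default nchw layout into a blocked
// layout. Assumes the oneDNN v2.x C++ API; the tensor sizes and the nChw8c tag are
// arbitrary illustrative choices, and -ldnnl is assumed when compiling.
#include <vector>
#include <dnnl.hpp>

int main() {
  using tag = dnnl::memory::format_tag;
  using dt  = dnnl::memory::data_type;
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  dnnl::memory::dims dims{1, 8, 4, 4};
  std::vector<float> src_buf(1 * 8 * 4 * 4, 1.0f);

  // Source uses the caller-provided plain buffer; destination lets the library
  // allocate storage for the blocked layout.
  dnnl::memory src({dims, dt::f32, tag::nchw}, eng, src_buf.data());
  dnnl::memory dst({dims, dt::f32, tag::nChw8c}, eng);

  dnnl::reorder(src, dst).execute(strm, src, dst);
  strm.wait();
  return 0;
}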
for (auto s : shapes) { @@ -140,7 +139,7 @@ TEST(MKLDNN_NDArray, GetDataReorder) { InitDefaultArray(&arr); for (auto md : mds) { if (s.Size() == md.get_size() / sizeof(mshadow::default_real_t)) { - const mkldnn::memory* mem = arr.GetMKLDNNDataReorder(md); + const dnnl::memory* mem = arr.GetDNNLDataReorder(md); printf("reorder from ("); for (size_t i = 0; i < s.ndim(); i++) printf("%ld, ", s[i]); @@ -148,9 +147,9 @@ TEST(MKLDNN_NDArray, GetDataReorder) { for (int i = 0; i < md.data.ndims; i++) printf("%ld, ", md.data.dims[i]); printf("), format: %d\n", static_cast(GetDefaultFormat(md))); - MKLDNNStream::Get()->Submit(false); + DNNLStream::Get()->Submit(false); VerifyMem(*mem); - MKLDNNStream::Get()->Cleanup(); + DNNLStream::Get()->Cleanup(); } } } @@ -161,18 +160,18 @@ TEST(MKLDNN_NDArray, GetDataReorder) { if (md.get_size() / sizeof(mshadow::default_real_t) == s.Size()) { NDArray arr(s, Context()); // There is possibility that the dimensions of an NDArray doesn't match - // with the MKLDNN memory inside. + // with the DNNL memory inside. printf("Init array ("); for (size_t i = 0; i < s.ndim(); i++) printf("%ld, ", s[i]); - printf(") with MKLDNN memory ("); + printf(") with DNNL memory ("); for (int i = 0; i < md.data.ndims; i++) printf("%ld, ", md.data.dims[i]); printf("), format: %d\n", static_cast(GetDefaultFormat(md))); - InitMKLDNNArray(&arr, md); + InitDNNLArray(&arr, md); for (auto to_md : mds) { if (to_md.get_size() / sizeof(mshadow::default_real_t) == s.Size()) { - const mkldnn::memory* mem = arr.GetMKLDNNDataReorder(to_md); + const dnnl::memory* mem = arr.GetDNNLDataReorder(to_md); printf("reorder from ("); for (size_t i = 0; i < s.ndim(); i++) printf("%ld, ", s[i]); @@ -180,9 +179,9 @@ TEST(MKLDNN_NDArray, GetDataReorder) { for (int i = 0; i < to_md.data.ndims; i++) printf("%ld, ", to_md.data.dims[i]); printf("), format: %d\n", static_cast(GetDefaultFormat(to_md))); - MKLDNNStream::Get()->Submit(false); + DNNLStream::Get()->Submit(false); VerifyMem(*mem); - MKLDNNStream::Get()->Cleanup(); + DNNLStream::Get()->Cleanup(); } } } @@ -190,30 +189,30 @@ TEST(MKLDNN_NDArray, GetDataReorder) { } } -TEST(MKLDNN_BASE, MKLDNNSum) { - std::vector in_arrs = GetTestInputArrays(); - std::vector in_arrs2 = GetTestInputArrays(ArrayTypes::All, true); - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; +TEST(DNNL_BASE, DNNLSum) { + std::vector in_arrs = GetTestInputArrays(); + std::vector in_arrs2 = GetTestInputArrays(ArrayTypes::All, true); + TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; for (int i = 0; i < in_arrs.size(); i++) { auto in_arr = in_arrs[i]; auto in_arr2 = in_arrs2[i]; - if (!SupportMKLDNN(in_arr.arr)) + if (!SupportDNNL(in_arr.arr)) continue; - if (in_arr.arr.IsMKLDNNData() && in_arr.arr.IsView()) { + if (in_arr.arr.IsDNNLData() && in_arr.arr.IsView()) { continue; } std::vector out_arrs = GetTestOutputArrays(in_arr.arr.shape(), mds); for (auto& out_arr : out_arrs) { - auto in_mem1 = in_arr.arr.GetMKLDNNData(); - auto in_mem2 = in_arr2.arr.GetMKLDNNData(); + auto in_mem1 = in_arr.arr.GetDNNLData(); + auto in_mem2 = in_arr2.arr.GetDNNLData(); if (out_arr.arr.IsView()) continue; - auto out_mem = out_arr.arr.GetMKLDNNData(); + auto out_mem = out_arr.arr.GetDNNLData(); PrintVerifyMsg(in_arr, in_arr); - op::MKLDNNSum(*in_mem1, *in_mem2, *out_mem); - MKLDNNStream::Get()->Submit(); + op::DNNLSum(*in_mem1, *in_mem2, *out_mem); + DNNLStream::Get()->Submit(); VerifySumResult({&in_arr.arr, &in_arr2.arr}, {&out_arr.arr}); } } @@ -222,50 +221,50 
@@ TEST(MKLDNN_BASE, MKLDNNSum) { for (int i = 0; i < in_arrs.size(); i++) { auto in_arr = in_arrs[i]; auto in_arr2 = in_arrs2[i]; - if (!SupportMKLDNN(in_arr.arr)) + if (!SupportDNNL(in_arr.arr)) continue; - if (in_arr.arr.IsMKLDNNData() && in_arr.arr.IsView()) { + if (in_arr.arr.IsDNNLData() && in_arr.arr.IsView()) { continue; } - auto input_mem = in_arr.arr.GetMKLDNNData(); - auto input_mem2 = in_arr2.arr.GetMKLDNNData(); + auto input_mem = in_arr.arr.GetDNNLData(); + auto input_mem2 = in_arr2.arr.GetDNNLData(); NDArrayAttrs orig_arr(in_arr.arr.Copy(in_arr.arr.ctx()), "In Place Copy"); orig_arr.arr.WaitToRead(); PrintVerifyMsg(orig_arr, in_arr); - InitMKLDNNArray(&orig_arr.arr, input_mem->get_desc()); + InitDNNLArray(&orig_arr.arr, input_mem->get_desc()); orig_arr.arr.CopyFrom(*input_mem); - op::MKLDNNSum(*input_mem, *input_mem2, *input_mem); - MKLDNNStream::Get()->Submit(); + op::DNNLSum(*input_mem, *input_mem2, *input_mem); + DNNLStream::Get()->Submit(); VerifySumResult({&orig_arr.arr, &in_arr2.arr}, {&in_arr.arr}); } } -TEST(MKLDNN_BASE, CreateMKLDNNMem) { - std::vector in_arrs = GetTestInputArrays(); - std::vector in_arrs2 = GetTestInputArrays(ArrayTypes::All, true); - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; - MKLDNNStream* stream = MKLDNNStream::Get(); +TEST(DNNL_BASE, CreateDNNLMem) { + std::vector in_arrs = GetTestInputArrays(); + std::vector in_arrs2 = GetTestInputArrays(ArrayTypes::All, true); + TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; + DNNLStream* stream = DNNLStream::Get(); // kWriteTo for (int i = 0; i < in_arrs.size(); i++) { auto in_arr = in_arrs[i]; auto in_arr2 = in_arrs2[i]; - if (!SupportMKLDNN(in_arr.arr)) + if (!SupportDNNL(in_arr.arr)) continue; - if (in_arr.arr.IsMKLDNNData() && in_arr.arr.IsView()) { + if (in_arr.arr.IsDNNLData() && in_arr.arr.IsView()) { continue; } std::vector out_arrs = GetTestOutputArrays(in_arr.arr.shape(), mds); for (auto& out_arr : out_arrs) { - auto in_mem = in_arr.arr.GetMKLDNNData(); - auto in_mem2 = in_arr2.arr.GetMKLDNNData(); + auto in_mem = in_arr.arr.GetDNNLData(); + auto in_mem2 = in_arr2.arr.GetDNNLData(); NDArray orig_output = out_arr.arr.Copy(out_arr.arr.ctx()); orig_output.WaitToRead(); PrintVerifyMsg(in_arr, out_arr); - auto out_mem = out_arr.arr.GetMKLDNNData(); - auto output_mem_t = CreateMKLDNNMem(out_arr.arr, out_mem->get_desc(), kWriteTo); - op::MKLDNNSum(*in_mem, *in_mem2, *output_mem_t.second); + auto out_mem = out_arr.arr.GetDNNLData(); + auto output_mem_t = CreateDNNLMem(out_arr.arr, out_mem->get_desc(), kWriteTo); + op::DNNLSum(*in_mem, *in_mem2, *output_mem_t.second); CommitOutput(out_arr.arr, output_mem_t); stream->Submit(); VerifySumResult({&in_arr.arr, &in_arr2.arr}, {&out_arr.arr}); @@ -276,21 +275,21 @@ TEST(MKLDNN_BASE, CreateMKLDNNMem) { for (int i = 0; i < in_arrs.size(); i++) { auto in_arr = in_arrs[i]; auto in_arr2 = in_arrs2[i]; - if (!SupportMKLDNN(in_arr.arr)) + if (!SupportDNNL(in_arr.arr)) continue; - if (in_arr.arr.IsMKLDNNData() && in_arr.arr.IsView()) { + if (in_arr.arr.IsDNNLData() && in_arr.arr.IsView()) { continue; } - auto input_mem = in_arr.arr.GetMKLDNNData(); - auto input_mem2 = in_arr2.arr.GetMKLDNNData(); + auto input_mem = in_arr.arr.GetDNNLData(); + auto input_mem2 = in_arr2.arr.GetDNNLData(); NDArrayAttrs orig_arr(in_arr.arr.Copy(in_arr.arr.ctx()), "In Place Copy"); orig_arr.arr.WaitToRead(); PrintVerifyMsg(orig_arr, in_arr); - InitMKLDNNArray(&orig_arr.arr, input_mem->get_desc()); + InitDNNLArray(&orig_arr.arr, 
input_mem->get_desc()); orig_arr.arr.CopyFrom(*input_mem); auto output_mem_t = - CreateMKLDNNMem(in_arr.arr, input_mem->get_desc(), kWriteInplace, &in_arr.arr); - op::MKLDNNSum(*input_mem, *input_mem2, *output_mem_t.second); + CreateDNNLMem(in_arr.arr, input_mem->get_desc(), kWriteInplace, &in_arr.arr); + op::DNNLSum(*input_mem, *input_mem2, *output_mem_t.second); CommitOutput(in_arr.arr, output_mem_t); stream->Submit(); VerifySumResult({&orig_arr.arr, &in_arr2.arr}, {&in_arr.arr}); @@ -300,21 +299,21 @@ TEST(MKLDNN_BASE, CreateMKLDNNMem) { for (int i = 0; i < in_arrs.size(); i++) { auto in_arr = in_arrs[i]; auto in_arr2 = in_arrs2[i]; - if (!SupportMKLDNN(in_arr.arr)) + if (!SupportDNNL(in_arr.arr)) continue; - if (in_arr.arr.IsMKLDNNData() && in_arr.arr.IsView()) { + if (in_arr.arr.IsDNNLData() && in_arr.arr.IsView()) { continue; } std::vector out_arrs = GetTestOutputArrays(in_arr.arr.shape(), mds); for (auto& out_arr : out_arrs) { - auto in_mem = in_arr.arr.GetMKLDNNData(); - auto in_mem2 = in_arr2.arr.GetMKLDNNData(); + auto in_mem = in_arr.arr.GetDNNLData(); + auto in_mem2 = in_arr2.arr.GetDNNLData(); NDArray orig_output = out_arr.arr.Copy(out_arr.arr.ctx()); orig_output.WaitToRead(); PrintVerifyMsg(in_arr, out_arr); - auto out_mem = out_arr.arr.GetMKLDNNData(); - auto output_mem_t = CreateMKLDNNMem(out_arr.arr, out_mem->get_desc(), kAddTo); - op::MKLDNNSum(*in_mem, *in_mem2, *output_mem_t.second); + auto out_mem = out_arr.arr.GetDNNLData(); + auto output_mem_t = CreateDNNLMem(out_arr.arr, out_mem->get_desc(), kAddTo); + op::DNNLSum(*in_mem, *in_mem2, *output_mem_t.second); CommitOutput(out_arr.arr, output_mem_t); stream->Submit(); VerifyAddRequest( @@ -326,20 +325,20 @@ TEST(MKLDNN_BASE, CreateMKLDNNMem) { for (int i = 0; i < in_arrs.size(); i++) { auto in_arr = in_arrs[i]; auto in_arr2 = in_arrs2[i]; - if (!SupportMKLDNN(in_arr.arr)) + if (!SupportDNNL(in_arr.arr)) continue; - if (in_arr.arr.IsMKLDNNData() && in_arr.arr.IsView()) { + if (in_arr.arr.IsDNNLData() && in_arr.arr.IsView()) { continue; } - auto input_mem = in_arr.arr.GetMKLDNNData(); - auto input_mem2 = in_arr2.arr.GetMKLDNNData(); + auto input_mem = in_arr.arr.GetDNNLData(); + auto input_mem2 = in_arr2.arr.GetDNNLData(); NDArrayAttrs orig_arr(in_arr.arr.Copy(in_arr.arr.ctx()), "In Place Copy"); orig_arr.arr.WaitToRead(); PrintVerifyMsg(orig_arr, in_arr); - InitMKLDNNArray(&orig_arr.arr, input_mem->get_desc()); + InitDNNLArray(&orig_arr.arr, input_mem->get_desc()); orig_arr.arr.CopyFrom(*input_mem); - auto output_mem_t = CreateMKLDNNMem(in_arr.arr, input_mem->get_desc(), kNullOp); - op::MKLDNNSum(*input_mem, *input_mem2, *output_mem_t.second); + auto output_mem_t = CreateDNNLMem(in_arr.arr, input_mem->get_desc(), kNullOp); + op::DNNLSum(*input_mem, *input_mem2, *output_mem_t.second); CommitOutput(in_arr.arr, output_mem_t); stream->Submit(); // original and input should be the same since noop @@ -347,7 +346,7 @@ TEST(MKLDNN_BASE, CreateMKLDNNMem) { } } -TEST(MKLDNN_NDArray, GetTestInputArraysConcat) { +TEST(DNNL_NDArray, GetTestInputArraysConcat) { auto in_arrs = GetTestInputArrays(); for (int dim = 0; dim < 5; dim++) { for (int num_inputs = 2; num_inputs < 5; num_inputs++) { @@ -371,10 +370,10 @@ TEST(MKLDNN_NDArray, GetTestInputArraysConcat) { } } -TEST(MKLDNN_NDArray, GetTestOutputArraysConcat) { - auto shapes_pds = GetTestArrayShapes(); - std::vector shapes = shapes_pds.shapes; - std::vector mds = shapes_pds.mds; +TEST(DNNL_NDArray, GetTestOutputArraysConcat) { + auto shapes_pds = GetTestArrayShapes(); + std::vector 
shapes = shapes_pds.shapes; + std::vector mds = shapes_pds.mds; for (auto& shape : shapes) { for (int dim = 0; dim < 5; dim++) { for (int num_inputs = 2; num_inputs < 5; num_inputs++) { @@ -397,19 +396,19 @@ TEST(MKLDNN_NDArray, GetTestOutputArraysConcat) { } } -TEST(MKLDNN_NDArray, CopyFrom) { - TestArrayShapes tas = GetTestArrayShapes(); - std::vector mds = tas.mds; +TEST(DNNL_NDArray, CopyFrom) { + TestArrayShapes tas = GetTestArrayShapes(); + std::vector mds = tas.mds; std::vector in_arrs = GetTestInputArrays(); for (auto& in_arr : in_arrs) { - if (in_arr.arr.IsMKLDNNData() && in_arr.arr.IsView()) + if (in_arr.arr.IsDNNLData() && in_arr.arr.IsView()) continue; std::vector out_arrs = GetTestOutputArrays(in_arr.arr.shape(), mds); for (auto& out_arr : out_arrs) { - const mkldnn::memory* mem = in_arr.arr.GetMKLDNNData(); + const dnnl::memory* mem = in_arr.arr.GetDNNLData(); out_arr.arr.CopyFrom(*mem); - MKLDNNStream::Get()->Submit(); + DNNLStream::Get()->Submit(); std::vector inputs(1); inputs[0] = &in_arr.arr; VerifyCopyResult(inputs, {&out_arr.arr}); diff --git a/tests/cpp/storage/storage_test.cc b/tests/cpp/storage/storage_test.cc index 3d957cd619fe..8cd7fd2e8569 100644 --- a/tests/cpp/storage/storage_test.cc +++ b/tests/cpp/storage/storage_test.cc @@ -49,12 +49,12 @@ TEST(Storage, Basic_CPU) { TEST(Storage, CPU_MemAlign) { #if MXNET_USE_ONEDNN == 1 - // MKLDNN requires special alignment. 64 is used by the MKLDNN library in + // DNNL requires special alignment. 64 is used by the DNNL library in // memory allocation. - static constexpr size_t alignment_ = mxnet::kMKLDNNAlign; - #else - static constexpr size_t alignment_ = 16; - #endif + static constexpr size_t alignment_ = mxnet::kDNNLAlign; +#else + static constexpr size_t alignment_ = 16; +#endif auto&& storage = mxnet::Storage::Get(); mxnet::Context context_cpu = mxnet::Context::CPU(0); diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index 841a0b905e2d..e16255b9ad4d 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -280,7 +280,7 @@ def get_np_mean_var(data, running_mean, running_var, eps, use_global_status=True # calculate the inverse of standard variance invstdvar = 1. 
/ np.sqrt(var + eps) return mean, invstdvar - # Here use 4D input to cover mkldnn BN and non-mkldnn BN + # Here use 4D input to cover dnnl BN and non-dnnl BN shape = (1, 2, LARGE_X, SMALL_Y) axis = 1 # default eps = 1e-3 diff --git a/tests/nightly/test_np_large_array.py b/tests/nightly/test_np_large_array.py index c5f2a8fe4e6e..b827546ae2ea 100644 --- a/tests/nightly/test_np_large_array.py +++ b/tests/nightly/test_np_large_array.py @@ -2066,7 +2066,7 @@ def test_rnn_dim_check(): @use_np -@pytest.mark.skip(reason='runs without MKLDNN, wtih is not default behavior') +@pytest.mark.skip(reason='runs without DNNL, which is not the default behavior') def test_rnn_vanilla(): L_SEQ, BAT, L_INP, L_STA = 2**20, 4, 2**10, 2 def batch_check(x, modes, params): diff --git a/tests/python/mkl/data/test_mkldnn_test_mkldnn_model_model1.json b/tests/python/dnnl/data/test_dnnl_test_dnnl_model_model1.json similarity index 100% rename from tests/python/mkl/data/test_mkldnn_test_mkldnn_model_model1.json rename to tests/python/dnnl/data/test_dnnl_test_dnnl_model_model1.json diff --git a/tests/python/mkl/subgraphs/subgraph_common.py b/tests/python/dnnl/subgraphs/subgraph_common.py similarity index 94% rename from tests/python/mkl/subgraphs/subgraph_common.py rename to tests/python/dnnl/subgraphs/subgraph_common.py index 4467443166cb..349cf628866e 100644 --- a/tests/python/mkl/subgraphs/subgraph_common.py +++ b/tests/python/dnnl/subgraphs/subgraph_common.py @@ -29,16 +29,16 @@ OP_NAME='op_name' QUANTIZED_OP_NAME='quantized_op_name' -SG_PASS_NAME='MKLDNN' -QUANTIZE_SG_PASS_NAME='MKLDNN_QUANTIZE' +SG_PASS_NAME='ONEDNN' +QUANTIZE_SG_PASS_NAME='ONEDNN_QUANTIZE' config = { 'conv': { - OP_NAME: 'sg_mkldnn_conv', - QUANTIZED_OP_NAME: 'quantized_sg_mkldnn_conv' + OP_NAME: 'sg_onednn_conv', + QUANTIZED_OP_NAME: 'quantized_sg_onednn_conv' }, 'fc': { - OP_NAME: 'sg_mkldnn_fully_connected', - QUANTIZED_OP_NAME: 'quantized_sg_mkldnn_fully_connected' + OP_NAME: 'sg_onednn_fully_connected', + QUANTIZED_OP_NAME: 'quantized_sg_onednn_fully_connected' } } @@ -90,16 +90,16 @@ def check_qsym_calibrated(qsym, out_type, name='conv'): if k.find('_quantize') != -1: assert v['out_type'] == out_type if k.find(quantized_op_name) != -1: - if quantized_op_name.startswith("quantized_sg_mkldnn_fully_connected") and 'enable_float_output' in v: + if quantized_op_name.startswith("quantized_sg_onednn_fully_connected") and 'enable_float_output' in v: continue assert 'min_calib_range' in v assert 'max_calib_range' in v def check_qsym_scale_align(qsym): - assert ''.join(qsym.attr_dict().keys()).find('quantized_sg_mkldnn_conv') != -1 + assert ''.join(qsym.attr_dict().keys()).find('quantized_sg_onednn_conv') != -1 init = False for k, v in qsym.attr_dict().items(): - if k.find('quantized_sg_mkldnn_conv') != -1: + if k.find('quantized_sg_onednn_conv') != -1: assert 'min_calib_range' in v assert 'max_calib_range' in v if not init: diff --git a/tests/python/mkl/subgraphs/test_conv_subgraph.py b/tests/python/dnnl/subgraphs/test_conv_subgraph.py similarity index 98% rename from tests/python/mkl/subgraphs/test_conv_subgraph.py rename to tests/python/dnnl/subgraphs/test_conv_subgraph.py index e965fab9e4c7..5154d241a926 100644 --- a/tests/python/mkl/subgraphs/test_conv_subgraph.py +++ b/tests/python/dnnl/subgraphs/test_conv_subgraph.py @@ -151,8 +151,8 @@ def forward(self, x): out = self.act(self.conv0(x)) + self.conv1(x) return out - attrs = 
{'sg_onednn_conv_act_0': {'with_act': 'true'}, + 'sg_onednn_conv_add_1': {'with_sum': 'true'}} net = ConvActAdd(use_bias, alg) check_fusion(net, data_shape, attrs, check_quantization=quantize) @@ -397,7 +397,7 @@ def forward(self, x): @pytest.mark.parametrize('reverse_sum_order', [True, False]) @pytest.mark.parametrize('dedup_subgraph', [True, False]) def test_conv_bn_sum(data_shape, reverse_sum_order, dedup_subgraph): - attr = {'sg_mkldnn_conv_bn_add_0' : {'with_bn': 'true'}} + attr = {'sg_onednn_conv_bn_add_0' : {'with_bn': 'true'}} # channels after conv+bn should be same as input channels net = ConvBNSum(channels=data_shape[1] ,reverse_sum_order=reverse_sum_order) check_fusion(net, data_shape, attr, out_types=['int8', 'auto'], dedup_subgraph=dedup_subgraph) @@ -426,7 +426,7 @@ def forward(self, x): @pytest.mark.parametrize('reverse_sum_order', [True, False]) @pytest.mark.parametrize('dedup_subgraph', [True, False]) def test_mobilenetv2_struct(data_shape, reverse_sum_order, dedup_subgraph): - attr = {'sg_mkldnn_conv_bn_0' : {'with_bn': 'true'}} + attr = {'sg_onednn_conv_bn_0' : {'with_bn': 'true'}} net = MobileNetV2Struct(reverse_sum_order=reverse_sum_order) check_fusion(net, data_shape, attr, out_types=['int8', 'auto'], dedup_subgraph=dedup_subgraph) @@ -446,10 +446,10 @@ def test_deduplication(data_shape, reverse_sum_order, model_name): model_dedup.initialize() model_no_dedup = copy.copy(model_dedup) - model_dedup.optimize_for(data_nd, backend='MKLDNN', dedup_subgraph = True, skip_infer = True) + model_dedup.optimize_for(data_nd, backend='DNNL', dedup_subgraph = True, skip_infer = True) out = model_dedup(data_nd) - model_dedup.optimize_for(data_nd, backend='MKLDNN', dedup_subgraph = False, skip_infer = True) + model_dedup.optimize_for(data_nd, backend='DNNL', dedup_subgraph = False, skip_infer = True) out_dedup = model_no_dedup(data_nd) assert_almost_equal(out.asnumpy(), out_dedup.asnumpy(), rtol=1e-3, atol=1e-1) @@ -776,7 +776,7 @@ def test_bn_relu_fusion(axis): out1 = net(dummy_data) out1.wait_to_read() - net.optimize_for(dummy_data, backend='MKLDNN') + net.optimize_for(dummy_data, backend='DNNL') out2 = net(dummy_data) assert_almost_equal(out1, out2) diff --git a/tests/python/mkl/subgraphs/test_fc_subgraph.py b/tests/python/dnnl/subgraphs/test_fc_subgraph.py similarity index 100% rename from tests/python/mkl/subgraphs/test_fc_subgraph.py rename to tests/python/dnnl/subgraphs/test_fc_subgraph.py diff --git a/tests/python/mkl/subgraphs/test_transformer_subgraph.py b/tests/python/dnnl/subgraphs/test_transformer_subgraph.py similarity index 98% rename from tests/python/mkl/subgraphs/test_transformer_subgraph.py rename to tests/python/dnnl/subgraphs/test_transformer_subgraph.py index 06daaf2ec24e..0c24bc26cfc5 100644 --- a/tests/python/mkl/subgraphs/test_transformer_subgraph.py +++ b/tests/python/dnnl/subgraphs/test_transformer_subgraph.py @@ -67,7 +67,7 @@ def forward(self, x, mask): net.hybridize() ref_out = net(in_data, mask) - fused_net.optimize_for(in_data, mask, backend="MKLDNN") + fused_net.optimize_for(in_data, mask, backend="DNNL") out = fused_net(in_data, mask) mx.nd.waitall() diff --git a/tests/python/mkl/test_amp.py b/tests/python/dnnl/test_amp.py similarity index 100% rename from tests/python/mkl/test_amp.py rename to tests/python/dnnl/test_amp.py diff --git a/tests/python/mkl/test_bf16_operator.py b/tests/python/dnnl/test_bf16_operator.py similarity index 100% rename from tests/python/mkl/test_bf16_operator.py rename to tests/python/dnnl/test_bf16_operator.py diff --git 
a/tests/python/mkl/test_mkldnn.py b/tests/python/dnnl/test_dnnl.py similarity index 97% rename from tests/python/mkl/test_mkldnn.py rename to tests/python/dnnl/test_dnnl.py index 8d855e65cfb0..713a14b2fb01 100644 --- a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/dnnl/test_dnnl.py @@ -16,7 +16,7 @@ # under the License. """ -MKL-DNN related test cases +DNNL related test cases """ import sys import os @@ -33,7 +33,7 @@ @use_np @pytest.mark.seed(1234) -def test_mkldnn_ndarray_slice(): +def test_dnnl_ndarray_slice(): ctx = mx.cpu() net = gluon.nn.HybridSequential() net.add(gluon.nn.Conv2D(channels=32, kernel_size=3, activation=None)) @@ -46,7 +46,7 @@ def test_mkldnn_ndarray_slice(): @use_np @pytest.mark.seed(1234) -def test_mkldnn_engine_threading(): +def test_dnnl_engine_threading(): net = gluon.nn.HybridSequential() net.add(gluon.nn.Conv2D(channels=32, kernel_size=3, activation=None)) net.initialize(ctx=mx.cpu()) @@ -59,18 +59,18 @@ def __getitem__(self, key): loader = gluon.data.DataLoader(Dummy(), batch_size=2, num_workers=1) X = (32, 3, 32, 32) - # trigger mkldnn execution thread + # trigger dnnl execution thread y = net(mx.np.array(np.ones(X))).asnumpy() # Use Gluon dataloader to trigger different thread. # below line triggers different execution thread for _ in loader: y = net(mx.np.array(np.ones(X))).asnumpy() - # output should be 056331709 (non-mkldnn mode output) + # output should be 056331709 (non-dnnl mode output) assert_almost_equal(y[0, 0, 0, 0], np.array(0.056331709)) break -def test_mkldnn_reshape(): +def test_dnnl_reshape(): def test_reshape_after_conv(dst_shape): shape = (1,1,4,4) data = mx.symbol.Variable('data') @@ -95,7 +95,7 @@ def test_reshape_after_conv(dst_shape): assert_almost_equal(outputs, data_npy.reshape(dst_shape)) - # Test mkldnn reshape (Using shape) + # Test dnnl reshape (Using shape) test_cases = [(256), (16, 16), (4, 4, 16), (4, 4, 4, 4)] for test_case in test_cases: test_reshape_after_conv(test_case) @@ -222,7 +222,7 @@ def test_flatten_slice_after_conv(): print(p[0]) -def test_mkldnn_sum_with_mkldnn_layout(): +def test_dnnl_sum_with_dnnl_layout(): x_shape = (32, 3, 224, 224) x_npy = np.ones(x_shape, dtype='float32') @@ -237,14 +237,14 @@ def test_mkldnn_sum_with_mkldnn_layout(): inputs = [] for _ in range(i): inputs.append(z) - y = mx.sym.add_n(*inputs) # (only MKLDNN data input) + y = mx.sym.add_n(*inputs) # (only DNNL data input) exe = y._simple_bind(ctx=mx.cpu(), x=x_shape, w=w_shape) out = exe.forward(is_train=False, x=x_npy, w=np.ones(w_shape))[0] #conv with kernel (3,3) on ones should give result=27 single_cov = 27.0 assert_almost_equal(out[0].asnumpy()[0, 0, 0], single_cov*i) -def test_mkldnn_sum_inplace_with_cpu_layout(): +def test_dnnl_sum_inplace_with_cpu_layout(): x_shape = (32, 3, 224, 224) x_npy = np.ones(x_shape, dtype='float32') y_shape = (32, 32, 222, 222) @@ -252,7 +252,7 @@ def test_mkldnn_sum_inplace_with_cpu_layout(): x = mx.sym.Variable("x") y = mx.sym.Variable("y") z = mx.symbol.Convolution(data=x, num_filter=32, kernel=(3, 3)) - z = mx.sym.add_n(z, y) # (MKLDNN data, cpu data) + z = mx.sym.add_n(z, y) # (DNNL data, cpu data) exe = z._simple_bind(ctx=mx.cpu(), x=x_shape, y=y_shape) out = exe.forward(is_train=False, x=x_npy, y=y_npy)[0] assert_almost_equal(out[0].asnumpy()[0, 0, 0], 1.0) @@ -501,10 +501,10 @@ def softmax_forward(input_data, true_output): softmax_forward(mx.nd.array([[[[-3.4e38,-3.4e38]]]]), np.array([1.0,1.0])) softmax_forward(mx.nd.array([[[[3.4e38,3.4e38]]]]), np.array([1.0,1.0])) -def 
test_non_mkldnn_fcomputeex(): - # test special case where MKLDNN formatted NDArray feeds into non-mkldnn fcomputeex operator - # conv is example where MKLDNN NDArray is created from regular NDArrays - # CustomOps is example of non-mkldnn fcomputeex operator +def test_non_dnnl_fcomputeex(): + # test special case where DNNL formatted NDArray feeds into non-dnnl fcomputeex operator + # conv is example where DNNL NDArray is created from regular NDArrays + # CustomOps is example of non-dnnl fcomputeex operator @mx.operator.register("custom") class CustomProp(mx.operator.CustomOpProp): diff --git a/tests/python/mkl/test_quantization_mkldnn.py b/tests/python/dnnl/test_quantization_dnnl.py similarity index 100% rename from tests/python/mkl/test_quantization_mkldnn.py rename to tests/python/dnnl/test_quantization_dnnl.py diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py index 4d3ba3c2b822..d5514e4c52fd 100644 --- a/tests/python/gpu/test_gluon_model_zoo_gpu.py +++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py @@ -96,14 +96,14 @@ def get_nn_model(name): else: return get_model(name) -# Seed 1521019752 produced a failure on the Py2 MKLDNN-GPU CI runner +# Seed 1521019752 produced a failure on the Py2 DNNL-GPU CI runner # on 2/16/2018 that was not reproducible. Problem could be timing related or # based on non-deterministic algo selection. @mx.util.use_np @pytest.mark.serial def test_training(): # We use network models without dropout for testing. - # TODO(zhengda) mobilenet can't pass this test even without MKLDNN. + # TODO(zhengda) mobilenet can't pass this test even without DNNL. all_models = ['resnet18_v1', 'densenet121'] batch_size = 10 diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py index a756296c512c..18f632037ec6 100644 --- a/tests/python/gpu/test_kvstore_gpu.py +++ b/tests/python/gpu/test_kvstore_gpu.py @@ -39,8 +39,8 @@ def init_kv_with_str(stype='default', kv_type='local'): return kv # 1. Test seed 89411477 (module seed 1829754103) resulted in a py3-gpu CI runner core dump. -# 2. Test seed 1155716252 (module seed 1032824746) resulted in py3-mkldnn-gpu have error -# src/operator/nn/mkldnn/mkldnn_base.cc:567: Check failed: similar +# 2. Test seed 1155716252 (module seed 1032824746) resulted in py3-dnnl-gpu have error +# src/operator/nn/dnnl/dnnl_base.cc:567: Check failed: similar # Both of them are not reproducible, so this test is back on random seeds. 
@pytest.mark.skipif(mx.context.num_gpus() < 2, reason="test_rsp_push_pull needs more than 1 GPU") @pytest.mark.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/14189") diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 60a11d6508aa..99d7791f8e20 100644 --- a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -45,7 +45,7 @@ def is_test_for_gpu(): return mx.current_context().device_type == 'gpu' -def is_test_for_mkldnn(): +def is_test_for_dnnl(): return (mx.current_context().device_type == 'cpu' and os.environ.get('ENABLE_ONEDNN_QUANTIZATION_TEST') == '1') @@ -216,9 +216,9 @@ def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, dilate, us if is_test_for_native_cpu(): print('skipped testing quantized_conv for native cpu since it is not supported yet') return - elif is_test_for_mkldnn(): + elif is_test_for_dnnl(): # (TODO)Xinyu: https://github.com/apache/incubator-mxnet/issues/16830 - print('skipped testing quantized_conv for mkldnn cpu since it is a flaky case') + print('skipped testing quantized_conv for dnnl cpu since it is a flaky case') return elif qdtype == 'uint8' and is_test_for_gpu(): print('skipped testing quantized_conv for gpu uint8 since it is not supported yet') @@ -699,7 +699,7 @@ def forward(self, x): assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) for qdtype in ['int8', 'uint8']: - if is_test_for_mkldnn(): + if is_test_for_dnnl(): check_quantized_fc((32, 512, 2), 100, False, qdtype, flatten=False) check_quantized_fc((32, 512, 2), 100, True, qdtype, flatten=False) check_quantized_fc((32, 512, 2, 2), 100, False, qdtype, flatten=False) @@ -822,8 +822,8 @@ def check_quantized_act(data_shape, qdtype): if is_test_for_native_cpu(): print('skipped testing quantized_act for native cpu since it is not supported yet') return - elif qdtype == 'int8' and is_test_for_mkldnn(): - print('skipped testing quantized_act for mkldnn cpu int8 since it is not supported yet') + elif qdtype == 'int8' and is_test_for_dnnl(): + print('skipped testing quantized_act for dnnl cpu int8 since it is not supported yet') return elif is_test_for_gpu(): print('skipped testing quantized_act for gpu since it is not supported yet') @@ -1057,8 +1057,8 @@ def skip_not_supported(): if is_test_for_native_cpu(): print('skipped testing quantize_model for native cpu since it is not supported yet') return True - elif qdtype == 'int8' and is_test_for_mkldnn(): - print('skipped testing quantize_model for mkldnn cpu int8 since it is not supported yet') + elif qdtype == 'int8' and is_test_for_dnnl(): + print('skipped testing quantize_model for dnnl cpu int8 since it is not supported yet') return True elif qdtype == 'uint8' and is_test_for_gpu(): print('skipped testing quantize_model for gpu uint8 since it is not supported yet') @@ -1069,8 +1069,8 @@ def check_quantize_model(qdtype): if is_test_for_native_cpu(): print('skipped testing quantize_model for native cpu since it is not supported yet') return - elif qdtype == 'int8' and is_test_for_mkldnn(): - print('skipped testing quantize_model for mkldnn cpu int8 since it is not supported yet') + elif qdtype == 'int8' and is_test_for_dnnl(): + print('skipped testing quantize_model for dnnl cpu int8 since it is not supported yet') return elif qdtype == 'uint8' and is_test_for_gpu(): print('skipped testing quantize_model for gpu uint8 since it is not supported yet') diff --git a/tests/python/unittest/test_numpy_gluon.py 
b/tests/python/unittest/test_numpy_gluon.py index c79b1633fea2..d5971dc5cb43 100644 --- a/tests/python/unittest/test_numpy_gluon.py +++ b/tests/python/unittest/test_numpy_gluon.py @@ -434,7 +434,7 @@ def forward(self, a): out = net(a) b = net.collect_params().pop('d.weight').data() - net.optimize_for(a, b, backend="MKLDNN") + net.optimize_for(a, b, backend="DNNL") out2 = net(a) diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index 10bfb65c1f1b..1d9b78870ef0 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -1907,7 +1907,7 @@ def test_batchnorm_fallback(): @pytest.mark.serial -def test_mkldnn_sparse(): +def test_dnnl_sparse(): # This test is trying to create a race condition describedd in # https://github.com/apache/incubator-mxnet/issues/10189 arr = mx.nd.random.uniform(shape=(10, 10, 32, 32)) diff --git a/tests/tutorials/test_sanity_tutorials.py b/tests/tutorials/test_sanity_tutorials.py index ef31aa7dc612..3cf7f63b32b1 100644 --- a/tests/tutorials/test_sanity_tutorials.py +++ b/tests/tutorials/test_sanity_tutorials.py @@ -33,9 +33,9 @@ 'embedded/index.md', 'embedded/wine_detector.md', 'gluon/index.md', - 'mkldnn/index.md', - 'mkldnn/MKLDNN_README.md', - 'mkldnn/operator_list.md', + 'dnnl/index.md', + 'dnnl/DNNL_README.md', + 'dnnl/operator_list.md', 'nlp/index.md', 'onnx/index.md', 'python/index.md', diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py index 2ebd2f8e92ca..aade365cecaa 100644 --- a/tests/tutorials/test_tutorials.py +++ b/tests/tutorials/test_tutorials.py @@ -220,6 +220,6 @@ def test_amp(): assert _test_tutorial_nb('amp/amp_tutorial') # https://github.com/apache/incubator-mxnet/issues/16181 """ -def test_mkldnn_quantization(): - assert _test_tutorial_nb('mkldnn/mkldnn_quantization') +def test_dnnl_quantization(): + assert _test_tutorial_nb('dnnl/dnnl_quantization') """ diff --git a/tools/license_header.py b/tools/license_header.py index 743b29c5fab5..f93ff7c34c5d 100755 --- a/tools/license_header.py +++ b/tools/license_header.py @@ -134,7 +134,7 @@ 'src/operator/contrib/multi_proposal-inl.h', # Licensed under Apache 2.0 license - 'src/operator/nn/mkldnn/mkldnn_base-inl.h', + 'src/operator/nn/dnnl/dnnl_base-inl.h', # This file 'tools/license_header.py',