diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index 061cc180f383..ec1cf964c36c 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -22,7 +22,7 @@
 import os
 import numpy as np
 import mxnet as mx
-import unittest
+
 from mxnet.test_utils import rand_ndarray, assert_almost_equal
 from mxnet.module import Module
 from mxnet import gluon
@@ -81,9 +81,12 @@ def test_mkldnn_engine_threading():
     with net.name_scope():
         net.add(gluon.nn.Conv2D(channels=32, kernel_size=3, activation=None))
     net.collect_params().initialize(ctx=mx.cpu())
+
     class Dummy(gluon.data.Dataset):
+
         def __len__(self):
             return 2
+
         def __getitem__(self, key):
             return key, np.ones((3, 224, 224)), np.ones((10, ))
 
@@ -104,7 +107,7 @@ def __getitem__(self, key):
 @with_seed()
 def test_mkldnn_reshape():
     def test_reshape_after_conv(dst_shape):
-        shape = (1,1,4,4)
+        shape = (1, 1, 4, 4)
         data = mx.symbol.Variable('data')
         conv = mx.symbol.Convolution(data=data, num_filter=16, kernel=(1, 1), pad=(0, 0), stride=(1, 1))
         res = mx.symbol.reshape(data=conv, shape=dst_shape)
@@ -112,7 +115,7 @@ def test_reshape_after_conv(dst_shape):
         val1 = np.random.uniform(-1, 1, shape)
         val2 = np.random.uniform(-1, 1, (16, 1, 1, 1))
-        val3 = np.random.uniform(-1 ,1, (1))
+        val3 = np.random.uniform(-1, 1, (1))
 
         exe.arg_arrays[0][:] = val1
         exe.arg_arrays[1][:] = val2
@@ -240,9 +243,9 @@ def test_flatten_slice_after_conv():
     data = mx.symbol.Variable('data')
     weight = mx.symbol.Variable('weight')
     bias = mx.symbol.Variable('bias')
-    conv1= mx.symbol.Convolution(data = data, weight=weight, bias=bias, name='conv1', num_filter=64, kernel=(3,3), stride=(1,1))
-    flatten1 = mx.symbol.flatten(data = conv1)
-    slice1 = mx.symbol.slice(data = flatten1, begin=0, end=1)
+    conv1 = mx.symbol.Convolution(data=data, weight=weight, bias=bias, name='conv1', num_filter=64, kernel=(3, 3), stride=(1, 1))
+    flatten1 = mx.symbol.flatten(data=conv1)
+    slice1 = mx.symbol.slice(data=flatten1, begin=0, end=1)
 
     shape = (2, 16, 16, 16)
     val = np.random.rand(2, 16, 16, 16).astype(np.float32)
@@ -270,12 +273,12 @@ def test_mkldnn_sum_with_mkldnn_layout():
         inputs = []
         for n in range(i):
             inputs.append(z)
-        y = mx.sym.add_n(*inputs) # (only MKLDNN data input)
+        y = mx.sym.add_n(*inputs)  # (only MKLDNN data input)
         exe = y.simple_bind(ctx=mx.cpu(), x=x_shape, w=w_shape)
         out = exe.forward(is_train=False, x=x_npy, w=np.ones(w_shape))[0]
-        #conv with kernel (3,3) on ones should give result=27
+        # conv with kernel (3,3) on ones should give result=27
         single_cov = 27.0
-        assert_almost_equal(out[0].asnumpy()[0, 0, 0], single_cov*i)
+        assert_almost_equal(out[0].asnumpy()[0, 0, 0], single_cov * i)
 
 
 def test_mkldnn_sum_inplace_with_cpu_layout():
@@ -286,7 +289,7 @@
     x = mx.sym.Variable("x")
     y = mx.sym.Variable("y")
     z = mx.symbol.Convolution(data=x, num_filter=32, kernel=(3, 3))
-    z = mx.sym.add_n(z, y) # (MKLDNN data, cpu data)
+    z = mx.sym.add_n(z, y)  # (MKLDNN data, cpu data)
     exe = z.simple_bind(ctx=mx.cpu(), x=x_shape, y=y_shape)
     out = exe.forward(is_train=False, x=x_npy, y=y_npy)[0]
     assert_almost_equal(out[0].asnumpy()[0, 0, 0], 1.0)
@@ -484,7 +487,7 @@ def check_Deconvolution_training(stype):
         elif np.array(shape).shape[0] == 5 and stype == "default":
             # Unable to test fallback to native implementation for non-default storage types
             # as 3D deconvolution is not natively supported
-            test = mx.symbol.Deconvolution(data=data, kernel=(3,3,3), stride=(2,2,2), num_filter=4)
+            test = mx.symbol.Deconvolution(data=data, kernel=(3, 3, 3), stride=(2, 2, 2), num_filter=4)
             weight_tmp = np.random.normal(-0.1, 0.1, size=(3, 4, 3, 3, 3))
         else:
             return 0
@@ -542,10 +545,10 @@ def softmax_forward(input_data, true_output):
         nparr = ndarr.asnumpy()
         assert_almost_equal(nparr, true_output, rtol=1e-5, atol=1e-5)
 
-    softmax_forward(mx.nd.array([[[[-1e30,-1e30]]]]), np.array([1.0,1.0]))
-    softmax_forward(mx.nd.array([[[[1e30,1e30]]]]), np.array([1.0,1.0]))
-    softmax_forward(mx.nd.array([[[[-3.4e38,-3.4e38]]]]), np.array([1.0,1.0]))
-    softmax_forward(mx.nd.array([[[[3.4e38,3.4e38]]]]), np.array([1.0,1.0]))
+    softmax_forward(mx.nd.array([[[[-1e30, -1e30]]]]), np.array([1.0, 1.0]))
+    softmax_forward(mx.nd.array([[[[1e30, 1e30]]]]), np.array([1.0, 1.0]))
+    softmax_forward(mx.nd.array([[[[-3.4e38, -3.4e38]]]]), np.array([1.0, 1.0]))
+    softmax_forward(mx.nd.array([[[[3.4e38, 3.4e38]]]]), np.array([1.0, 1.0]))
 
 @with_seed()
 def test_non_mkldnn_fcomputeex():
@@ -586,14 +589,14 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
             self.assign(in_grad[0], req[0], out_grad)
 
     data = mx.symbol.Variable('data')
-    conv = mx.sym.Convolution(data=data, kernel=(5, 5), pad=(1, 1), stride=(1,1), num_filter=8, name="conv", no_bias=True)
+    conv = mx.sym.Convolution(data=data, kernel=(5, 5), pad=(1, 1), stride=(1, 1), num_filter=8, name="conv", no_bias=True)
     custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom')
-    exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])})
+    exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10, 3, 96, 96]), 'conv_weight': mx.nd.ones([8, 3, 5, 5])})
     exec1.forward()[0].wait_to_read()
 
 @with_seed()
 def test_conv_transpose():
-    axes = [(0,2,1,3), (0,2,3,1), (1,2,3,0), (3,2,1,0)]
+    axes = [(0, 2, 1, 3), (0, 2, 3, 1), (1, 2, 3, 0), (3, 2, 1, 0)]
     a = np.random.rand(10, 16, 50, 50)
     b = np.random.rand(32, 16, 3, 3)
     x = mx.nd.array(a)
@@ -617,7 +620,7 @@ def __init__(self, factor):
     def hybrid_forward(self, F, x):
         f1, f2 = self._factors
-        # (N, f1*f2*C, H, W)
+        x = F.reshape(x, (0, -4, -1, f1 * f2, 0, 0))  # (N, C, f1*f2, H, W)
         x = F.reshape(x, (0, 0, -4, f1, f2, 0, 0))  # (N, C, f1, f2, H, W)
         x = F.transpose(x, (0, 1, 4, 2, 5, 3))  # (N, C, H, f1, W, f2)
@@ -655,7 +658,7 @@ def test_weight_async_reorder():
     mod.bind(for_training=False, data_shapes=[('data', (10, 16, 50, 50))])
     mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
     data = [mx.random.uniform(-1.0, 1.0, shape=(10, 16, 50, 50), ctx=mx.current_context())]
-    batch=mx.io.DataBatch(data, [])
+    batch = mx.io.DataBatch(data, [])
     for i in range(2):
         mod.forward(batch, is_train=False)
         for output in mod.get_outputs():
@@ -664,7 +667,7 @@
 @with_seed()
 def test_concat():
     def ref_concat(a, b, axis):
-      return np.concatenate((a, b), axis=axis)
+        return np.concatenate((a, b), axis=axis)
 
     a_sym = mx.sym.Variable("a")
     b_sym = mx.sym.Variable("b")
@@ -673,14 +676,14 @@ def ref_concat(a, b, axis):
         a_shape = tuple(dshape)
         b_shape = tuple(dshape)
         for axis in range(0, 4):
-          z = mx.sym.concat(a_sym, b_sym, dim=axis)
-          a = np.random.uniform(-1, 1, a_shape)
-          b = np.random.uniform(-1, 1, b_shape)
-          exe = z.simple_bind(ctx=mx.cpu(), a=a_shape, b=b_shape)
-          out = exe.forward(is_train=False, a=a, b=b)
-          ref_out = ref_concat(a, b, axis=axis)
-          out = out[0].asnumpy()
-          assert_almost_equal(out, ref_out)
+            z = mx.sym.concat(a_sym, b_sym, dim=axis)
+            a = np.random.uniform(-1, 1, a_shape)
+            b = np.random.uniform(-1, 1, b_shape)
+            exe = z.simple_bind(ctx=mx.cpu(), a=a_shape, b=b_shape)
+            out = exe.forward(is_train=False, a=a, b=b)
+            ref_out = ref_concat(a, b, axis=axis)
+            out = out[0].asnumpy()
+            assert_almost_equal(out, ref_out)
 
 def check_concat_training(stype):
     data_shape = rand_shape_nd(4)
@@ -731,7 +734,7 @@ def calc_output_of_layer(layer):
 @with_seed()
 def test_elemwise_add():
     def ref_add(a, b):
-      return np.add(a, b)
+        return np.add(a, b)
 
     a_sym = mx.sym.Variable("a")
     b_sym = mx.sym.Variable("b")
@@ -768,10 +771,11 @@ def test_rnn():
     STATE_SIZE = [1, 2]
     BATCH_SIZE = [4]
     INPUT_SIZE = [4]
+
     def batch_check(seq_length, state_size, batch_size, input_size):
-        modes_params = [('rnn_relu', mx.np.random.normal(0, 1, ((input_size + state_size + 2)*state_size),)),
-                        ('rnn_tanh', mx.np.random.normal(0, 1, ((input_size + state_size + 2)*state_size),)),
-                        ('gru', mx.np.random.normal(0, 1, ((input_size + state_size + 2)*state_size*3),))
+        modes_params = [('rnn_relu', mx.np.random.normal(0, 1, ((input_size + state_size + 2) * state_size),)),
+                        ('rnn_tanh', mx.np.random.normal(0, 1, ((input_size + state_size + 2) * state_size),)),
+                        ('gru', mx.np.random.normal(0, 1, ((input_size + state_size + 2) * state_size * 3),))
                         ]
         for m, p in modes_params:
             data = mx.np.random.normal(0, 1, (seq_length, batch_size, input_size))
@@ -780,7 +784,7 @@ def batch_check(seq_length, state_size, batch_size, input_size):
             state.attach_grad()
 
             with mx.autograd.record():
-                y = mx.npx.rnn(data=data, parameters=p, mode=m, \
+                y = mx.npx.rnn(data=data, parameters=p, mode=m,
                                state=state, state_size=state_size, num_layers=1)
             assert y.shape == (seq_length, batch_size, state_size)
             assert type(y[0]).__name__ == 'ndarray'