From d5212b418e0302ce22800e2ecd073abd7ecde208 Mon Sep 17 00:00:00 2001
From: Adam Grabowski
Date: Fri, 25 Jun 2021 14:26:08 +0200
Subject: [PATCH] Add new benchmark function for single operator comparison

---
 benchmark/opperf/README.md                |  33 +++++++
 benchmark/opperf/rules/default_params.py  |   5 +
 benchmark/opperf/utils/benchmark_utils.py | 110 +++++++++++++++++++---
 3 files changed, 133 insertions(+), 15 deletions(-)

diff --git a/benchmark/opperf/README.md b/benchmark/opperf/README.md
index bb3fb8ed851b..806d91a8462d 100644
--- a/benchmark/opperf/README.md
+++ b/benchmark/opperf/README.md
@@ -172,6 +172,39 @@ Output for the above benchmark run, on a CPU machine, would look something like
 
 Currently, opperf supports operators in `mx.nd.*` namespace. However, locally, one can profile internal operators in `mx.nd.internal.*` namespace.
 
+## Usecase 6 - Compare performance of a chosen operator from the NDArray library and its Numpy/Numpy_extension counterpart
+For example, to compare the add operator from `mx.nd` and `mx.np`, just run the following Python script:
+
+```
+#!/usr/bin/python
+from benchmark.opperf.utils.benchmark_utils import run_benchmark_operator
+
+run_benchmark_operator(name="add", run_backward=True)
+```
+
+Output for the above benchmark run, on a CPU machine, would look something like below:
+
+```
+
+[{'add': [{'inputs': {'lhs': (128, 128), 'rhs': (128, 128)},
+           'max_storage_mem_alloc_cpu/0': 32.768,
+           'avg_time_forward_add': 0.0496,
+           'avg_time_backward_add': 0.0793}]}]
+
+[{'add': [{'inputs': {'x1': (128, 128), 'x2': (128, 128)},
+           'max_storage_mem_alloc_cpu/0': 32.768,
+           'avg_time_forward_add': 0.0484,
+           'avg_time_backward_add': 0.0898}]}]
+
+```
+This function builds on the `run_performance_test` function described in Usecase 3 and Usecase 4, so all of its parameters can be passed through here as well.
+All arguments of NDArray type are automatically created with the shape passed as `size`.
+If a function requires more arguments or differently shaped NDArrays, provide them via `additional_inputs`, as shown below:
+```
+run_benchmark_operator(name="pick", size=(128, 128), additional_inputs={"index": (128, 1)})
+```
+
+
 #### Changes
 
 Remove the hasattr check for `op.__name__` to be in `mx.nd`
diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py
index 4c903384a030..0474eea08f7b 100644
--- a/benchmark/opperf/rules/default_params.py
+++ b/benchmark/opperf/rules/default_params.py
@@ -781,3 +781,8 @@
                        "grads_sum_sq", "mhs", "data1", "data2", "loc", "parameters", "state", "state_cell"]
 
+PARAMS_OF_TYPE_NP_ARRAY = ["x1", "x2", "prototype", "object", "a", "b", "fill_value", "array", "x", "arr",
+                           "values", "ary", "seq", "arrays", "tup", "indices", "m", "ar", "q", "p", "condition",
+                           "arys", "v", "A", "xp", "fp", "data", "mask", "gamma", "beta", "running_mean",
+                           "running_var", "weight", "index", "lhs", "rhs"]
+
diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py
index b3bf8213569a..38a1c15a6147 100644
--- a/benchmark/opperf/utils/benchmark_utils.py
+++ b/benchmark/opperf/utils/benchmark_utils.py
@@ -16,35 +16,90 @@
 # under the License.
 
 import logging
+import inspect
 import mxnet as mx
 from mxnet import nd
+from mxnet import np
 
 from .ndarray_utils import get_mx_ndarray, nd_forward_and_profile, nd_forward_backward_and_profile
 from .common_utils import merge_map_list
 from .op_registry_utils import prepare_op_inputs
-from benchmark.opperf.rules.default_params import PARAMS_OF_TYPE_NDARRAY
+from benchmark.opperf.rules.default_params import PARAMS_OF_TYPE_NDARRAY, PARAMS_OF_TYPE_NP_ARRAY
 from .profiler_utils import cpp_profile, python_profile
 
 no_backward = {'gather_nd', 'softmax_cross_entropy', 'linalg_gelqf', 'linalg_slogdet', 'moments', 'SequenceLast', 'Embedding'}
 
 
-def _prepare_op_inputs(inputs, run_backward, dtype, ctx):
+def _prepare_op_inputs(inputs, run_backward, dtype, ctx, module):
     mx.random.seed(41)
 
     kwargs_list = []
+    if module == 'mxnet.numpy_extension' or module == 'mxnet.numpy':
+        PARAMS_TYPE = PARAMS_OF_TYPE_NP_ARRAY
+        get_array_fn = get_mx_np_ndarray
+    else:
+        PARAMS_TYPE = PARAMS_OF_TYPE_NDARRAY
+        get_array_fn = get_mx_ndarray
 
     for inp in inputs:
         kwargs = {}
         for key, value in inp.items():
-            if key in PARAMS_OF_TYPE_NDARRAY:
-                kwargs[key] = get_mx_ndarray(ctx=ctx, in_tensor=value,
-                                             dtype=dtype,
-                                             initializer=nd.normal,
-                                             attach_grad=run_backward)
+            if key in PARAMS_TYPE:
+                kwargs[key] = get_array_fn(ctx=ctx, in_tensor=value,
+                                           dtype=dtype,
+                                           initializer=nd.normal,
+                                           attach_grad=run_backward)
             else:
                 kwargs[key] = value
         kwargs_list.append(kwargs)
 
     return kwargs_list
 
 
+def get_mx_np_ndarray(ctx, in_tensor, dtype, initializer, attach_grad=True):
+    """Helper function to prepare an MXNet Numpy NDArray tensor in the given Context (ctx) of type (dtype).
+    You can get a new Tensor by providing only "Shape" or "Numpy NDArray" or another MXNet NDArray as
+    "in_tensor".
+
+    NOTE: This is a sync call and waits for the Tensor to be created.
+
+    Parameters
+    ----------
+    ctx: mx.ctx, default mx.cpu()
+        Context of the new MXNet NDArray Tensor.
+    in_tensor: Numpy NDArray or MXNet NDArray or Tuple of shape
+        Can be a tuple of shape or Numpy NDArray or MXNet NDArray.
+    dtype: str
+        Precision or Dtype of the expected Tensor. Ex: "float32", "Int64"
+    initializer:
+        Function reference to the initializer to use. Ex: mx.nd.random.normal, mx.nd.zeros
+    attach_grad: Boolean, default True
+        To attach a gradient for the Tensor. Default is True.
+
+    Returns
+    -------
+    MXNet Numpy NDArray Tensor.
+    """
+    if isinstance(in_tensor, int) or isinstance(in_tensor, float):
+        return in_tensor
+
+    if isinstance(in_tensor, tuple):
+        nd_ndarray = get_mx_ndarray(ctx=ctx, in_tensor=in_tensor,
+                                    dtype="float32",
+                                    initializer=initializer,
+                                    attach_grad=attach_grad)
+        tensor = nd_ndarray.as_np_ndarray().astype(dtype=dtype)
+    elif isinstance(in_tensor, list):
+        tensor = np.array(in_tensor, ctx=ctx)
+    elif isinstance(in_tensor, nd.NDArray):
+        tensor = in_tensor.as_np_ndarray()
+    elif isinstance(in_tensor, np.ndarray):
+        tensor = in_tensor.as_in_context(ctx)
+    else:
+        raise ValueError("Invalid input type for creating input tensor. Input can be tuple() of shape or Numpy Array or"
+                         " MXNet NDArray. Given - ", in_tensor)
+    if attach_grad:
+        tensor.attach_grad()
+
+    tensor.wait_to_read()
+    return tensor
+
+
 def parse_input_ndarray(input_dict):
     """Parse input for ndarray and extract array shape for better readability
@@ -96,7 +151,7 @@ def parse_input_ndarray(input_dict):
     return no_new_line_input_dict
 
 
-def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list, profiler):
+def _run_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list, profiler):
     if profiler == 'native':
         if run_backward:
             benchmark_helper_func = cpp_profile(nd_forward_backward_and_profile)
@@ -141,10 +196,11 @@ def run_performance_test(ops, inputs, run_backward=True,
     Parameters
     ----------
     ops: [Str]
-        One or list of operators to benchmark. Should be an NDArray operator.
+        One or list of operators to benchmark. Should be an NDArray, Numpy or Numpy_extension operator.
     inputs: map
         Inputs for operator. Key should be name of parameter for operator.
-        Example: inputs = {"lhs": (1024, 1024), "rhs": (1024, 1024)} for mx.nd.add
+        Example: inputs = {"lhs": (1024, 1024), "rhs": (1024, 1024)} for mx.nd.add or
+        inputs = {"x1": (1024, 1024), "x2": (1024, 1024)} for mx.np.add
     run_backward: Boolean, Default is True
         Should we have backward operator benchmarks.
     dtype: Str, default 'float32'
@@ -166,20 +222,44 @@ def run_performance_test(ops, inputs, run_backward=True,
     Note: when run_performance_test is called on the nd.Embedding operator with run_backward=True, an error will be thrown. Track issue here: https://github.com/apache/incubator-mxnet/issues/11314
     """
-    kwargs_list = _prepare_op_inputs(inputs, run_backward, dtype, ctx)
-
     if not isinstance(ops, list):
         ops = [ops]
 
     op_benchmark_result = []
     for op in ops:
-        if hasattr(mx.nd, op.__name__):
-            benchmark_result = _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list, profiler)
+        if hasattr(mx.nd, op.__name__) or hasattr(mx.np, op.__name__) or hasattr(mx.npx, op.__name__):
+            kwargs_list = _prepare_op_inputs(inputs, run_backward, dtype, ctx, op.__module__)
+            benchmark_result = _run_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list, profiler)
         else:
-            raise ValueError("Unknown NDArray operator provided to benchmark. - ", op.__name__)
+            raise ValueError("Unknown {0} operator provided to benchmark. - {1}".format(op.__module__, op.__name__))
         op_benchmark_result.append(benchmark_result)
     return op_benchmark_result
 
 
+def run_benchmark_operator(name, size=(128, 128), additional_inputs={},
+                           dtype='float32', run_backward=False, ctx=mx.cpu(),
+                           warmup=10, runs=50, profiler="native"):
+    arg_list = {mx.nd: PARAMS_OF_TYPE_NDARRAY, mx.np: PARAMS_OF_TYPE_NP_ARRAY, mx.npx: PARAMS_OF_TYPE_NP_ARRAY}
+    modules = [mx.nd, mx.np, mx.npx]
+    responses = []
+    for module in modules:
+        if hasattr(module, name):
+            function = getattr(module, name)
+            args = inspect.getfullargspec(function).args
+            inputs = {}
+            for arg in args:
+                if arg in additional_inputs.keys():
+                    inputs.update({arg: additional_inputs[arg]})
+                elif arg in arg_list[module]:
+                    inputs.update({arg: size})
+            res = run_performance_test(function, run_backward=run_backward, dtype=dtype, ctx=ctx,
+                                       inputs=[inputs],
+                                       warmup=warmup, runs=runs, profiler=profiler)
+            responses.append(res)
+        else:
+            responses.append(str(module.__name__) + " does not have operator " + name)
+    for i in range(len(modules)):
+        print(modules[i].__name__)
+        print(responses[i])
+
+
 def run_op_benchmarks(ops, dtype, ctx, profiler, int64_tensor, warmup, runs):
     # Running im2col either forwards or backwards on GPU results in errors
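
For reviewers who want to try the new entry point, here is a minimal usage sketch (not part of the patch). It assumes MXNet with the `mx.np`/`mx.npx` namespaces is installed and this patch is applied; the operator names and shapes are simply the examples used in the README section above.

```
#!/usr/bin/python
# Minimal usage sketch for the run_benchmark_operator helper added by this patch.
# Assumes MXNet is installed and the patch is applied; operator names and shapes
# below are illustrative only.
from benchmark.opperf.utils.benchmark_utils import run_benchmark_operator

# Benchmark add from mx.nd, mx.np and mx.npx (where available) on 128x128 inputs,
# including the backward pass, and print one result block per namespace.
run_benchmark_operator(name="add", size=(128, 128), run_backward=True)

# Operators that need extra, differently shaped inputs receive them via
# additional_inputs, e.g. pick needs an index array with a single column.
run_benchmark_operator(name="pick", size=(128, 128), additional_inputs={"index": (128, 1)})
```

Each call prints the namespace name followed by the benchmark results, or a note that the namespace does not provide the operator, mirroring the sample output in the README section of this patch.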
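The same comparison can also be driven manually through `run_performance_test`, since `_prepare_op_inputs` now dispatches on the operator's module and builds `mx.np` ndarrays for Numpy-namespace parameter names. A small sketch, again assuming the patch is applied; the shapes are the example values from the updated docstring.

```
#!/usr/bin/python
# Sketch: calling run_performance_test directly with a numpy-namespace operator.
# Assumes MXNet is installed and this patch is applied.
import mxnet as mx
from benchmark.opperf.utils.benchmark_utils import run_performance_test

# mx.np.add takes "x1"/"x2" (listed in PARAMS_OF_TYPE_NP_ARRAY), so its inputs are
# created as mx.np ndarrays; mx.nd.add would use "lhs"/"rhs" and classic NDArrays.
result = run_performance_test(mx.np.add,
                              inputs=[{"x1": (1024, 1024), "x2": (1024, 1024)}],
                              run_backward=True, dtype='float32', ctx=mx.cpu(),
                              warmup=10, runs=50, profiler='native')
print(result)
```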