From a51ba4b3c6d7013b30c6e1bc39e7f8fecac31e9e Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Wed, 10 Oct 2018 14:19:03 +0800
Subject: [PATCH 01/13] add models to cnn benchmark

---
 .../image-classification/benchmark_score.py   | 57 ++++++++++++++++---
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py
index a4118ebcf76b..45283c85e09c 100644
--- a/example/image-classification/benchmark_score.py
+++ b/example/image-classification/benchmark_score.py
@@ -21,26 +21,51 @@
 from common import find_mxnet
 from common.util import get_gpus
 import mxnet as mx
+import mxnet.gluon.model_zoo.vision as models
 from importlib import import_module
 import logging
+import argparse
 import time
 import numpy as np
 logging.basicConfig(level=logging.DEBUG)
 
+parser = argparse.ArgumentParser(description='SymbolAPI-based CNN perf')
+parser.add_argument('--network', type=str, default='all', 
+		                choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50',
+                                 'resnet-152', 'inception-bn', 'inception-v3', 
+                                 'inception-v4', 'inception-resnet-v2', 'mobilenet',
+                                 'densenet-121', 'squeezenet1.1'])
+parser.add_argument('--batch-size', type=int, default=0)
+
+opt = parser.parse_args()
+
 def get_symbol(network, batch_size, dtype):
-    image_shape = (3,299,299) if network == 'inception-v3' else (3,224,224)
+    image_shape = (3,299,299) if network in ['inception-v3', 'inception-v4'] else (3,224,224)
     num_layers = 0
-    if 'resnet' in network:
+    if network == 'inception-resnet-v2':
+        network = network
+    elif 'resnet' in network:
         num_layers = int(network.split('-')[1])
         network = network.split('-')[0]
     if 'vgg' in network:
         num_layers = int(network.split('-')[1])
         network = 'vgg'
-    net = import_module('symbols.'+network)
-    sym = net.get_symbol(num_classes=1000,
-                         image_shape=','.join([str(i) for i in image_shape]),
-                         num_layers=num_layers,
-                         dtype=dtype)
+    if 'densenet' in network:
+        num_layers = int(network.split('-')[1])
+        network = 'densenet' + str(num_layers)
+    if network in ['densenet121', 'squeezenet1.1']:
+        logging.info('network: %s is converted from gluon modelzoo', network)
+        sym = models.get_model(network)
+        sym.hybridize()
+        data = mx.sym.var('data')
+        sym = sym(data)
+        sym = mx.sym.SoftmaxOutput(sym, name='softmax')
+    else:
+        net = import_module('symbols.'+network)
+        sym = net.get_symbol(num_classes=1000,
+                            image_shape=','.join([str(i) for i in image_shape]),
+                            num_layers=num_layers,
+                            dtype=dtype)
     return (sym, [('data', (batch_size,)+image_shape)])
 
 def score(network, dev, batch_size, num_batches, dtype):
@@ -69,12 +94,26 @@ def score(network, dev, batch_size, num_batches, dtype):
     return num_batches*batch_size/(time.time() - tic)
 
 if __name__ == '__main__':
-    networks = ['alexnet', 'vgg-16', 'inception-bn', 'inception-v3', 'resnetv1-50', 'resnet-50', 'resnet-152']
+    if opt.network == 'all':
+        networks = ['alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50',
+                    'resnet-152', 'inception-bn', 'inception-v3', 
+                    'inception-v4', 'inception-resnet-v2', 
+                    'mobilenet', 'densenet-121', 'squeezenet1.1']
+        logging.info('It may take some time to run all models, '
+                     'set --network to run a specific one')
+    else:
+        networks = [opt.network]
     devs = [mx.gpu(0)] if len(get_gpus()) > 0 else []
     # Enable USE_MKLDNN for better CPU performance
     devs.append(mx.cpu())
 
-    batch_sizes = [1, 2, 4, 8, 16, 32]
+    if opt.batch_size == 0:
+        batch_sizes = [1, 2, 4, 8, 16, 32]
+        logging.info('run batchsize [1 ,2 , 4, 8, 16, 32] by default, '
+                     'set --batch-size to run a specific one')
+    else:
+        batch_sizes = [opt.batch_size]
+
     for net in networks:
         logging.info('network: %s', net)
         for d in devs:

From 6be4b378be26abe139f52e81c9c5803a59082e21 Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Wed, 10 Oct 2018 15:19:13 +0800
Subject: [PATCH 02/13] improve benchmark score

---
 example/image-classification/benchmark_score.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py
index 45283c85e09c..c823f181bef5 100644
--- a/example/image-classification/benchmark_score.py
+++ b/example/image-classification/benchmark_score.py
@@ -34,7 +34,7 @@
 		                choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50',
                                  'resnet-152', 'inception-bn', 'inception-v3', 
                                  'inception-v4', 'inception-resnet-v2', 'mobilenet',
-                                 'densenet-121', 'squeezenet1.1'])
+                                 'densenet121', 'squeezenet1.1'])
 parser.add_argument('--batch-size', type=int, default=0)
 
 opt = parser.parse_args()
@@ -50,11 +50,7 @@ def get_symbol(network, batch_size, dtype):
     if 'vgg' in network:
         num_layers = int(network.split('-')[1])
         network = 'vgg'
-    if 'densenet' in network:
-        num_layers = int(network.split('-')[1])
-        network = 'densenet' + str(num_layers)
     if network in ['densenet121', 'squeezenet1.1']:
-        logging.info('network: %s is converted from gluon modelzoo', network)
         sym = models.get_model(network)
         sym.hybridize()
         data = mx.sym.var('data')
@@ -98,7 +94,7 @@ def score(network, dev, batch_size, num_batches, dtype):
         networks = ['alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50',
                     'resnet-152', 'inception-bn', 'inception-v3', 
                     'inception-v4', 'inception-resnet-v2', 
-                    'mobilenet', 'densenet-121', 'squeezenet1.1']
+                    'mobilenet', 'densenet121', 'squeezenet1.1']
         logging.info('It may take some time to run all models, '
                      'set --network to run a specific one')
     else:
@@ -116,6 +112,9 @@ def score(network, dev, batch_size, num_batches, dtype):
 
     for net in networks:
         logging.info('network: %s', net)
+        if net in ['densenet121', 'squeezenet1.1']:
+            logging.info('network: %s is converted from gluon modelzoo', net)
+            logging.info('you can run example/gluon/benchmark_gluon.py for more models')
         for d in devs:
             logging.info('device: %s', d)
             logged_fp16_warning = False

From 1c583dfe12daba32ab8d2df5f9e57386f898b1db Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Wed, 10 Oct 2018 15:59:03 +0800
Subject: [PATCH 03/13] add benchmark_gluon

---
 example/gluon/benchmark_gluon.py | 148 +++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 example/gluon/benchmark_gluon.py

diff --git a/example/gluon/benchmark_gluon.py b/example/gluon/benchmark_gluon.py
new file mode 100644
index 000000000000..22164edb768c
--- /dev/null
+++ b/example/gluon/benchmark_gluon.py
@@ -0,0 +1,148 @@
+import mxnet as mx
+import mxnet.gluon.model_zoo.vision as models
+import time
+import logging
+import argparse
+import subprocess
+import os
+import errno
+
+logging.basicConfig(level=logging.INFO)
+parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf')
+
+parser.add_argument('--model', type=str, default='all', 
+                                         choices=['all', 'alexnet', 'densenet121', 'densenet161', 
+                                        'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25',
+                                        'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25',
+                                        'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1',
+                                        'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', 
+                                        'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 
+                                        'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 
+                                        'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 
+                                        'vgg19', 'vgg19_bn'])
+parser.add_argument('--batch-size', type=int, default=0)
+parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf'])
+
+opt = parser.parse_args()
+
+num_batches = 100
+dry_run = 10  # use 10 iterations to warm up
+batch_inf = [1, 16, 32, 64, 128, 256]
+batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256]
+image_shapes = [(3, 224, 224), (3, 299, 299)]
+
+def get_gpus():
+    """
+    return a list of GPUs
+    """
+    try:
+        re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True)
+    except OSError:
+        return []
+    return range(len([i for i in re.split('\n') if 'GPU' in i]))
+
+def score(network, batch_size, ctx):
+    net = models.get_model(network)
+    if 'inceptionv3' == network:
+        data_shape = [('data', (batch_size,) + image_shapes[1])]
+    else:
+        data_shape = [('data', (batch_size,) + image_shapes[0])]
+
+    net.hybridize()
+    data = mx.sym.var('data')
+    out = net(data)
+    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
+    mod = mx.mod.Module(softmax, context=ctx)
+    mod.bind(for_training     = False,
+                inputs_need_grad = False,
+                data_shapes      = data_shape)
+    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
+    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
+    batch = mx.io.DataBatch(data, [])
+    for i in range(dry_run + num_batches):
+        if i == dry_run:
+            tic = time.time()
+        mod.forward(batch, is_train=False)
+        for output in mod.get_outputs():
+            output.wait_to_read()
+    fwd = time.time() - tic
+    return fwd
+
+
+def train(network, batch_size, ctx):
+    net = models.get_model(network)
+    if 'inceptionv3' == network:
+        data_shape = [('data', (batch_size,) + image_shapes[1])]
+    else:
+        data_shape = [('data', (batch_size,) + image_shapes[0])]
+
+    net.hybridize()
+    data = mx.sym.var('data')
+    out = net(data)
+    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
+    mod = mx.mod.Module(softmax, context=ctx)
+    mod.bind(for_training     = True,
+                inputs_need_grad = False,
+                data_shapes      = data_shape)
+    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
+    mod.init_optimizer(kvstore='local', optimizer='sgd')
+    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
+    batch = mx.io.DataBatch(data, [])
+    for i in range(dry_run + num_batches):
+        if i == dry_run:
+            tic = time.time()
+        mod.forward(batch, is_train=True)
+        for output in mod.get_outputs():
+            output.wait_to_read()
+        mod.backward()
+        mod.update()
+    bwd = time.time() - tic
+    return bwd
+
+if __name__ == '__main__':
+    runtype = opt.type
+    bs = opt.batch_size
+
+    if opt.model == 'all':
+        networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201',
+	                'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75',
+                    'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75',
+	                'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2',
+                    'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 
+                    'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', 
+                    'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']
+        logging.info('It may take some time to run all models, '
+                     'set --network to run a specific one')
+    else:
+        networks = [opt.model]
+
+    devs = [mx.gpu(0)] if len(get_gpus()) > 0 else []
+    # Enable USE_MKLDNN for better CPU performance
+    devs.append(mx.cpu())
+
+    for network in networks:
+        logging.info('network: %s', network)
+        for d in devs:
+            logging.info('device: %s', d)
+            if runtype == 'inf' or runtype == 'all':
+                if bs != 0:
+                    fwd_time = score(network, bs, d)
+                    fps = (bs*num_batches)/fwd_time
+                    logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps)
+                else:
+                    for batch_size in batch_inf:
+                        fwd_time = score(network, batch_size, d)
+                        fps = (batch_size * num_batches) / fwd_time
+                        logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps)
+            if runtype == 'train' or runtype == 'all':
+                if bs != 0:
+                    bwd_time = train(network, bs, d)
+                    fps = (bs*num_batches)/bwd_time
+                    logging.info(network + ' training perf for BS %d is %f img/s', bs, fps)
+                else:
+                    for batch_size in batch_train:
+                        bwd_time = train(network, batch_size, d)
+                        fps = (batch_size * num_batches) / bwd_time
+                        logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps)
+
+

From a3d79266ae38b4f1b1324eb1c2c03f48e627e7e2 Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Wed, 10 Oct 2018 16:50:23 +0800
Subject: [PATCH 04/13] improve lint

---
 example/gluon/benchmark_gluon.py              | 28 +++++++++----------
 .../image-classification/benchmark_score.py   |  8 +++---
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/example/gluon/benchmark_gluon.py b/example/gluon/benchmark_gluon.py
index 22164edb768c..d1ee9313b654 100644
--- a/example/gluon/benchmark_gluon.py
+++ b/example/gluon/benchmark_gluon.py
@@ -12,14 +12,14 @@
 
 parser.add_argument('--model', type=str, default='all', 
                                          choices=['all', 'alexnet', 'densenet121', 'densenet161', 
-                                        'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25',
-                                        'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25',
-                                        'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1',
-                                        'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', 
-                                        'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 
-                                        'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 
-                                        'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 
-                                        'vgg19', 'vgg19_bn'])
+                                                  'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25',
+                                                  'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25',
+                                                  'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1',
+                                                  'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', 
+                                                  'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 
+                                                  'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 
+                                                  'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 
+                                                  'vgg19', 'vgg19_bn'])
 parser.add_argument('--batch-size', type=int, default=0)
 parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf'])
 
@@ -54,8 +54,8 @@ def score(network, batch_size, ctx):
     softmax = mx.sym.SoftmaxOutput(out, name='softmax')
     mod = mx.mod.Module(softmax, context=ctx)
     mod.bind(for_training     = False,
-                inputs_need_grad = False,
-                data_shapes      = data_shape)
+             inputs_need_grad = False,
+             data_shapes      = data_shape)
     mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
     data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
     batch = mx.io.DataBatch(data, [])
@@ -82,8 +82,8 @@ def train(network, batch_size, ctx):
     softmax = mx.sym.SoftmaxOutput(out, name='softmax')
     mod = mx.mod.Module(softmax, context=ctx)
     mod.bind(for_training     = True,
-                inputs_need_grad = False,
-                data_shapes      = data_shape)
+             inputs_need_grad = False,
+             data_shapes      = data_shape)
     mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
     mod.init_optimizer(kvstore='local', optimizer='sgd')
     data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
@@ -105,9 +105,9 @@ def train(network, batch_size, ctx):
 
     if opt.model == 'all':
         networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201',
-	                'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75',
+                    'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75',
                     'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75',
-	                'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2',
+                    'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2',
                     'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 
                     'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', 
                     'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']
diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py
index c823f181bef5..d4aef913a944 100644
--- a/example/image-classification/benchmark_score.py
+++ b/example/image-classification/benchmark_score.py
@@ -31,10 +31,10 @@
 
 parser = argparse.ArgumentParser(description='SymbolAPI-based CNN perf')
 parser.add_argument('--network', type=str, default='all', 
-		                choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50',
-                                 'resnet-152', 'inception-bn', 'inception-v3', 
-                                 'inception-v4', 'inception-resnet-v2', 'mobilenet',
-                                 'densenet121', 'squeezenet1.1'])
+                                 choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50',
+                                          'resnet-152', 'inception-bn', 'inception-v3', 
+                                          'inception-v4', 'inception-resnet-v2', 'mobilenet',
+                                          'densenet121', 'squeezenet1.1'])
 parser.add_argument('--batch-size', type=int, default=0)
 
 opt = parser.parse_args()

From 1c2fd17b1281902b2e6f84e5db3112e408fca093 Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Wed, 10 Oct 2018 16:53:06 +0800
Subject: [PATCH 05/13] improve lint

---
 example/gluon/benchmark_gluon.py               | 18 +++++++++---------
 .../image-classification/benchmark_score.py    |  6 +++---
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/example/gluon/benchmark_gluon.py b/example/gluon/benchmark_gluon.py
index d1ee9313b654..314aefad9a60 100644
--- a/example/gluon/benchmark_gluon.py
+++ b/example/gluon/benchmark_gluon.py
@@ -11,15 +11,15 @@
 parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf')
 
 parser.add_argument('--model', type=str, default='all', 
-                                         choices=['all', 'alexnet', 'densenet121', 'densenet161', 
-                                                  'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25',
-                                                  'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25',
-                                                  'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1',
-                                                  'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', 
-                                                  'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 
-                                                  'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 
-                                                  'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 
-                                                  'vgg19', 'vgg19_bn'])
+                               choices=['all', 'alexnet', 'densenet121', 'densenet161', 
+                                        'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25',
+                                        'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25',
+                                        'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1',
+                                        'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', 
+                                        'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 
+                                        'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 
+                                        'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 
+                                        'vgg19', 'vgg19_bn'])
 parser.add_argument('--batch-size', type=int, default=0)
 parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf'])
 
diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py
index d4aef913a944..b3c1bb065e20 100644
--- a/example/image-classification/benchmark_score.py
+++ b/example/image-classification/benchmark_score.py
@@ -59,9 +59,9 @@ def get_symbol(network, batch_size, dtype):
     else:
         net = import_module('symbols.'+network)
         sym = net.get_symbol(num_classes=1000,
-                            image_shape=','.join([str(i) for i in image_shape]),
-                            num_layers=num_layers,
-                            dtype=dtype)
+                             image_shape=','.join([str(i) for i in image_shape]),
+                             num_layers=num_layers,
+                             dtype=dtype)
     return (sym, [('data', (batch_size,)+image_shape)])
 
 def score(network, dev, batch_size, num_batches, dtype):

From 6725cd1582679c75d66aef0d2d60e9b91fd0262f Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Wed, 10 Oct 2018 17:10:38 +0800
Subject: [PATCH 06/13] add license for script

---
 example/gluon/benchmark_gluon.py | 313 ++++++++++++++++---------------
 1 file changed, 165 insertions(+), 148 deletions(-)

diff --git a/example/gluon/benchmark_gluon.py b/example/gluon/benchmark_gluon.py
index 314aefad9a60..71caad236add 100644
--- a/example/gluon/benchmark_gluon.py
+++ b/example/gluon/benchmark_gluon.py
@@ -1,148 +1,165 @@
-import mxnet as mx
-import mxnet.gluon.model_zoo.vision as models
-import time
-import logging
-import argparse
-import subprocess
-import os
-import errno
-
-logging.basicConfig(level=logging.INFO)
-parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf')
-
-parser.add_argument('--model', type=str, default='all', 
-                               choices=['all', 'alexnet', 'densenet121', 'densenet161', 
-                                        'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25',
-                                        'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25',
-                                        'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1',
-                                        'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', 
-                                        'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 
-                                        'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 
-                                        'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 
-                                        'vgg19', 'vgg19_bn'])
-parser.add_argument('--batch-size', type=int, default=0)
-parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf'])
-
-opt = parser.parse_args()
-
-num_batches = 100
-dry_run = 10  # use 10 iterations to warm up
-batch_inf = [1, 16, 32, 64, 128, 256]
-batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256]
-image_shapes = [(3, 224, 224), (3, 299, 299)]
-
-def get_gpus():
-    """
-    return a list of GPUs
-    """
-    try:
-        re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True)
-    except OSError:
-        return []
-    return range(len([i for i in re.split('\n') if 'GPU' in i]))
-
-def score(network, batch_size, ctx):
-    net = models.get_model(network)
-    if 'inceptionv3' == network:
-        data_shape = [('data', (batch_size,) + image_shapes[1])]
-    else:
-        data_shape = [('data', (batch_size,) + image_shapes[0])]
-
-    net.hybridize()
-    data = mx.sym.var('data')
-    out = net(data)
-    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
-    mod = mx.mod.Module(softmax, context=ctx)
-    mod.bind(for_training     = False,
-             inputs_need_grad = False,
-             data_shapes      = data_shape)
-    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
-    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
-    batch = mx.io.DataBatch(data, [])
-    for i in range(dry_run + num_batches):
-        if i == dry_run:
-            tic = time.time()
-        mod.forward(batch, is_train=False)
-        for output in mod.get_outputs():
-            output.wait_to_read()
-    fwd = time.time() - tic
-    return fwd
-
-
-def train(network, batch_size, ctx):
-    net = models.get_model(network)
-    if 'inceptionv3' == network:
-        data_shape = [('data', (batch_size,) + image_shapes[1])]
-    else:
-        data_shape = [('data', (batch_size,) + image_shapes[0])]
-
-    net.hybridize()
-    data = mx.sym.var('data')
-    out = net(data)
-    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
-    mod = mx.mod.Module(softmax, context=ctx)
-    mod.bind(for_training     = True,
-             inputs_need_grad = False,
-             data_shapes      = data_shape)
-    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
-    mod.init_optimizer(kvstore='local', optimizer='sgd')
-    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
-    batch = mx.io.DataBatch(data, [])
-    for i in range(dry_run + num_batches):
-        if i == dry_run:
-            tic = time.time()
-        mod.forward(batch, is_train=True)
-        for output in mod.get_outputs():
-            output.wait_to_read()
-        mod.backward()
-        mod.update()
-    bwd = time.time() - tic
-    return bwd
-
-if __name__ == '__main__':
-    runtype = opt.type
-    bs = opt.batch_size
-
-    if opt.model == 'all':
-        networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201',
-                    'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75',
-                    'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75',
-                    'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2',
-                    'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 
-                    'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', 
-                    'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']
-        logging.info('It may take some time to run all models, '
-                     'set --network to run a specific one')
-    else:
-        networks = [opt.model]
-
-    devs = [mx.gpu(0)] if len(get_gpus()) > 0 else []
-    # Enable USE_MKLDNN for better CPU performance
-    devs.append(mx.cpu())
-
-    for network in networks:
-        logging.info('network: %s', network)
-        for d in devs:
-            logging.info('device: %s', d)
-            if runtype == 'inf' or runtype == 'all':
-                if bs != 0:
-                    fwd_time = score(network, bs, d)
-                    fps = (bs*num_batches)/fwd_time
-                    logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps)
-                else:
-                    for batch_size in batch_inf:
-                        fwd_time = score(network, batch_size, d)
-                        fps = (batch_size * num_batches) / fwd_time
-                        logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps)
-            if runtype == 'train' or runtype == 'all':
-                if bs != 0:
-                    bwd_time = train(network, bs, d)
-                    fps = (bs*num_batches)/bwd_time
-                    logging.info(network + ' training perf for BS %d is %f img/s', bs, fps)
-                else:
-                    for batch_size in batch_train:
-                        bwd_time = train(network, batch_size, d)
-                        fps = (batch_size * num_batches) / bwd_time
-                        logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps)
-
-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+import mxnet.gluon.model_zoo.vision as models
+import time
+import logging
+import argparse
+import subprocess
+import os
+import errno
+
+logging.basicConfig(level=logging.INFO)
+parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf')
+
+parser.add_argument('--model', type=str, default='all',
+                               choices=['all', 'alexnet', 'densenet121', 'densenet161',
+                                        'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25',
+                                        'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25',
+                                        'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1',
+                                        'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1',
+                                        'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1',
+                                        'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11',
+                                        'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
+                                        'vgg19', 'vgg19_bn'])
+parser.add_argument('--batch-size', type=int, default=0)
+parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf'])
+
+opt = parser.parse_args()
+
+num_batches = 100
+dry_run = 10  # use 10 iterations to warm up
+batch_inf = [1, 16, 32, 64, 128, 256]
+batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256]
+image_shapes = [(3, 224, 224), (3, 299, 299)]
+
+def get_gpus():
+    """
+    return a list of GPUs
+    """
+    try:
+        re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True)
+    except OSError:
+        return []
+    return range(len([i for i in re.split('\n') if 'GPU' in i]))
+
+def score(network, batch_size, ctx):
+    net = models.get_model(network)
+    if 'inceptionv3' == network:
+        data_shape = [('data', (batch_size,) + image_shapes[1])]
+    else:
+        data_shape = [('data', (batch_size,) + image_shapes[0])]
+
+    net.hybridize()
+    data = mx.sym.var('data')
+    out = net(data)
+    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
+    mod = mx.mod.Module(softmax, context=ctx)
+    mod.bind(for_training     = False,
+             inputs_need_grad = False,
+             data_shapes      = data_shape)
+    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
+    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
+    batch = mx.io.DataBatch(data, [])
+    for i in range(dry_run + num_batches):
+        if i == dry_run:
+            tic = time.time()
+        mod.forward(batch, is_train=False)
+        for output in mod.get_outputs():
+            output.wait_to_read()
+    fwd = time.time() - tic
+    return fwd
+
+
+def train(network, batch_size, ctx):
+    net = models.get_model(network)
+    if 'inceptionv3' == network:
+        data_shape = [('data', (batch_size,) + image_shapes[1])]
+    else:
+        data_shape = [('data', (batch_size,) + image_shapes[0])]
+
+    net.hybridize()
+    data = mx.sym.var('data')
+    out = net(data)
+    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
+    mod = mx.mod.Module(softmax, context=ctx)
+    mod.bind(for_training     = True,
+             inputs_need_grad = False,
+             data_shapes      = data_shape)
+    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
+    mod.init_optimizer(kvstore='local', optimizer='sgd')
+    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
+    batch = mx.io.DataBatch(data, [])
+    for i in range(dry_run + num_batches):
+        if i == dry_run:
+            tic = time.time()
+        mod.forward(batch, is_train=True)
+        for output in mod.get_outputs():
+            output.wait_to_read()
+        mod.backward()
+        mod.update()
+    bwd = time.time() - tic
+    return bwd
+
+if __name__ == '__main__':
+    runtype = opt.type
+    bs = opt.batch_size
+
+    if opt.model == 'all':
+        networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201',
+                    'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75',
+                    'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75',
+                    'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2',
+                    'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1',
+                    'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13',
+                    'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']
+        logging.info('It may take some time to run all models, '
+                     'set --network to run a specific one')
+    else:
+        networks = [opt.model]
+
+    devs = [mx.gpu(0)] if len(get_gpus()) > 0 else []
+    # Enable USE_MKLDNN for better CPU performance
+    devs.append(mx.cpu())
+
+    for network in networks:
+        logging.info('network: %s', network)
+        for d in devs:
+            logging.info('device: %s', d)
+            if runtype == 'inf' or runtype == 'all':
+                if bs != 0:
+                    fwd_time = score(network, bs, d)
+                    fps = (bs*num_batches)/fwd_time
+                    logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps)
+                else:
+                    for batch_size in batch_inf:
+                        fwd_time = score(network, batch_size, d)
+                        fps = (batch_size * num_batches) / fwd_time
+                        logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps)
+            if runtype == 'train' or runtype == 'all':
+                if bs != 0:
+                    bwd_time = train(network, bs, d)
+                    fps = (bs*num_batches)/bwd_time
+                    logging.info(network + ' training perf for BS %d is %f img/s', bs, fps)
+                else:
+                    for batch_size in batch_train:
+                        bwd_time = train(network, batch_size, d)
+                        fps = (batch_size * num_batches) / bwd_time
+                        logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps)
+
+

From 1cb52906714da5a054bdff22904e35f58a22687f Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Wed, 10 Oct 2018 22:32:49 +0800
Subject: [PATCH 07/13] improve script lint

---
 example/image-classification/benchmark_score.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py
index b3c1bb065e20..eb2b0dc17e60 100644
--- a/example/image-classification/benchmark_score.py
+++ b/example/image-classification/benchmark_score.py
@@ -105,7 +105,7 @@ def score(network, dev, batch_size, num_batches, dtype):
 
     if opt.batch_size == 0:
         batch_sizes = [1, 2, 4, 8, 16, 32]
-        logging.info('run batchsize [1 ,2 , 4, 8, 16, 32] by default, '
+        logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, '
                      'set --batch-size to run a specific one')
     else:
         batch_sizes = [opt.batch_size]

From 0b17d6fb420ef80c3427cad21306839c14cac88e Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Fri, 12 Oct 2018 00:17:14 +0800
Subject: [PATCH 08/13] mv benchmark_gluon to new location

---
 {example => benchmark/python}/gluon/benchmark_gluon.py | 0
 example/image-classification/benchmark_score.py        | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename {example => benchmark/python}/gluon/benchmark_gluon.py (100%)

diff --git a/example/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py
similarity index 100%
rename from example/gluon/benchmark_gluon.py
rename to benchmark/python/gluon/benchmark_gluon.py
diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py
index eb2b0dc17e60..c46cf25ae385 100644
--- a/example/image-classification/benchmark_score.py
+++ b/example/image-classification/benchmark_score.py
@@ -114,7 +114,7 @@ def score(network, dev, batch_size, num_batches, dtype):
         logging.info('network: %s', net)
         if net in ['densenet121', 'squeezenet1.1']:
             logging.info('network: %s is converted from gluon modelzoo', net)
-            logging.info('you can run example/gluon/benchmark_gluon.py for more models')
+            logging.info('you can run benchmark/python/gluon/benchmark_gluon.py for more models')
         for d in devs:
             logging.info('device: %s', d)
             logged_fp16_warning = False

From d5e8a5dd9d9929ef259af717b492023df6f836ef Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Fri, 12 Oct 2018 00:19:17 +0800
Subject: [PATCH 09/13] support multi-gpus

---
 benchmark/python/gluon/benchmark_gluon.py | 90 ++++++++++++-----------
 1 file changed, 46 insertions(+), 44 deletions(-)

diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py
index 71caad236add..0a114644c47f 100644
--- a/benchmark/python/gluon/benchmark_gluon.py
+++ b/benchmark/python/gluon/benchmark_gluon.py
@@ -25,7 +25,7 @@
 import errno
 
 logging.basicConfig(level=logging.INFO)
-parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf')
+parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf benchmark')
 
 parser.add_argument('--model', type=str, default='all',
                                choices=['all', 'alexnet', 'densenet121', 'densenet161',
@@ -38,26 +38,19 @@
                                         'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
                                         'vgg19', 'vgg19_bn'])
 parser.add_argument('--batch-size', type=int, default=0)
-parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf'])
+parser.add_argument('--num-batches', type=int, default=10)
+parser.add_argument('--gpus', type=str, default='',
+                    help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.')
+parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference'])
 
 opt = parser.parse_args()
 
-num_batches = 100
+num_batches = opt.num_batches
 dry_run = 10  # use 10 iterations to warm up
 batch_inf = [1, 16, 32, 64, 128, 256]
 batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256]
 image_shapes = [(3, 224, 224), (3, 299, 299)]
 
-def get_gpus():
-    """
-    return a list of GPUs
-    """
-    try:
-        re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True)
-    except OSError:
-        return []
-    return range(len([i for i in re.split('\n') if 'GPU' in i]))
-
 def score(network, batch_size, ctx):
     net = models.get_model(network)
     if 'inceptionv3' == network:
@@ -74,7 +67,10 @@ def score(network, batch_size, ctx):
              inputs_need_grad = False,
              data_shapes      = data_shape)
     mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
-    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
+    if mx.cpu() in ctx:
+        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes]
+    elif mx.gpu(0) in ctx:
+        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.gpu()) for _, shape in mod.data_shapes]
     batch = mx.io.DataBatch(data, [])
     for i in range(dry_run + num_batches):
         if i == dry_run:
@@ -102,8 +98,14 @@ def train(network, batch_size, ctx):
              inputs_need_grad = False,
              data_shapes      = data_shape)
     mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
-    mod.init_optimizer(kvstore='local', optimizer='sgd')
-    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes]
+    if len(ctx) > 1:
+        mod.init_optimizer(kvstore='device', optimizer='sgd')
+    else:
+        mod.init_optimizer(kvstore='local', optimizer='sgd')
+    if mx.cpu() in ctx:
+        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes]
+    elif mx.gpu(0) in ctx:
+        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.gpu()) for _, shape in mod.data_shapes]
     batch = mx.io.DataBatch(data, [])
     for i in range(dry_run + num_batches):
         if i == dry_run:
@@ -132,34 +134,34 @@ def train(network, batch_size, ctx):
                      'set --network to run a specific one')
     else:
         networks = [opt.model]
-
-    devs = [mx.gpu(0)] if len(get_gpus()) > 0 else []
-    # Enable USE_MKLDNN for better CPU performance
-    devs.append(mx.cpu())
+    
+    devs = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()]
+    num_gpus = len(devs)
 
     for network in networks:
         logging.info('network: %s', network)
-        for d in devs:
-            logging.info('device: %s', d)
-            if runtype == 'inf' or runtype == 'all':
-                if bs != 0:
-                    fwd_time = score(network, bs, d)
-                    fps = (bs*num_batches)/fwd_time
-                    logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps)
-                else:
-                    for batch_size in batch_inf:
-                        fwd_time = score(network, batch_size, d)
-                        fps = (batch_size * num_batches) / fwd_time
-                        logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps)
-            if runtype == 'train' or runtype == 'all':
-                if bs != 0:
-                    bwd_time = train(network, bs, d)
-                    fps = (bs*num_batches)/bwd_time
-                    logging.info(network + ' training perf for BS %d is %f img/s', bs, fps)
-                else:
-                    for batch_size in batch_train:
-                        bwd_time = train(network, batch_size, d)
-                        fps = (batch_size * num_batches) / bwd_time
-                        logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps)
-
-
+        logging.info('device: %s', devs)
+        if runtype == 'inference' or runtype == 'all':
+            if bs != 0:
+                batch_sizes = bs * max(1, num_gpus)
+                fwd_time = score(network, batch_sizes, devs)
+                fps = (batch_sizes * num_batches)/fwd_time
+                logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps)
+            else:
+                for batch_size in batch_inf:
+                    batch_sizes = batch_size * max(1, num_gpus)
+                    fwd_time = score(network, batch_sizes, devs)
+                    fps = (batch_sizes * num_batches) / fwd_time
+                    logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps)
+        if runtype == 'training' or runtype == 'all':
+            if bs != 0:
+                batch_sizes = bs * max(1, num_gpus)
+                bwd_time = train(network, batch_sizes, devs)
+                fps = (batch_sizes * num_batches) / bwd_time
+                logging.info(network + ' training perf for BS %d is %f img/s', bs, fps)
+            else:
+                for batch_size in batch_train:
+                    batch_sizes = batch_size * max(1, num_gpus)
+                    bwd_time = train(network, batch_sizes, devs)
+                    fps = (batch_sizes * num_batches) / bwd_time
+                    logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps)

From 129802536f25ea8ec6839e646e1e6c742d9e2394 Mon Sep 17 00:00:00 2001
From: Shufan <shufan@mxnet-mac.sh.intel.com>
Date: Sun, 14 Oct 2018 18:12:38 +0800
Subject: [PATCH 10/13] Add a new parameter 'global batchsize' for the batch
 size multiplication for multi-gpu case

---
 benchmark/python/gluon/benchmark_gluon.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py
index 0a114644c47f..980c879129b3 100644
--- a/benchmark/python/gluon/benchmark_gluon.py
+++ b/benchmark/python/gluon/benchmark_gluon.py
@@ -41,11 +41,14 @@
 parser.add_argument('--num-batches', type=int, default=10)
 parser.add_argument('--gpus', type=str, default='',
                     help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.')
+parser.add_argument('--global-batchsize', type=bool, default=True,
+                    help='for multi-gpu case, the batchsize will not be multiplied with GPU number if set true.') 
 parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference'])
 
 opt = parser.parse_args()
 
 num_batches = opt.num_batches
+global_bs = opt.global_batchsize
 dry_run = 10  # use 10 iterations to warm up
 batch_inf = [1, 16, 32, 64, 128, 256]
 batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256]
@@ -143,25 +146,37 @@ def train(network, batch_size, ctx):
         logging.info('device: %s', devs)
         if runtype == 'inference' or runtype == 'all':
             if bs != 0:
-                batch_sizes = bs * max(1, num_gpus)
+                if not global_bs:
+                    batch_sizes = bs * max(1, num_gpus)
+                else:
+                    batch_sizes = bs
                 fwd_time = score(network, batch_sizes, devs)
                 fps = (batch_sizes * num_batches)/fwd_time
                 logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps)
             else:
                 for batch_size in batch_inf:
-                    batch_sizes = batch_size * max(1, num_gpus)
+                    if not global_bs:
+                        batch_sizes = batch_size * max(1, num_gpus)
+                    else:
+                        batch_sizes = batch_size
                     fwd_time = score(network, batch_sizes, devs)
                     fps = (batch_sizes * num_batches) / fwd_time
                     logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps)
         if runtype == 'training' or runtype == 'all':
             if bs != 0:
-                batch_sizes = bs * max(1, num_gpus)
+                if not global_bs:
+                    batch_sizes = bs * max(1, num_gpus)
+                else:
+                    batch_sizes = bs
                 bwd_time = train(network, batch_sizes, devs)
                 fps = (batch_sizes * num_batches) / bwd_time
                 logging.info(network + ' training perf for BS %d is %f img/s', bs, fps)
             else:
                 for batch_size in batch_train:
-                    batch_sizes = batch_size * max(1, num_gpus)
+                    if not global_bs:
+                        batch_sizes = batch_size * max(1, num_gpus)
+                    else:
+                        batch_sizes = batch_size
                     bwd_time = train(network, batch_sizes, devs)
                     fps = (batch_sizes * num_batches) / bwd_time
                     logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps)

From 39834113a1933ac93728ce827fab55adc2eb2411 Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Mon, 15 Oct 2018 13:37:44 +0800
Subject: [PATCH 11/13] add batch size argument help

---
 benchmark/python/gluon/benchmark_gluon.py       | 9 +++++++--
 example/image-classification/benchmark_score.py | 5 +++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py
index 980c879129b3..7943e05dbc81 100644
--- a/benchmark/python/gluon/benchmark_gluon.py
+++ b/benchmark/python/gluon/benchmark_gluon.py
@@ -25,7 +25,7 @@
 import errno
 
 logging.basicConfig(level=logging.INFO)
-parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf benchmark')
+parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN performance benchmark')
 
 parser.add_argument('--model', type=str, default='all',
                                choices=['all', 'alexnet', 'densenet121', 'densenet161',
@@ -37,7 +37,8 @@
                                         'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11',
                                         'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
                                         'vgg19', 'vgg19_bn'])
-parser.add_argument('--batch-size', type=int, default=0)
+parser.add_argument('--batch-size', type=int, default=0,
+                     help='run batch size [1, 2, 4, 8, 16, 32] by default')
 parser.add_argument('--num-batches', type=int, default=10)
 parser.add_argument('--gpus', type=str, default='',
                     help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.')
@@ -154,6 +155,8 @@ def train(network, batch_size, ctx):
                 fps = (batch_sizes * num_batches)/fwd_time
                 logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps)
             else:
+                logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, '
+                             'set --batch-size to run a specific one')
                 for batch_size in batch_inf:
                     if not global_bs:
                         batch_sizes = batch_size * max(1, num_gpus)
@@ -172,6 +175,8 @@ def train(network, batch_size, ctx):
                 fps = (batch_sizes * num_batches) / bwd_time
                 logging.info(network + ' training perf for BS %d is %f img/s', bs, fps)
             else:
+                logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, '
+                             'set --batch-size to run a specific one')
                 for batch_size in batch_train:
                     if not global_bs:
                         batch_sizes = batch_size * max(1, num_gpus)
diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py
index c46cf25ae385..3cdb1adb5f6e 100644
--- a/example/image-classification/benchmark_score.py
+++ b/example/image-classification/benchmark_score.py
@@ -29,13 +29,14 @@
 import numpy as np
 logging.basicConfig(level=logging.DEBUG)
 
-parser = argparse.ArgumentParser(description='SymbolAPI-based CNN perf')
+parser = argparse.ArgumentParser(description='SymbolAPI-based CNN inference performance benchmark')
 parser.add_argument('--network', type=str, default='all', 
                                  choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50',
                                           'resnet-152', 'inception-bn', 'inception-v3', 
                                           'inception-v4', 'inception-resnet-v2', 'mobilenet',
                                           'densenet121', 'squeezenet1.1'])
-parser.add_argument('--batch-size', type=int, default=0)
+parser.add_argument('--batch-size', type=int, default=0,
+                     help='run batch size [1, 2, 4, 8, 16, 32] by default')
 
 opt = parser.parse_args()
 

From 68e80095d4afb1acc8ab1d75241be5eb6ff12fa3 Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Tue, 16 Oct 2018 13:13:28 +0800
Subject: [PATCH 12/13] improve help and change default batchsize

---
 benchmark/python/gluon/benchmark_gluon.py       | 15 ++++++++++-----
 example/image-classification/benchmark_score.py |  7 ++++---
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py
index 7943e05dbc81..4e029e9442ad 100644
--- a/benchmark/python/gluon/benchmark_gluon.py
+++ b/benchmark/python/gluon/benchmark_gluon.py
@@ -38,12 +38,17 @@
                                         'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
                                         'vgg19', 'vgg19_bn'])
 parser.add_argument('--batch-size', type=int, default=0,
-                     help='run batch size [1, 2, 4, 8, 16, 32] by default')
+                     help='Batch size to use for benchmarking. Example: 32, 64, 128. '
+                          'By default, runs benchmark for batch sizes - 1, 32, 64, 128, 256')
 parser.add_argument('--num-batches', type=int, default=10)
 parser.add_argument('--gpus', type=str, default='',
-                    help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.')
+                    help='GPU IDs to use for this benchmark task. Example: --gpus=0,1,2,3 to use 4 GPUs.'
+                         'By default, use CPU only.')
 parser.add_argument('--global-batchsize', type=bool, default=True,
-                    help='for multi-gpu case, the batchsize will not be multiplied with GPU number if set true.') 
+                    help='Optional. Set this to True if batch-size should be used as is.'
+                         'Example: If --batch-size=64, --gpus=0,1. With --global-batchsize=True,'
+                         'each of the 2 GPUs will get 64/2 = 32 samples per batch.'
+                         'With --global-batchsize=False, each of the 2 GPUs will get 64 samples per batch.') 
 parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference'])
 
 opt = parser.parse_args()
@@ -51,8 +56,8 @@
 num_batches = opt.num_batches
 global_bs = opt.global_batchsize
 dry_run = 10  # use 10 iterations to warm up
-batch_inf = [1, 16, 32, 64, 128, 256]
-batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256]
+batch_inf = [1, 32, 64, 128, 256]
+batch_train = [1, 32, 64, 128, 256]
 image_shapes = [(3, 224, 224), (3, 299, 299)]
 
 def score(network, batch_size, ctx):
diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py
index 3cdb1adb5f6e..e81a30bd6439 100644
--- a/example/image-classification/benchmark_score.py
+++ b/example/image-classification/benchmark_score.py
@@ -36,7 +36,8 @@
                                           'inception-v4', 'inception-resnet-v2', 'mobilenet',
                                           'densenet121', 'squeezenet1.1'])
 parser.add_argument('--batch-size', type=int, default=0,
-                     help='run batch size [1, 2, 4, 8, 16, 32] by default')
+                     help='Batch size to use for benchmarking. Example: 32, 64, 128. '
+                          'By default, runs benchmark for batch sizes - 1, 32, 64, 128, 256')
 
 opt = parser.parse_args()
 
@@ -105,8 +106,8 @@ def score(network, dev, batch_size, num_batches, dtype):
     devs.append(mx.cpu())
 
     if opt.batch_size == 0:
-        batch_sizes = [1, 2, 4, 8, 16, 32]
-        logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, '
+        batch_sizes = [1, 32, 64, 128, 256]
+        logging.info('run batchsize [1, 32, 64, 128, 256] by default, '
                      'set --batch-size to run a specific one')
     else:
         batch_sizes = [opt.batch_size]

From e21c8f5594b656fa69a7d54ddb4c5b53617be5d1 Mon Sep 17 00:00:00 2001
From: xinyu-intel <xinyu1.chen@intel.com>
Date: Fri, 19 Oct 2018 11:04:00 +0800
Subject: [PATCH 13/13] simplify benchmark_gluon

---
 benchmark/python/gluon/benchmark_gluon.py | 52 ++++++-----------------
 1 file changed, 12 insertions(+), 40 deletions(-)

diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py
index 4e029e9442ad..3dbb36404d07 100644
--- a/benchmark/python/gluon/benchmark_gluon.py
+++ b/benchmark/python/gluon/benchmark_gluon.py
@@ -44,30 +44,24 @@
 parser.add_argument('--gpus', type=str, default='',
                     help='GPU IDs to use for this benchmark task. Example: --gpus=0,1,2,3 to use 4 GPUs.'
                          'By default, use CPU only.')
-parser.add_argument('--global-batchsize', type=bool, default=True,
-                    help='Optional. Set this to True if batch-size should be used as is.'
-                         'Example: If --batch-size=64, --gpus=0,1. With --global-batchsize=True,'
-                         'each of the 2 GPUs will get 64/2 = 32 samples per batch.'
-                         'With --global-batchsize=False, each of the 2 GPUs will get 64 samples per batch.') 
 parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference'])
 
 opt = parser.parse_args()
 
 num_batches = opt.num_batches
-global_bs = opt.global_batchsize
 dry_run = 10  # use 10 iterations to warm up
 batch_inf = [1, 32, 64, 128, 256]
 batch_train = [1, 32, 64, 128, 256]
 image_shapes = [(3, 224, 224), (3, 299, 299)]
 
 def score(network, batch_size, ctx):
+    assert (batch_size >= len(ctx)), "ERROR: batch size should not be smaller than num of GPUs."
     net = models.get_model(network)
     if 'inceptionv3' == network:
         data_shape = [('data', (batch_size,) + image_shapes[1])]
     else:
         data_shape = [('data', (batch_size,) + image_shapes[0])]
 
-    net.hybridize()
     data = mx.sym.var('data')
     out = net(data)
     softmax = mx.sym.SoftmaxOutput(out, name='softmax')
@@ -76,10 +70,7 @@ def score(network, batch_size, ctx):
              inputs_need_grad = False,
              data_shapes      = data_shape)
     mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
-    if mx.cpu() in ctx:
-        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes]
-    elif mx.gpu(0) in ctx:
-        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.gpu()) for _, shape in mod.data_shapes]
+    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx[0]) for _, shape in mod.data_shapes]
     batch = mx.io.DataBatch(data, [])
     for i in range(dry_run + num_batches):
         if i == dry_run:
@@ -92,13 +83,13 @@ def score(network, batch_size, ctx):
 
 
 def train(network, batch_size, ctx):
+    assert (batch_size >= len(ctx)), "ERROR: batch size should not be smaller than num of GPUs."
     net = models.get_model(network)
     if 'inceptionv3' == network:
         data_shape = [('data', (batch_size,) + image_shapes[1])]
     else:
         data_shape = [('data', (batch_size,) + image_shapes[0])]
 
-    net.hybridize()
     data = mx.sym.var('data')
     out = net(data)
     softmax = mx.sym.SoftmaxOutput(out, name='softmax')
@@ -111,10 +102,7 @@ def train(network, batch_size, ctx):
         mod.init_optimizer(kvstore='device', optimizer='sgd')
     else:
         mod.init_optimizer(kvstore='local', optimizer='sgd')
-    if mx.cpu() in ctx:
-        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes]
-    elif mx.gpu(0) in ctx:
-        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.gpu()) for _, shape in mod.data_shapes]
+    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx[0]) for _, shape in mod.data_shapes]
     batch = mx.io.DataBatch(data, [])
     for i in range(dry_run + num_batches):
         if i == dry_run:
@@ -152,41 +140,25 @@ def train(network, batch_size, ctx):
         logging.info('device: %s', devs)
         if runtype == 'inference' or runtype == 'all':
             if bs != 0:
-                if not global_bs:
-                    batch_sizes = bs * max(1, num_gpus)
-                else:
-                    batch_sizes = bs
-                fwd_time = score(network, batch_sizes, devs)
-                fps = (batch_sizes * num_batches)/fwd_time
+                fwd_time = score(network, bs, devs)
+                fps = (bs * num_batches)/fwd_time
                 logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps)
             else:
                 logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, '
                              'set --batch-size to run a specific one')
                 for batch_size in batch_inf:
-                    if not global_bs:
-                        batch_sizes = batch_size * max(1, num_gpus)
-                    else:
-                        batch_sizes = batch_size
-                    fwd_time = score(network, batch_sizes, devs)
-                    fps = (batch_sizes * num_batches) / fwd_time
+                    fwd_time = score(network, batch_size, devs)
+                    fps = (batch_size * num_batches) / fwd_time
                     logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps)
         if runtype == 'training' or runtype == 'all':
             if bs != 0:
-                if not global_bs:
-                    batch_sizes = bs * max(1, num_gpus)
-                else:
-                    batch_sizes = bs
-                bwd_time = train(network, batch_sizes, devs)
-                fps = (batch_sizes * num_batches) / bwd_time
+                bwd_time = train(network, bs, devs)
+                fps = (bs * num_batches) / bwd_time
                 logging.info(network + ' training perf for BS %d is %f img/s', bs, fps)
             else:
                 logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, '
                              'set --batch-size to run a specific one')
                 for batch_size in batch_train:
-                    if not global_bs:
-                        batch_sizes = batch_size * max(1, num_gpus)
-                    else:
-                        batch_sizes = batch_size
-                    bwd_time = train(network, batch_sizes, devs)
-                    fps = (batch_sizes * num_batches) / bwd_time
+                    bwd_time = train(network, batch_size, devs)
+                    fps = (batch_size * num_batches) / bwd_time
                     logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps)