From a51ba4b3c6d7013b30c6e1bc39e7f8fecac31e9e Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Wed, 10 Oct 2018 14:19:03 +0800 Subject: [PATCH 01/13] add models to cnn benchmark --- .../image-classification/benchmark_score.py | 57 ++++++++++++++++--- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index a4118ebcf76b..45283c85e09c 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -21,26 +21,51 @@ from common import find_mxnet from common.util import get_gpus import mxnet as mx +import mxnet.gluon.model_zoo.vision as models from importlib import import_module import logging +import argparse import time import numpy as np logging.basicConfig(level=logging.DEBUG) +parser = argparse.ArgumentParser(description='SymbolAPI-based CNN perf') +parser.add_argument('--network', type=str, default='all', + choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', + 'resnet-152', 'inception-bn', 'inception-v3', + 'inception-v4', 'inception-resnet-v2', 'mobilenet', + 'densenet-121', 'squeezenet1.1']) +parser.add_argument('--batch-size', type=int, default=0) + +opt = parser.parse_args() + def get_symbol(network, batch_size, dtype): - image_shape = (3,299,299) if network == 'inception-v3' else (3,224,224) + image_shape = (3,299,299) if network in ['inception-v3', 'inception-v4'] else (3,224,224) num_layers = 0 - if 'resnet' in network: + if network == 'inception-resnet-v2': + network = network + elif 'resnet' in network: num_layers = int(network.split('-')[1]) network = network.split('-')[0] if 'vgg' in network: num_layers = int(network.split('-')[1]) network = 'vgg' - net = import_module('symbols.'+network) - sym = net.get_symbol(num_classes=1000, - image_shape=','.join([str(i) for i in image_shape]), - num_layers=num_layers, - dtype=dtype) + if 'densenet' in network: + num_layers = int(network.split('-')[1]) + network = 'densenet' + str(num_layers) + if network in ['densenet121', 'squeezenet1.1']: + logging.info('network: %s is converted from gluon modelzoo', network) + sym = models.get_model(network) + sym.hybridize() + data = mx.sym.var('data') + sym = sym(data) + sym = mx.sym.SoftmaxOutput(sym, name='softmax') + else: + net = import_module('symbols.'+network) + sym = net.get_symbol(num_classes=1000, + image_shape=','.join([str(i) for i in image_shape]), + num_layers=num_layers, + dtype=dtype) return (sym, [('data', (batch_size,)+image_shape)]) def score(network, dev, batch_size, num_batches, dtype): @@ -69,12 +94,26 @@ def score(network, dev, batch_size, num_batches, dtype): return num_batches*batch_size/(time.time() - tic) if __name__ == '__main__': - networks = ['alexnet', 'vgg-16', 'inception-bn', 'inception-v3', 'resnetv1-50', 'resnet-50', 'resnet-152'] + if opt.network == 'all': + networks = ['alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', + 'resnet-152', 'inception-bn', 'inception-v3', + 'inception-v4', 'inception-resnet-v2', + 'mobilenet', 'densenet-121', 'squeezenet1.1'] + logging.info('It may take some time to run all models, ' + 'set --network to run a specific one') + else: + networks = [opt.network] devs = [mx.gpu(0)] if len(get_gpus()) > 0 else [] # Enable USE_MKLDNN for better CPU performance devs.append(mx.cpu()) - batch_sizes = [1, 2, 4, 8, 16, 32] + if opt.batch_size == 0: + batch_sizes = [1, 2, 4, 8, 16, 32] + logging.info('run batchsize [1 ,2 , 4, 8, 16, 32] by default, ' + 'set --batch-size to run a specific one') + else: + batch_sizes = [opt.batch_size] + for net in networks: logging.info('network: %s', net) for d in devs: From 6be4b378be26abe139f52e81c9c5803a59082e21 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Wed, 10 Oct 2018 15:19:13 +0800 Subject: [PATCH 02/13] improve benchmark score --- example/image-classification/benchmark_score.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index 45283c85e09c..c823f181bef5 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -34,7 +34,7 @@ choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', 'resnet-152', 'inception-bn', 'inception-v3', 'inception-v4', 'inception-resnet-v2', 'mobilenet', - 'densenet-121', 'squeezenet1.1']) + 'densenet121', 'squeezenet1.1']) parser.add_argument('--batch-size', type=int, default=0) opt = parser.parse_args() @@ -50,11 +50,7 @@ def get_symbol(network, batch_size, dtype): if 'vgg' in network: num_layers = int(network.split('-')[1]) network = 'vgg' - if 'densenet' in network: - num_layers = int(network.split('-')[1]) - network = 'densenet' + str(num_layers) if network in ['densenet121', 'squeezenet1.1']: - logging.info('network: %s is converted from gluon modelzoo', network) sym = models.get_model(network) sym.hybridize() data = mx.sym.var('data') @@ -98,7 +94,7 @@ def score(network, dev, batch_size, num_batches, dtype): networks = ['alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', 'resnet-152', 'inception-bn', 'inception-v3', 'inception-v4', 'inception-resnet-v2', - 'mobilenet', 'densenet-121', 'squeezenet1.1'] + 'mobilenet', 'densenet121', 'squeezenet1.1'] logging.info('It may take some time to run all models, ' 'set --network to run a specific one') else: @@ -116,6 +112,9 @@ def score(network, dev, batch_size, num_batches, dtype): for net in networks: logging.info('network: %s', net) + if net in ['densenet121', 'squeezenet1.1']: + logging.info('network: %s is converted from gluon modelzoo', net) + logging.info('you can run example/gluon/benchmark_gluon.py for more models') for d in devs: logging.info('device: %s', d) logged_fp16_warning = False From 1c583dfe12daba32ab8d2df5f9e57386f898b1db Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Wed, 10 Oct 2018 15:59:03 +0800 Subject: [PATCH 03/13] add benchmark_gluon --- example/gluon/benchmark_gluon.py | 148 +++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 example/gluon/benchmark_gluon.py diff --git a/example/gluon/benchmark_gluon.py b/example/gluon/benchmark_gluon.py new file mode 100644 index 000000000000..22164edb768c --- /dev/null +++ b/example/gluon/benchmark_gluon.py @@ -0,0 +1,148 @@ +import mxnet as mx +import mxnet.gluon.model_zoo.vision as models +import time +import logging +import argparse +import subprocess +import os +import errno + +logging.basicConfig(level=logging.INFO) +parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf') + +parser.add_argument('--model', type=str, default='all', + choices=['all', 'alexnet', 'densenet121', 'densenet161', + 'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25', + 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', + 'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1', + 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', + 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', + 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', + 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', + 'vgg19', 'vgg19_bn']) +parser.add_argument('--batch-size', type=int, default=0) +parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf']) + +opt = parser.parse_args() + +num_batches = 100 +dry_run = 10 # use 10 iterations to warm up +batch_inf = [1, 16, 32, 64, 128, 256] +batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256] +image_shapes = [(3, 224, 224), (3, 299, 299)] + +def get_gpus(): + """ + return a list of GPUs + """ + try: + re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True) + except OSError: + return [] + return range(len([i for i in re.split('\n') if 'GPU' in i])) + +def score(network, batch_size, ctx): + net = models.get_model(network) + if 'inceptionv3' == network: + data_shape = [('data', (batch_size,) + image_shapes[1])] + else: + data_shape = [('data', (batch_size,) + image_shapes[0])] + + net.hybridize() + data = mx.sym.var('data') + out = net(data) + softmax = mx.sym.SoftmaxOutput(out, name='softmax') + mod = mx.mod.Module(softmax, context=ctx) + mod.bind(for_training = False, + inputs_need_grad = False, + data_shapes = data_shape) + mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] + batch = mx.io.DataBatch(data, []) + for i in range(dry_run + num_batches): + if i == dry_run: + tic = time.time() + mod.forward(batch, is_train=False) + for output in mod.get_outputs(): + output.wait_to_read() + fwd = time.time() - tic + return fwd + + +def train(network, batch_size, ctx): + net = models.get_model(network) + if 'inceptionv3' == network: + data_shape = [('data', (batch_size,) + image_shapes[1])] + else: + data_shape = [('data', (batch_size,) + image_shapes[0])] + + net.hybridize() + data = mx.sym.var('data') + out = net(data) + softmax = mx.sym.SoftmaxOutput(out, name='softmax') + mod = mx.mod.Module(softmax, context=ctx) + mod.bind(for_training = True, + inputs_need_grad = False, + data_shapes = data_shape) + mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) + mod.init_optimizer(kvstore='local', optimizer='sgd') + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] + batch = mx.io.DataBatch(data, []) + for i in range(dry_run + num_batches): + if i == dry_run: + tic = time.time() + mod.forward(batch, is_train=True) + for output in mod.get_outputs(): + output.wait_to_read() + mod.backward() + mod.update() + bwd = time.time() - tic + return bwd + +if __name__ == '__main__': + runtype = opt.type + bs = opt.batch_size + + if opt.model == 'all': + networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201', + 'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75', + 'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75', + 'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', + 'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', + 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', + 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn'] + logging.info('It may take some time to run all models, ' + 'set --network to run a specific one') + else: + networks = [opt.model] + + devs = [mx.gpu(0)] if len(get_gpus()) > 0 else [] + # Enable USE_MKLDNN for better CPU performance + devs.append(mx.cpu()) + + for network in networks: + logging.info('network: %s', network) + for d in devs: + logging.info('device: %s', d) + if runtype == 'inf' or runtype == 'all': + if bs != 0: + fwd_time = score(network, bs, d) + fps = (bs*num_batches)/fwd_time + logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) + else: + for batch_size in batch_inf: + fwd_time = score(network, batch_size, d) + fps = (batch_size * num_batches) / fwd_time + logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps) + if runtype == 'train' or runtype == 'all': + if bs != 0: + bwd_time = train(network, bs, d) + fps = (bs*num_batches)/bwd_time + logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) + else: + for batch_size in batch_train: + bwd_time = train(network, batch_size, d) + fps = (batch_size * num_batches) / bwd_time + logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps) + + From a3d79266ae38b4f1b1324eb1c2c03f48e627e7e2 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Wed, 10 Oct 2018 16:50:23 +0800 Subject: [PATCH 04/13] improve lint --- example/gluon/benchmark_gluon.py | 28 +++++++++---------- .../image-classification/benchmark_score.py | 8 +++--- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/example/gluon/benchmark_gluon.py b/example/gluon/benchmark_gluon.py index 22164edb768c..d1ee9313b654 100644 --- a/example/gluon/benchmark_gluon.py +++ b/example/gluon/benchmark_gluon.py @@ -12,14 +12,14 @@ parser.add_argument('--model', type=str, default='all', choices=['all', 'alexnet', 'densenet121', 'densenet161', - 'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25', - 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', - 'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1', - 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', - 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', - 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', - 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', - 'vgg19', 'vgg19_bn']) + 'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25', + 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', + 'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1', + 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', + 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', + 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', + 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', + 'vgg19', 'vgg19_bn']) parser.add_argument('--batch-size', type=int, default=0) parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf']) @@ -54,8 +54,8 @@ def score(network, batch_size, ctx): softmax = mx.sym.SoftmaxOutput(out, name='softmax') mod = mx.mod.Module(softmax, context=ctx) mod.bind(for_training = False, - inputs_need_grad = False, - data_shapes = data_shape) + inputs_need_grad = False, + data_shapes = data_shape) mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] batch = mx.io.DataBatch(data, []) @@ -82,8 +82,8 @@ def train(network, batch_size, ctx): softmax = mx.sym.SoftmaxOutput(out, name='softmax') mod = mx.mod.Module(softmax, context=ctx) mod.bind(for_training = True, - inputs_need_grad = False, - data_shapes = data_shape) + inputs_need_grad = False, + data_shapes = data_shape) mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) mod.init_optimizer(kvstore='local', optimizer='sgd') data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] @@ -105,9 +105,9 @@ def train(network, batch_size, ctx): if opt.model == 'all': networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201', - 'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75', + 'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75', - 'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', + 'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn'] diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index c823f181bef5..d4aef913a944 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -31,10 +31,10 @@ parser = argparse.ArgumentParser(description='SymbolAPI-based CNN perf') parser.add_argument('--network', type=str, default='all', - choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', - 'resnet-152', 'inception-bn', 'inception-v3', - 'inception-v4', 'inception-resnet-v2', 'mobilenet', - 'densenet121', 'squeezenet1.1']) + choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', + 'resnet-152', 'inception-bn', 'inception-v3', + 'inception-v4', 'inception-resnet-v2', 'mobilenet', + 'densenet121', 'squeezenet1.1']) parser.add_argument('--batch-size', type=int, default=0) opt = parser.parse_args() From 1c2fd17b1281902b2e6f84e5db3112e408fca093 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Wed, 10 Oct 2018 16:53:06 +0800 Subject: [PATCH 05/13] improve lint --- example/gluon/benchmark_gluon.py | 18 +++++++++--------- .../image-classification/benchmark_score.py | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/example/gluon/benchmark_gluon.py b/example/gluon/benchmark_gluon.py index d1ee9313b654..314aefad9a60 100644 --- a/example/gluon/benchmark_gluon.py +++ b/example/gluon/benchmark_gluon.py @@ -11,15 +11,15 @@ parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf') parser.add_argument('--model', type=str, default='all', - choices=['all', 'alexnet', 'densenet121', 'densenet161', - 'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25', - 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', - 'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1', - 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', - 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', - 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', - 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', - 'vgg19', 'vgg19_bn']) + choices=['all', 'alexnet', 'densenet121', 'densenet161', + 'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25', + 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', + 'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1', + 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', + 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', + 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', + 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', + 'vgg19', 'vgg19_bn']) parser.add_argument('--batch-size', type=int, default=0) parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf']) diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index d4aef913a944..b3c1bb065e20 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -59,9 +59,9 @@ def get_symbol(network, batch_size, dtype): else: net = import_module('symbols.'+network) sym = net.get_symbol(num_classes=1000, - image_shape=','.join([str(i) for i in image_shape]), - num_layers=num_layers, - dtype=dtype) + image_shape=','.join([str(i) for i in image_shape]), + num_layers=num_layers, + dtype=dtype) return (sym, [('data', (batch_size,)+image_shape)]) def score(network, dev, batch_size, num_batches, dtype): From 6725cd1582679c75d66aef0d2d60e9b91fd0262f Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Wed, 10 Oct 2018 17:10:38 +0800 Subject: [PATCH 06/13] add licsence for script --- example/gluon/benchmark_gluon.py | 313 ++++++++++++++++--------------- 1 file changed, 165 insertions(+), 148 deletions(-) diff --git a/example/gluon/benchmark_gluon.py b/example/gluon/benchmark_gluon.py index 314aefad9a60..71caad236add 100644 --- a/example/gluon/benchmark_gluon.py +++ b/example/gluon/benchmark_gluon.py @@ -1,148 +1,165 @@ -import mxnet as mx -import mxnet.gluon.model_zoo.vision as models -import time -import logging -import argparse -import subprocess -import os -import errno - -logging.basicConfig(level=logging.INFO) -parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf') - -parser.add_argument('--model', type=str, default='all', - choices=['all', 'alexnet', 'densenet121', 'densenet161', - 'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25', - 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', - 'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1', - 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', - 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', - 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', - 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', - 'vgg19', 'vgg19_bn']) -parser.add_argument('--batch-size', type=int, default=0) -parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf']) - -opt = parser.parse_args() - -num_batches = 100 -dry_run = 10 # use 10 iterations to warm up -batch_inf = [1, 16, 32, 64, 128, 256] -batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256] -image_shapes = [(3, 224, 224), (3, 299, 299)] - -def get_gpus(): - """ - return a list of GPUs - """ - try: - re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True) - except OSError: - return [] - return range(len([i for i in re.split('\n') if 'GPU' in i])) - -def score(network, batch_size, ctx): - net = models.get_model(network) - if 'inceptionv3' == network: - data_shape = [('data', (batch_size,) + image_shapes[1])] - else: - data_shape = [('data', (batch_size,) + image_shapes[0])] - - net.hybridize() - data = mx.sym.var('data') - out = net(data) - softmax = mx.sym.SoftmaxOutput(out, name='softmax') - mod = mx.mod.Module(softmax, context=ctx) - mod.bind(for_training = False, - inputs_need_grad = False, - data_shapes = data_shape) - mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) - for i in range(dry_run + num_batches): - if i == dry_run: - tic = time.time() - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - fwd = time.time() - tic - return fwd - - -def train(network, batch_size, ctx): - net = models.get_model(network) - if 'inceptionv3' == network: - data_shape = [('data', (batch_size,) + image_shapes[1])] - else: - data_shape = [('data', (batch_size,) + image_shapes[0])] - - net.hybridize() - data = mx.sym.var('data') - out = net(data) - softmax = mx.sym.SoftmaxOutput(out, name='softmax') - mod = mx.mod.Module(softmax, context=ctx) - mod.bind(for_training = True, - inputs_need_grad = False, - data_shapes = data_shape) - mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - mod.init_optimizer(kvstore='local', optimizer='sgd') - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) - for i in range(dry_run + num_batches): - if i == dry_run: - tic = time.time() - mod.forward(batch, is_train=True) - for output in mod.get_outputs(): - output.wait_to_read() - mod.backward() - mod.update() - bwd = time.time() - tic - return bwd - -if __name__ == '__main__': - runtype = opt.type - bs = opt.batch_size - - if opt.model == 'all': - networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201', - 'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75', - 'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75', - 'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', - 'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', - 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', - 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn'] - logging.info('It may take some time to run all models, ' - 'set --network to run a specific one') - else: - networks = [opt.model] - - devs = [mx.gpu(0)] if len(get_gpus()) > 0 else [] - # Enable USE_MKLDNN for better CPU performance - devs.append(mx.cpu()) - - for network in networks: - logging.info('network: %s', network) - for d in devs: - logging.info('device: %s', d) - if runtype == 'inf' or runtype == 'all': - if bs != 0: - fwd_time = score(network, bs, d) - fps = (bs*num_batches)/fwd_time - logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) - else: - for batch_size in batch_inf: - fwd_time = score(network, batch_size, d) - fps = (batch_size * num_batches) / fwd_time - logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps) - if runtype == 'train' or runtype == 'all': - if bs != 0: - bwd_time = train(network, bs, d) - fps = (bs*num_batches)/bwd_time - logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) - else: - for batch_size in batch_train: - bwd_time = train(network, batch_size, d) - fps = (batch_size * num_batches) / bwd_time - logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps) - - +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +import mxnet.gluon.model_zoo.vision as models +import time +import logging +import argparse +import subprocess +import os +import errno + +logging.basicConfig(level=logging.INFO) +parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf') + +parser.add_argument('--model', type=str, default='all', + choices=['all', 'alexnet', 'densenet121', 'densenet161', + 'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25', + 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', + 'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1', + 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', + 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', + 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', + 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', + 'vgg19', 'vgg19_bn']) +parser.add_argument('--batch-size', type=int, default=0) +parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf']) + +opt = parser.parse_args() + +num_batches = 100 +dry_run = 10 # use 10 iterations to warm up +batch_inf = [1, 16, 32, 64, 128, 256] +batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256] +image_shapes = [(3, 224, 224), (3, 299, 299)] + +def get_gpus(): + """ + return a list of GPUs + """ + try: + re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True) + except OSError: + return [] + return range(len([i for i in re.split('\n') if 'GPU' in i])) + +def score(network, batch_size, ctx): + net = models.get_model(network) + if 'inceptionv3' == network: + data_shape = [('data', (batch_size,) + image_shapes[1])] + else: + data_shape = [('data', (batch_size,) + image_shapes[0])] + + net.hybridize() + data = mx.sym.var('data') + out = net(data) + softmax = mx.sym.SoftmaxOutput(out, name='softmax') + mod = mx.mod.Module(softmax, context=ctx) + mod.bind(for_training = False, + inputs_need_grad = False, + data_shapes = data_shape) + mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] + batch = mx.io.DataBatch(data, []) + for i in range(dry_run + num_batches): + if i == dry_run: + tic = time.time() + mod.forward(batch, is_train=False) + for output in mod.get_outputs(): + output.wait_to_read() + fwd = time.time() - tic + return fwd + + +def train(network, batch_size, ctx): + net = models.get_model(network) + if 'inceptionv3' == network: + data_shape = [('data', (batch_size,) + image_shapes[1])] + else: + data_shape = [('data', (batch_size,) + image_shapes[0])] + + net.hybridize() + data = mx.sym.var('data') + out = net(data) + softmax = mx.sym.SoftmaxOutput(out, name='softmax') + mod = mx.mod.Module(softmax, context=ctx) + mod.bind(for_training = True, + inputs_need_grad = False, + data_shapes = data_shape) + mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) + mod.init_optimizer(kvstore='local', optimizer='sgd') + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] + batch = mx.io.DataBatch(data, []) + for i in range(dry_run + num_batches): + if i == dry_run: + tic = time.time() + mod.forward(batch, is_train=True) + for output in mod.get_outputs(): + output.wait_to_read() + mod.backward() + mod.update() + bwd = time.time() - tic + return bwd + +if __name__ == '__main__': + runtype = opt.type + bs = opt.batch_size + + if opt.model == 'all': + networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201', + 'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75', + 'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75', + 'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', + 'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', + 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', + 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn'] + logging.info('It may take some time to run all models, ' + 'set --network to run a specific one') + else: + networks = [opt.model] + + devs = [mx.gpu(0)] if len(get_gpus()) > 0 else [] + # Enable USE_MKLDNN for better CPU performance + devs.append(mx.cpu()) + + for network in networks: + logging.info('network: %s', network) + for d in devs: + logging.info('device: %s', d) + if runtype == 'inf' or runtype == 'all': + if bs != 0: + fwd_time = score(network, bs, d) + fps = (bs*num_batches)/fwd_time + logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) + else: + for batch_size in batch_inf: + fwd_time = score(network, batch_size, d) + fps = (batch_size * num_batches) / fwd_time + logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps) + if runtype == 'train' or runtype == 'all': + if bs != 0: + bwd_time = train(network, bs, d) + fps = (bs*num_batches)/bwd_time + logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) + else: + for batch_size in batch_train: + bwd_time = train(network, batch_size, d) + fps = (batch_size * num_batches) / bwd_time + logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps) + + From 1cb52906714da5a054bdff22904e35f58a22687f Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Wed, 10 Oct 2018 22:32:49 +0800 Subject: [PATCH 07/13] improve script lint --- example/image-classification/benchmark_score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index b3c1bb065e20..eb2b0dc17e60 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -105,7 +105,7 @@ def score(network, dev, batch_size, num_batches, dtype): if opt.batch_size == 0: batch_sizes = [1, 2, 4, 8, 16, 32] - logging.info('run batchsize [1 ,2 , 4, 8, 16, 32] by default, ' + logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' 'set --batch-size to run a specific one') else: batch_sizes = [opt.batch_size] From 0b17d6fb420ef80c3427cad21306839c14cac88e Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Fri, 12 Oct 2018 00:17:14 +0800 Subject: [PATCH 08/13] mv benchmark_gluon to new location --- {example => benchmark/python}/gluon/benchmark_gluon.py | 0 example/image-classification/benchmark_score.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename {example => benchmark/python}/gluon/benchmark_gluon.py (100%) diff --git a/example/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py similarity index 100% rename from example/gluon/benchmark_gluon.py rename to benchmark/python/gluon/benchmark_gluon.py diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index eb2b0dc17e60..c46cf25ae385 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -114,7 +114,7 @@ def score(network, dev, batch_size, num_batches, dtype): logging.info('network: %s', net) if net in ['densenet121', 'squeezenet1.1']: logging.info('network: %s is converted from gluon modelzoo', net) - logging.info('you can run example/gluon/benchmark_gluon.py for more models') + logging.info('you can run benchmark/python/gluon/benchmark_gluon.py for more models') for d in devs: logging.info('device: %s', d) logged_fp16_warning = False From d5e8a5dd9d9929ef259af717b492023df6f836ef Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Fri, 12 Oct 2018 00:19:17 +0800 Subject: [PATCH 09/13] support multi-gpus --- benchmark/python/gluon/benchmark_gluon.py | 90 ++++++++++++----------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py index 71caad236add..0a114644c47f 100644 --- a/benchmark/python/gluon/benchmark_gluon.py +++ b/benchmark/python/gluon/benchmark_gluon.py @@ -25,7 +25,7 @@ import errno logging.basicConfig(level=logging.INFO) -parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf') +parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf benchmark') parser.add_argument('--model', type=str, default='all', choices=['all', 'alexnet', 'densenet121', 'densenet161', @@ -38,26 +38,19 @@ 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']) parser.add_argument('--batch-size', type=int, default=0) -parser.add_argument('--type', type=str, default='inf', choices=['all', 'train', 'inf']) +parser.add_argument('--num-batches', type=int, default=10) +parser.add_argument('--gpus', type=str, default='', + help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.') +parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference']) opt = parser.parse_args() -num_batches = 100 +num_batches = opt.num_batches dry_run = 10 # use 10 iterations to warm up batch_inf = [1, 16, 32, 64, 128, 256] batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256] image_shapes = [(3, 224, 224), (3, 299, 299)] -def get_gpus(): - """ - return a list of GPUs - """ - try: - re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True) - except OSError: - return [] - return range(len([i for i in re.split('\n') if 'GPU' in i])) - def score(network, batch_size, ctx): net = models.get_model(network) if 'inceptionv3' == network: @@ -74,7 +67,10 @@ def score(network, batch_size, ctx): inputs_need_grad = False, data_shapes = data_shape) mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] + if mx.cpu() in ctx: + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes] + elif mx.gpu(0) in ctx: + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.gpu()) for _, shape in mod.data_shapes] batch = mx.io.DataBatch(data, []) for i in range(dry_run + num_batches): if i == dry_run: @@ -102,8 +98,14 @@ def train(network, batch_size, ctx): inputs_need_grad = False, data_shapes = data_shape) mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - mod.init_optimizer(kvstore='local', optimizer='sgd') - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] + if len(ctx) > 1: + mod.init_optimizer(kvstore='device', optimizer='sgd') + else: + mod.init_optimizer(kvstore='local', optimizer='sgd') + if mx.cpu() in ctx: + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes] + elif mx.gpu(0) in ctx: + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.gpu()) for _, shape in mod.data_shapes] batch = mx.io.DataBatch(data, []) for i in range(dry_run + num_batches): if i == dry_run: @@ -132,34 +134,34 @@ def train(network, batch_size, ctx): 'set --network to run a specific one') else: networks = [opt.model] - - devs = [mx.gpu(0)] if len(get_gpus()) > 0 else [] - # Enable USE_MKLDNN for better CPU performance - devs.append(mx.cpu()) + + devs = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()] + num_gpus = len(devs) for network in networks: logging.info('network: %s', network) - for d in devs: - logging.info('device: %s', d) - if runtype == 'inf' or runtype == 'all': - if bs != 0: - fwd_time = score(network, bs, d) - fps = (bs*num_batches)/fwd_time - logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) - else: - for batch_size in batch_inf: - fwd_time = score(network, batch_size, d) - fps = (batch_size * num_batches) / fwd_time - logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps) - if runtype == 'train' or runtype == 'all': - if bs != 0: - bwd_time = train(network, bs, d) - fps = (bs*num_batches)/bwd_time - logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) - else: - for batch_size in batch_train: - bwd_time = train(network, batch_size, d) - fps = (batch_size * num_batches) / bwd_time - logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps) - - + logging.info('device: %s', devs) + if runtype == 'inference' or runtype == 'all': + if bs != 0: + batch_sizes = bs * max(1, num_gpus) + fwd_time = score(network, batch_sizes, devs) + fps = (batch_sizes * num_batches)/fwd_time + logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) + else: + for batch_size in batch_inf: + batch_sizes = batch_size * max(1, num_gpus) + fwd_time = score(network, batch_sizes, devs) + fps = (batch_sizes * num_batches) / fwd_time + logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps) + if runtype == 'training' or runtype == 'all': + if bs != 0: + batch_sizes = bs * max(1, num_gpus) + bwd_time = train(network, batch_sizes, devs) + fps = (batch_sizes * num_batches) / bwd_time + logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) + else: + for batch_size in batch_train: + batch_sizes = batch_size * max(1, num_gpus) + bwd_time = train(network, batch_sizes, devs) + fps = (batch_sizes * num_batches) / bwd_time + logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps) From 129802536f25ea8ec6839e646e1e6c742d9e2394 Mon Sep 17 00:00:00 2001 From: Shufan Date: Sun, 14 Oct 2018 18:12:38 +0800 Subject: [PATCH 10/13] Add a new parameter 'global batchsize' for the batch size multiplication for multi-gpu case --- benchmark/python/gluon/benchmark_gluon.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py index 0a114644c47f..980c879129b3 100644 --- a/benchmark/python/gluon/benchmark_gluon.py +++ b/benchmark/python/gluon/benchmark_gluon.py @@ -41,11 +41,14 @@ parser.add_argument('--num-batches', type=int, default=10) parser.add_argument('--gpus', type=str, default='', help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.') +parser.add_argument('--global-batchsize', type=bool, default=True, + help='for multi-gpu case, the batchsize will not be multiplied with GPU number if set true.') parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference']) opt = parser.parse_args() num_batches = opt.num_batches +global_bs = opt.global_batchsize dry_run = 10 # use 10 iterations to warm up batch_inf = [1, 16, 32, 64, 128, 256] batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256] @@ -143,25 +146,37 @@ def train(network, batch_size, ctx): logging.info('device: %s', devs) if runtype == 'inference' or runtype == 'all': if bs != 0: - batch_sizes = bs * max(1, num_gpus) + if not global_bs: + batch_sizes = bs * max(1, num_gpus) + else: + batch_sizes = bs fwd_time = score(network, batch_sizes, devs) fps = (batch_sizes * num_batches)/fwd_time logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) else: for batch_size in batch_inf: - batch_sizes = batch_size * max(1, num_gpus) + if not global_bs: + batch_sizes = batch_size * max(1, num_gpus) + else: + batch_sizes = batch_size fwd_time = score(network, batch_sizes, devs) fps = (batch_sizes * num_batches) / fwd_time logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps) if runtype == 'training' or runtype == 'all': if bs != 0: - batch_sizes = bs * max(1, num_gpus) + if not global_bs: + batch_sizes = bs * max(1, num_gpus) + else: + batch_sizes = bs bwd_time = train(network, batch_sizes, devs) fps = (batch_sizes * num_batches) / bwd_time logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) else: for batch_size in batch_train: - batch_sizes = batch_size * max(1, num_gpus) + if not global_bs: + batch_sizes = batch_size * max(1, num_gpus) + else: + batch_sizes = batch_size bwd_time = train(network, batch_sizes, devs) fps = (batch_sizes * num_batches) / bwd_time logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps) From 39834113a1933ac93728ce827fab55adc2eb2411 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Mon, 15 Oct 2018 13:37:44 +0800 Subject: [PATCH 11/13] add batch size argument help --- benchmark/python/gluon/benchmark_gluon.py | 9 +++++++-- example/image-classification/benchmark_score.py | 5 +++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py index 980c879129b3..7943e05dbc81 100644 --- a/benchmark/python/gluon/benchmark_gluon.py +++ b/benchmark/python/gluon/benchmark_gluon.py @@ -25,7 +25,7 @@ import errno logging.basicConfig(level=logging.INFO) -parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN perf benchmark') +parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN performance benchmark') parser.add_argument('--model', type=str, default='all', choices=['all', 'alexnet', 'densenet121', 'densenet161', @@ -37,7 +37,8 @@ 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']) -parser.add_argument('--batch-size', type=int, default=0) +parser.add_argument('--batch-size', type=int, default=0, + help='run batch size [1, 2, 4, 8, 16, 32] by default') parser.add_argument('--num-batches', type=int, default=10) parser.add_argument('--gpus', type=str, default='', help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.') @@ -154,6 +155,8 @@ def train(network, batch_size, ctx): fps = (batch_sizes * num_batches)/fwd_time logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) else: + logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' + 'set --batch-size to run a specific one') for batch_size in batch_inf: if not global_bs: batch_sizes = batch_size * max(1, num_gpus) @@ -172,6 +175,8 @@ def train(network, batch_size, ctx): fps = (batch_sizes * num_batches) / bwd_time logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) else: + logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' + 'set --batch-size to run a specific one') for batch_size in batch_train: if not global_bs: batch_sizes = batch_size * max(1, num_gpus) diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index c46cf25ae385..3cdb1adb5f6e 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -29,13 +29,14 @@ import numpy as np logging.basicConfig(level=logging.DEBUG) -parser = argparse.ArgumentParser(description='SymbolAPI-based CNN perf') +parser = argparse.ArgumentParser(description='SymbolAPI-based CNN inference performance benchmark') parser.add_argument('--network', type=str, default='all', choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', 'resnet-152', 'inception-bn', 'inception-v3', 'inception-v4', 'inception-resnet-v2', 'mobilenet', 'densenet121', 'squeezenet1.1']) -parser.add_argument('--batch-size', type=int, default=0) +parser.add_argument('--batch-size', type=int, default=0, + help='run batch size [1, 2, 4, 8, 16, 32] by default') opt = parser.parse_args() From 68e80095d4afb1acc8ab1d75241be5eb6ff12fa3 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Tue, 16 Oct 2018 13:13:28 +0800 Subject: [PATCH 12/13] improve help and change default batchsize --- benchmark/python/gluon/benchmark_gluon.py | 15 ++++++++++----- example/image-classification/benchmark_score.py | 7 ++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py index 7943e05dbc81..4e029e9442ad 100644 --- a/benchmark/python/gluon/benchmark_gluon.py +++ b/benchmark/python/gluon/benchmark_gluon.py @@ -38,12 +38,17 @@ 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']) parser.add_argument('--batch-size', type=int, default=0, - help='run batch size [1, 2, 4, 8, 16, 32] by default') + help='Batch size to use for benchmarking. Example: 32, 64, 128.' + 'By default, runs benchmark for batch sizes - 1, 32, 64, 128, 256') parser.add_argument('--num-batches', type=int, default=10) parser.add_argument('--gpus', type=str, default='', - help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.') + help='GPU IDs to use for this benchmark task. Example: --gpus=0,1,2,3 to use 4 GPUs.' + 'By default, use CPU only.') parser.add_argument('--global-batchsize', type=bool, default=True, - help='for multi-gpu case, the batchsize will not be multiplied with GPU number if set true.') + help='Optional. Set this to True if batch-size should be used as is.' + 'Example: If --batch-size=64, --gpus=0,1. With --global-batchsize=True,' + 'each of the 2 GPUs will get 64/2 = 32 samples per batch.' + 'With --global-batchsize=False, each of the 2 GPUs will get 64 samples per batch.') parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference']) opt = parser.parse_args() @@ -51,8 +56,8 @@ num_batches = opt.num_batches global_bs = opt.global_batchsize dry_run = 10 # use 10 iterations to warm up -batch_inf = [1, 16, 32, 64, 128, 256] -batch_train = [1, 2, 4, 8, 16, 32, 64, 126, 256] +batch_inf = [1, 32, 64, 128, 256] +batch_train = [1, 32, 64, 128, 256] image_shapes = [(3, 224, 224), (3, 299, 299)] def score(network, batch_size, ctx): diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index 3cdb1adb5f6e..e81a30bd6439 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -36,7 +36,8 @@ 'inception-v4', 'inception-resnet-v2', 'mobilenet', 'densenet121', 'squeezenet1.1']) parser.add_argument('--batch-size', type=int, default=0, - help='run batch size [1, 2, 4, 8, 16, 32] by default') + help='Batch size to use for benchmarking. Example: 32, 64, 128.' + 'By default, runs benchmark for batch sizes - 1, 32, 64, 128, 256') opt = parser.parse_args() @@ -105,8 +106,8 @@ def score(network, dev, batch_size, num_batches, dtype): devs.append(mx.cpu()) if opt.batch_size == 0: - batch_sizes = [1, 2, 4, 8, 16, 32] - logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' + batch_sizes = [1, 32, 64, 128, 256] + logging.info('run batchsize [1, 32, 64, 128, 256] by default, ' 'set --batch-size to run a specific one') else: batch_sizes = [opt.batch_size] From e21c8f5594b656fa69a7d54ddb4c5b53617be5d1 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Fri, 19 Oct 2018 11:04:00 +0800 Subject: [PATCH 13/13] simplify benchmark_gluon --- benchmark/python/gluon/benchmark_gluon.py | 52 ++++++----------------- 1 file changed, 12 insertions(+), 40 deletions(-) diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py index 4e029e9442ad..3dbb36404d07 100644 --- a/benchmark/python/gluon/benchmark_gluon.py +++ b/benchmark/python/gluon/benchmark_gluon.py @@ -44,30 +44,24 @@ parser.add_argument('--gpus', type=str, default='', help='GPU IDs to use for this benchmark task. Example: --gpus=0,1,2,3 to use 4 GPUs.' 'By default, use CPU only.') -parser.add_argument('--global-batchsize', type=bool, default=True, - help='Optional. Set this to True if batch-size should be used as is.' - 'Example: If --batch-size=64, --gpus=0,1. With --global-batchsize=True,' - 'each of the 2 GPUs will get 64/2 = 32 samples per batch.' - 'With --global-batchsize=False, each of the 2 GPUs will get 64 samples per batch.') parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference']) opt = parser.parse_args() num_batches = opt.num_batches -global_bs = opt.global_batchsize dry_run = 10 # use 10 iterations to warm up batch_inf = [1, 32, 64, 128, 256] batch_train = [1, 32, 64, 128, 256] image_shapes = [(3, 224, 224), (3, 299, 299)] def score(network, batch_size, ctx): + assert (batch_size >= len(ctx)), "ERROR: batch size should not be smaller than num of GPUs." net = models.get_model(network) if 'inceptionv3' == network: data_shape = [('data', (batch_size,) + image_shapes[1])] else: data_shape = [('data', (batch_size,) + image_shapes[0])] - net.hybridize() data = mx.sym.var('data') out = net(data) softmax = mx.sym.SoftmaxOutput(out, name='softmax') @@ -76,10 +70,7 @@ def score(network, batch_size, ctx): inputs_need_grad = False, data_shapes = data_shape) mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - if mx.cpu() in ctx: - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes] - elif mx.gpu(0) in ctx: - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.gpu()) for _, shape in mod.data_shapes] + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx[0]) for _, shape in mod.data_shapes] batch = mx.io.DataBatch(data, []) for i in range(dry_run + num_batches): if i == dry_run: @@ -92,13 +83,13 @@ def score(network, batch_size, ctx): def train(network, batch_size, ctx): + assert (batch_size >= len(ctx)), "ERROR: batch size should not be smaller than num of GPUs." net = models.get_model(network) if 'inceptionv3' == network: data_shape = [('data', (batch_size,) + image_shapes[1])] else: data_shape = [('data', (batch_size,) + image_shapes[0])] - net.hybridize() data = mx.sym.var('data') out = net(data) softmax = mx.sym.SoftmaxOutput(out, name='softmax') @@ -111,10 +102,7 @@ def train(network, batch_size, ctx): mod.init_optimizer(kvstore='device', optimizer='sgd') else: mod.init_optimizer(kvstore='local', optimizer='sgd') - if mx.cpu() in ctx: - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes] - elif mx.gpu(0) in ctx: - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.gpu()) for _, shape in mod.data_shapes] + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx[0]) for _, shape in mod.data_shapes] batch = mx.io.DataBatch(data, []) for i in range(dry_run + num_batches): if i == dry_run: @@ -152,41 +140,25 @@ def train(network, batch_size, ctx): logging.info('device: %s', devs) if runtype == 'inference' or runtype == 'all': if bs != 0: - if not global_bs: - batch_sizes = bs * max(1, num_gpus) - else: - batch_sizes = bs - fwd_time = score(network, batch_sizes, devs) - fps = (batch_sizes * num_batches)/fwd_time + fwd_time = score(network, bs, devs) + fps = (bs * num_batches)/fwd_time logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) else: logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' 'set --batch-size to run a specific one') for batch_size in batch_inf: - if not global_bs: - batch_sizes = batch_size * max(1, num_gpus) - else: - batch_sizes = batch_size - fwd_time = score(network, batch_sizes, devs) - fps = (batch_sizes * num_batches) / fwd_time + fwd_time = score(network, batch_size, devs) + fps = (batch_size * num_batches) / fwd_time logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps) if runtype == 'training' or runtype == 'all': if bs != 0: - if not global_bs: - batch_sizes = bs * max(1, num_gpus) - else: - batch_sizes = bs - bwd_time = train(network, batch_sizes, devs) - fps = (batch_sizes * num_batches) / bwd_time + bwd_time = train(network, bs, devs) + fps = (bs * num_batches) / bwd_time logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) else: logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' 'set --batch-size to run a specific one') for batch_size in batch_train: - if not global_bs: - batch_sizes = batch_size * max(1, num_gpus) - else: - batch_sizes = batch_size - bwd_time = train(network, batch_sizes, devs) - fps = (batch_sizes * num_batches) / bwd_time + bwd_time = train(network, batch_size, devs) + fps = (batch_size * num_batches) / bwd_time logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps)