From b360d8cace6b44a9cb9cc7c3da084b0f7422710d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 9 Aug 2018 14:21:05 -0700 Subject: [PATCH 001/529] [TEST] force openblas threads to be 1 (#1580) --- tests/scripts/task_python_nnvm.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/scripts/task_python_nnvm.sh b/tests/scripts/task_python_nnvm.sh index 2fc41980fb3d..790073a2fe8b 100755 --- a/tests/scripts/task_python_nnvm.sh +++ b/tests/scripts/task_python_nnvm.sh @@ -1,6 +1,8 @@ #!/bin/bash export PYTHONPATH=nnvm/python:python:topi/python +# to avoid openblas threading error +export OMP_NUM_THREADS=1 echo "Running unittest..." python -m nose -v nnvm/tests/python/unittest || exit -1 From 0688ceb3deeb9b1fe656ac5cff36f50e1d02b3b1 Mon Sep 17 00:00:00 2001 From: Dayananda V Date: Fri, 10 Aug 2018 07:11:49 +0530 Subject: [PATCH 002/529] Vulkan TVM Android Support (#1571) --- apps/android_rpc/README.md | 25 +++-- .../app/src/main/jni/Application.mk | 16 ++-- apps/android_rpc/tests/android_rpc_test.py | 91 +++++++++++++------ .../src/main/java/ml/dmlc/tvm/TVMContext.java | 15 +++ .../main/java/ml/dmlc/tvm/rpc/RPCSession.java | 18 ++++ python/tvm/rpc/client.py | 4 + web/tvm_runtime.js | 2 + 7 files changed, 125 insertions(+), 46 deletions(-) diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md index 41d361c823ed..eef22f3c7010 100644 --- a/apps/android_rpc/README.md +++ b/apps/android_rpc/README.md @@ -123,18 +123,25 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++ python android_rpc_test.py ``` -This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector addition on your Android device. On my test device, it gives following results. +This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. 
To verify compiled TVM IR shared libraries on OpenCL target set [`'test_opencl = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L25) and on Vulkan target set [`'test_vulkan = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L27) in [tests/android_rpc_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py); by default, only the CPU target is executed. +On my test device, it gives the following results. ```bash -TVM: Initializing cython mode... -[01:21:43] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64 -[01:21:43] src/runtime/opencl/opencl_device_api.cc:194: Initialize OpenCL platform 'Apple' -[01:21:43] src/runtime/opencl/opencl_device_api.cc:214: opencl(0)='Iris' cl_device_id=0x1024500 -[01:21:44] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64 -Run GPU test ... -0.000155807 secs/op Run CPU test ... -0.00139824 secs/op +0.000962932 secs/op + +Run GPU(OpenCL Flavor) test ... +0.000155807 secs/op + +[23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:674: Cannot initialize vulkan: [23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:512: Check failed: __e == VK_SUCCESS Vulan Error, code=-9: VK_ERROR_INCOMPATIBLE_DRIVER + +Stack trace returned 10 entries: +[bt] (0) /home/user/.local/lib/python3.6/site-packages/tvm-0.4.0-py3.6-linux-x86_64.egg/tvm/libtvm.so(dmlc::StackTrace[abi:cxx11]()+0x53) [0x7f477f5399f3] +......... + +You can still compile vulkan module but cannot run locally +Run GPU(Vulkan Flavor) test ... +0.000225198 secs/op ``` You can define your own TVM operators and test via this RPC app on your Android device to find the most optimized TVM schedule.
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk index 5bf52bdaffc0..f142e2995777 100644 --- a/apps/android_rpc/app/src/main/jni/Application.mk +++ b/apps/android_rpc/app/src/main/jni/Application.mk @@ -1,9 +1,9 @@ ifndef config - ifneq ("$(wildcard ./config.mk)","") - config ?= config.mk - else - config ?= make/config.mk - endif + ifneq ("$(wildcard ./config.mk)","") + config ?= config.mk + else + config ?= make/config.mk + endif endif include $(config) @@ -16,10 +16,10 @@ APP_STL := c++_static APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti ifeq ($(USE_OPENCL), 1) - APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 + APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif ifeq ($(USE_VULKAN), 1) - APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1 - APP_LDFLAGS += -lvulkan + APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1 + APP_LDFLAGS += -lvulkan endif diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py index cfb04c1ca9a9..44618efd45c1 100644 --- a/apps/android_rpc/tests/android_rpc_test.py +++ b/apps/android_rpc/tests/android_rpc_test.py @@ -21,59 +21,92 @@ arch = "arm64" target = "llvm -target=%s-linux-android" % arch +# whether enable to execute test on OpenCL target +test_opencl = False +# whether enable to execute test on Vulkan target +test_vulkan = False + def test_rpc_module(): # graph n = tvm.convert(1024) A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + a_np = np.random.uniform(size=1024).astype(A.dtype) temp = util.tempdir() - s = tvm.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].bind(xi, tvm.thread_axis("threadIdx.x")) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) - # Build the dynamic lib. 
- # If we don't want to do metal and only use cpu, just set target to be target - f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd") - path_dso1 = temp.relpath("dev_lib2.so") - f.export_library(path_dso1, ndk.create_shared) + # Establish remote connection with target hardware + tracker = rpc.connect_tracker(tracker_host, tracker_port) + remote = tracker.request(key, priority=0, + session_timeout=60) + + # Compile the Graph for CPU target s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") s[B].pragma(xi, "parallel_barrier_when_finish") f = tvm.build(s, [A, B], target, name="myadd_cpu") - path_dso2 = temp.relpath("cpu_lib.so") - f.export_library(path_dso2, ndk.create_shared) - - tracker = rpc.connect_tracker(tracker_host, tracker_port) - remote = tracker.request(key, priority=0, - session_timeout=60) + path_dso_cpu = temp.relpath("cpu_lib.so") + f.export_library(path_dso_cpu, ndk.create_shared) + # Execute the portable graph on cpu target print('Run CPU test ...') ctx = remote.cpu(0) - remote.upload(path_dso2) + remote.upload(path_dso_cpu) f2 = remote.load_module("cpu_lib.so") - a_np = np.random.uniform(size=1024).astype(A.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f2.time_evaluator(f2.entry_name, ctx, number=10) cost = time_f(a, b).mean - print('%g secs/op' % cost) + print('%g secs/op\n' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) + # Compile the Graph for OpenCL target + if test_opencl: + s = tvm.create_schedule(B.op) + xo, xi = s[B].split(B.op.axis[0], factor=64) + s[B].bind(xi, tvm.thread_axis("threadIdx.x")) + s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + # Build the dynamic lib. 
+ # If we don't want to do metal and only use cpu, just set target to be target + f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd") + path_dso_cl = temp.relpath("dev_lib_cl.so") + f.export_library(path_dso_cl, ndk.create_shared) + + print('Run GPU(OpenCL Flavor) test ...') + ctx = remote.cl(0) + remote.upload(path_dso_cl) + f1 = remote.load_module("dev_lib_cl.so") + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + cost = time_f(a, b).mean + print('%g secs/op\n' % cost) + np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) + + # Compile the Graph for Vulkan target + if test_vulkan: + s = tvm.create_schedule(B.op) + xo, xi = s[B].split(B.op.axis[0], factor=64) + s[B].bind(xi, tvm.thread_axis("threadIdx.x")) + s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + # Build the dynamic lib. + # If we don't want to do metal and only use cpu, just set target to be target + f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd") + path_dso_vulkan = temp.relpath("dev_lib_vulkan.so") + f.export_library(path_dso_vulkan, ndk.create_shared) + + print('Run GPU(Vulkan Flavor) test ...') + ctx = remote.vulkan(0) + remote.upload(path_dso_vulkan) + f1 = remote.load_module("dev_lib_vulkan.so") + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + cost = time_f(a, b).mean + print('%g secs/op\n' % cost) + np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) - print('Run GPU test ...') - ctx = remote.cl(0) - remote.upload(path_dso1) - f1 = remote.load_module("dev_lib2.so") - a_np = np.random.uniform(size=1024).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) - cost = time_f(a, b).mean - print('%g secs/op' % cost) - np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 
1) if __name__ == "__main__": test_rpc_module() diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java index 0d108e0a2943..d9051f0d9d4d 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java @@ -30,6 +30,7 @@ public class TVMContext { MASK2STR.put(1, "cpu"); MASK2STR.put(2, "gpu"); MASK2STR.put(4, "opencl"); + MASK2STR.put(7, "vulkan"); MASK2STR.put(8, "metal"); MASK2STR.put(9, "vpi"); @@ -38,6 +39,7 @@ public class TVMContext { STR2MASK.put("cuda", 2); STR2MASK.put("cl", 4); STR2MASK.put("opencl", 4); + STR2MASK.put("vulkan", 7); STR2MASK.put("metal", 8); STR2MASK.put("vpi", 9); } @@ -81,6 +83,19 @@ public static TVMContext opencl() { return opencl(0); } + /** + * Construct a Vulkan device. + * @param devId The device id + * @return The created context + */ + public static TVMContext vulkan(int devId) { + return new TVMContext(7, devId); + } + + public static TVMContext vulkan() { + return vulkan(0); + } + /** * Construct a metal device. * @param devId The device id diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java index 0eec9224a40c..8ebf188b0667 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java @@ -143,6 +143,24 @@ public TVMContext cl() { return cl(0); } + /** + * Construct remote Vulkan device. + * @param devId device id. + * @return Remote Vulkan context. + */ + public TVMContext vulkan(int devId) { + return context(7, devId); + } + + /** + * Construct remote Vulkan device. + * @return Remote Vulkan context. + */ + public TVMContext vulkan() { + return vulkan(0); + } + + /** * Construct remote Metal device. * @param devId device id.
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index 57f368b0e660..ffbe6eeab6ee 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -130,6 +130,10 @@ def cl(self, dev_id=0): """Construct OpenCL device.""" return self.context(4, dev_id) + def vulkan(self, dev_id=0): + """Construct Vulkan device.""" + return self.context(7, dev_id) + def metal(self, dev_id=0): """Construct Metal device.""" return self.context(8, dev_id) diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index ef594e9433fb..786745d3ce88 100644 --- a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -696,6 +696,7 @@ var tvm_runtime = tvm_runtime || {}; 1 : "cpu", 2 : "gpu", 4 : "opencl", + 7 : "vulkan", 8 : "metal", 9 : "vpi", 11 : "opengl", @@ -706,6 +707,7 @@ var tvm_runtime = tvm_runtime || {}; "cuda": 2, "cl": 4, "opencl": 4, + "vulkan": 7, "metal": 8, "vpi": 9, "opengl": 11, From ddadde8987aea401950692c570b16a421f47d680 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 9 Aug 2018 18:55:48 -0700 Subject: [PATCH 003/529] [TEAM] merrymercy->code owner (#1581) --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6e3cf55b94b0..2d571ba668ea 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -18,6 +18,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h - [Yuwei Hu](https://github.com/Huyuwei) TOPI - [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend - [Nick Hynes](https://github.com/nhynes) SGX and secured computing +- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM ## Reviewers - [Masahiro Masuda](https://github.com/masahi) @@ -27,7 +28,6 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h - [Alex Weaver](https://github.com/alex-weaver) - [Eddie Yan](https://github.com/eqy) - [Joshua Z. 
Zhang](https://github.com/zhreshold) -- [Lianmin Zheng](https://github.com/merrymercy) ## List of Contributors - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors) From a6ec4faf9628362d9c092933f515c3ca751efe50 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 10 Aug 2018 11:45:09 -0700 Subject: [PATCH 004/529] [AUTOTVM] API change (#1583) --- python/tvm/autotvm/__init__.py | 2 +- python/tvm/autotvm/measure/__init__.py | 2 +- python/tvm/autotvm/measure/measure.py | 10 ++++---- python/tvm/autotvm/measure/measure_methods.py | 24 +++++++++---------- python/tvm/autotvm/tuner/tuner.py | 4 ++-- tutorials/autotvm/tune_conv2d_cuda.py | 2 +- tutorials/autotvm/tune_nnvm_arm.py | 6 ++--- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py index 20426be84aa1..5b312d93d288 100644 --- a/python/tvm/autotvm/__init__.py +++ b/python/tvm/autotvm/__init__.py @@ -22,7 +22,7 @@ from . import tophub # some shortcuts -from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, use_rpc +from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo from .tuner import callback from .task import template, get_config, create, ConfigSpace, ConfigEntity, \ ApplyHistoryBest as apply_history_best diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py index b9bd3c37b01d..880dfd1ffe29 100644 --- a/python/tvm/autotvm/measure/__init__.py +++ b/python/tvm/autotvm/measure/__init__.py @@ -1,7 +1,7 @@ """Distributed executor infrastructure to scale up the tuning""" from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option -from .measure_methods import request_remote, check_remote, create_measure_batch, use_rpc +from .measure_methods import request_remote, check_remote, create_measure_batch, rpc from .local_executor import LocalExecutor from .executor import Future, Executor diff --git 
a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py index 6a05e1a6a349..2325a970bc45 100644 --- a/python/tvm/autotvm/measure/measure.py +++ b/python/tvm/autotvm/measure/measure.py @@ -49,7 +49,7 @@ def measure_option(measure_func, number=1, repeat=1, timeout=60, - parallel_num=1, + n_parallel=1, do_fork=True, build_func='default', check_correctness=False, @@ -63,7 +63,7 @@ def measure_option(measure_func, and a RPC server silently for the user. callable: It is a callable function for measurement. - See the return value of measure/measure_methods.py::use_rpc for example. + See the return value of measure/measure_methods.py::rpc for example. number : int, optional Number of times to do the measurement for average repeat : int, optional @@ -74,7 +74,7 @@ def measure_option(measure_func, timeout: int, optional Timeout for a whole batch. TimeoutError will be returned as the result if a task timeouts. - parallel_num: int, optional + n_parallel: int, optional The number of measurement task that can run in parallel. Set this according to the number of cpu cores (for compilation) and the number of devices you have (for measuring generate code). @@ -106,7 +106,7 @@ def measure_option(measure_func, and handle the logic of measurement. 
Signature: - * measure_func (see the return value of measure/measure_methods.py::use_rpc for example) + * measure_func (see the return value of measure/measure_methods.py::rpc for example) def measure_func(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output): return measure_results @@ -119,7 +119,7 @@ def build_func(inp, tmp_dir, **kwargs): 'number': number, 'repeat': repeat, 'timeout': timeout, - 'parallel_num': parallel_num, + 'n_parallel': n_parallel, 'do_fork': do_fork, 'build_func': build_func, 'check_correctness': check_correctness, diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 6e95a6e435d0..e192ee26ee3e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -13,8 +13,8 @@ import numpy as np -from ... import rpc, ir_pass, build, build_config, nd, context, TVMError, register_func, \ - target as _target +from ... import ir_pass, build, build_config, nd, context, TVMError, register_func, \ + target as _target, rpc as _rpc from ...contrib import nvcc, util, ndk from ..util import get_const_tuple @@ -60,7 +60,7 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60): host = os.environ['TVM_TRACKER_HOST'] port = int(os.environ['TVM_TRACKER_PORT']) - tracker = rpc.connect_tracker(host, port) + tracker = _rpc.connect_tracker(host, port) remote = tracker.request(device_key, priority=priority, session_timeout=timeout) return remote @@ -113,7 +113,7 @@ def create_measure_batch(task, option): measure_func = option['measure_func'] number, repeat = option['number'], option['repeat'] - timeout, parallel_num, do_fork = option['timeout'], option['parallel_num'], option['do_fork'] + timeout, n_parallel, do_fork = option['timeout'], option['n_parallel'], option['do_fork'] build_func = option['build_func'] check_correctness = option['check_correctness'] replay_db = option['replay_db'] @@ -134,7 +134,7 @@ def 
create_measure_batch(task, option): use_popen=True, silent=True, tracker_addr=(tracker.host, tracker.port)) - measure_func = use_rpc(device_key, tracker.host, tracker.port) + measure_func = rpc(device_key, tracker.host, tracker.port) attach_objects = (server, tracker) build_kwargs = {} @@ -218,18 +218,18 @@ def measure_batch(measure_inputs): return partial_results return results - measure_batch.parallel_num = parallel_num + measure_batch.n_parallel = n_parallel # attach server and tracker object to avoid them of being garbage-collected measure_batch.attach_objects = attach_objects return measure_batch -def use_rpc(key, - host=None, - port=None, - priority=1, - session_timeout=60, - pack_size=1): +def rpc(key, + host=None, + port=None, + priority=1, + session_timeout=60, + pack_size=1): """ Create a standard measure_func which uses RPC Tracker for measurement. This measure_func will request a device from the RPC Tracker and diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 5d1fc1507e58..91004cba4603 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -85,7 +85,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()): every measurement pair. See autotvm/tuner/callback.py for some examples. 
""" measure_batch = create_measure_batch(self.task, measure_option) - parallel_num = getattr(measure_batch, 'parallel_num', 1) + n_parallel = getattr(measure_batch, 'n_parallel', 1) early_stopping = early_stopping or 1e9 old_level = logger.level @@ -95,7 +95,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()): if not self.has_next(): break - configs = self.next_batch(min(parallel_num, n_trial - i)) + configs = self.next_batch(min(n_parallel, n_trial - i)) inputs = [MeasureInput(self.task.target, self.task, config) for config in configs] results = measure_batch(inputs) diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index 179ac811ab70..375d1a9b755e 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -168,7 +168,7 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding): # run 8 parallel threads for compilation measure_option = autotvm.measure_option('local', number=5, - parallel_num=8, + n_parallel=8, timeout=20) # begin tuning, log records to file `conv2d.log` diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py index d11823f204e1..f3d1c62bdaf2 100644 --- a/tutorials/autotvm/tune_nnvm_arm.py +++ b/tutorials/autotvm/tune_nnvm_arm.py @@ -191,9 +191,9 @@ def get_network(name, batch_size): 'early_stopping': 250, 'measure_option': autotvm.measure_option( - autotvm.use_rpc(device_key, host='localhost', port=9190), + autotvm.measure.rpc(device_key, host='localhost', port=9190), number=4, - parallel_num=1, + n_parallel=1, timeout=10, build_func='ndk' if use_android else 'default', ), @@ -205,7 +205,7 @@ def get_network(name, batch_size): # # In general, the default value provided here works well. It is the same # value that we used to generate pre-tuned parameters. 
-# If you have multiple devices, you can set :code:`parallel_num` to +# If you have multiple devices, you can set :code:`n_parallel` to # the number of devices you have. (e.g. set it to 3 if you register 3 rk3399 # boards to the tracker). # If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger, From e571a80f584f007695a217638f124a568010314f Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Fri, 10 Aug 2018 18:01:55 -0400 Subject: [PATCH 005/529] update dmlc-core for security reason (#1584) --- dmlc-core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc-core b/dmlc-core index e864aa6757cd..4f0564ec7694 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit e864aa6757cdbe78b1296fe5231fd3050b7802c3 +Subproject commit 4f0564ec769477c66d480dd966088f172050c874 From 545d10c617d8f0eb082ddb0854edb700a82495db Mon Sep 17 00:00:00 2001 From: eqy Date: Fri, 10 Aug 2018 15:02:10 -0700 Subject: [PATCH 006/529] DLPack Conversion API (#1573) --- include/tvm/runtime/c_runtime_api.h | 26 ++++++++++++ include/tvm/runtime/ndarray.h | 2 +- python/tvm/_ffi/ndarray.py | 62 ++++++++++++++++++++++++++++- python/tvm/contrib/dlpack.py | 43 ++++++++++++++++++++ python/tvm/ndarray.py | 2 +- src/runtime/ndarray.cc | 36 +++++++++++++---- tests/python/contrib/test_dlpack.py | 44 ++++++++++++++++++++ 7 files changed, 205 insertions(+), 10 deletions(-) create mode 100644 python/tvm/contrib/dlpack.py create mode 100644 tests/python/contrib/test_dlpack.py diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 17d00bf479aa..dca0d5ed4a30 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -445,6 +445,32 @@ TVM_DLL int TVMArrayCopyFromTo(TVMArrayHandle from, TVMArrayHandle to, TVMStreamHandle stream); +/*! + * \brief Produce an array from the DLManagedTensor that shares data memory + * with the DLManagedTensor. + * \param from The source DLManagedTensor. 
+ * \param out The output array handle. + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMArrayFromDLPack(DLManagedTensor* from, + TVMArrayHandle* out); + +/*! + * \brief Produce a DLManagedTensor from the array that shares data memory with + * the array. + * \param from The source array. + * \param out The DLManagedTensor handle. + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMArrayToDLPack(TVMArrayHandle from, + DLManagedTensor** out); + +/*! + * \brief Delete (free) a DLManagedTensor's data. + * \param dltensor Pointer to the DLManagedTensor. + */ +TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor); + /*! * \brief Create a new runtime stream. * diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 2b51b2e0fcfe..d3ecce8ba9d0 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -155,7 +155,7 @@ class NDArray { * that is DLPack compatible. * * The memory is retained until the NDArray went out of scope. - * + * \param tensor The DLPack tensor to copy from. * \return The created NDArray view.
*/ TVM_DLL static NDArray FromDLPack(DLManagedTensor* tensor); diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py index 3788c07ac440..d994d7c2e4a5 100644 --- a/python/tvm/_ffi/ndarray.py +++ b/python/tvm/_ffi/ndarray.py @@ -5,7 +5,7 @@ import sys import ctypes import numpy as np -from .base import _LIB, check_call, c_array, string_types, _FFI_MODE +from .base import _LIB, check_call, c_array, string_types, _FFI_MODE, c_str from .runtime_ctypes import TVMType, TVMContext, TVMArray, TVMArrayHandle from .runtime_ctypes import TypeCode, tvm_shape_index_t @@ -28,6 +28,17 @@ from ._ctypes.ndarray import NDArrayBase as _NDArrayBase +TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p) +_c_str_dltensor = c_str('dltensor') + + +# used for PyCapsule manipulation +if hasattr(ctypes, 'pythonapi'): + ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p + ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p + ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object + + def context(dev_type, dev_id=0): """Construct a TVM context with given device type and id. @@ -62,6 +73,7 @@ def context(dev_type, dev_id=0): dev_type = TVMContext.STR2MASK[dev_type] return TVMContext(dev_type, dev_id) + def numpyasarray(np_data): """Return a TVMArray representation of a numpy array. """ @@ -112,6 +124,42 @@ def empty(shape, dtype="float32", ctx=context(1, 0)): ctypes.byref(handle))) return _make_array(handle, False) + +def from_dlpack(dltensor): + """Produce an array from a DLPack tensor without memory copy. + Retrieves the underlying DLPack tensor's pointer to create an array from the + data. Removes the original DLPack tensor's destructor as now the array is + responsible for destruction. + + Parameters + ---------- + dltensor : DLPack tensor + + Returns + ------- + arr: tvm.nd.NDArray + The array view of the tensor data.
+ """ + dltensor = ctypes.py_object(dltensor) + name = ctypes.pythonapi.PyCapsule_GetName(dltensor) + ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, name) + handle = TVMArrayHandle() + check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle))) + ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, None) + return _make_array(handle, False) + + +def _dlpack_deleter(pycapsule): + pycapsule = ctypes.py_object(pycapsule) + if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor): + ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor) + _LIB.TVMDLManagedTensorCallDeleter(ptr) + ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) + + +_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter) + + class NDArrayBase(_NDArrayBase): """A simple Device/CPU Array object in runtime.""" @property @@ -260,6 +308,18 @@ def copyto(self, target): raise ValueError("Unsupported target type %s" % str(type(target))) return target + def to_dlpack(self): + """Produce a DLPack tensor from the array without copying memory + + Returns + ------- + dlpack : DLPack tensor view of the array data + """ + handle = ctypes.c_void_p() + check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle))) + return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter) + + def free_extension_handle(handle, type_code): """Free c++ extension type handle diff --git a/python/tvm/contrib/dlpack.py b/python/tvm/contrib/dlpack.py new file mode 100644 index 000000000000..11db29f98b3e --- /dev/null +++ b/python/tvm/contrib/dlpack.py @@ -0,0 +1,43 @@ +"""Wrapping functions to bridge frameworks with DLPack support to TVM""" +from ..
import ndarray + +def convert_func(tvm_func, tensor_type, to_dlpack_func): + """Convert a tvm function into one that accepts a tensor from another + framework, provided the other framework supports DLPACK + + Parameters + ---------- + tvm_func: Function + Built tvm function operating on arrays + + tensor_type: Type + Type of the tensors of the target framework + + to_dlpack_func: Function + Function to convert the source tensors to DLPACK + """ + assert callable(tvm_func) + + def _wrapper(*args): + args = tuple(ndarray.from_dlpack(to_dlpack_func(arg))\ + if isinstance(arg, tensor_type) else arg for arg in args) + return tvm_func(*args) + + return _wrapper + +def to_pytorch_func(tvm_func): + """Convert a tvm function into one that accepts PyTorch tensors + + Parameters + ---------- + tvm_func: Function + Built tvm function operating on arrays + + Returns + ------- + wrapped_func: Function + Wrapped tvm function that operates on PyTorch tensors + """ + import torch + import torch.utils.dlpack + return convert_func(tvm_func, torch.Tensor, torch.utils.dlpack.to_dlpack) diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index 18e958973d94..448e5f6d8bdb 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -8,7 +8,7 @@ import numpy as _np from ._ffi.ndarray import TVMContext, TVMType, NDArrayBase -from ._ffi.ndarray import context, empty +from ._ffi.ndarray import context, empty, from_dlpack from ._ffi.ndarray import _set_class_ndarray from ._ffi.ndarray import register_extension, free_extension_handle diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index f862f32f6e99..424a2b09cb15 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -93,6 +93,16 @@ struct NDArray::Internal { arr.data_ = nullptr; return tensor; } + // Container to DLManagedTensor + static DLManagedTensor* ToDLPack(NDArray::Container* from) { + CHECK(from != nullptr); + DLManagedTensor* ret = new DLManagedTensor(); + ret->dl_tensor = from->dl_tensor; + 
ret->manager_ctx = from; + from->IncRef(); + ret->deleter = NDArrayDLPackDeleter; + return ret; + } }; NDArray NDArray::CreateView(std::vector shape, @@ -115,13 +125,7 @@ NDArray NDArray::CreateView(std::vector shape, } DLManagedTensor* NDArray::ToDLPack() const { - CHECK(data_ != nullptr); - DLManagedTensor* ret = new DLManagedTensor(); - ret->dl_tensor = data_->dl_tensor; - ret->manager_ctx = const_cast(this); - data_->IncRef(); - ret->deleter = NDArrayDLPackDeleter; - return ret; + return Internal::ToDLPack(data_); } NDArray NDArray::Empty(std::vector shape, @@ -213,6 +217,24 @@ int TVMArrayCopyFromTo(TVMArrayHandle from, API_END(); } +int TVMArrayFromDLPack(DLManagedTensor* from, + TVMArrayHandle* out) { + API_BEGIN(); + *out = NDArray::Internal::MoveAsDLTensor(NDArray::FromDLPack(from)); + API_END(); +} + +int TVMArrayToDLPack(TVMArrayHandle from, + DLManagedTensor** out) { + API_BEGIN(); + *out = NDArray::Internal::ToDLPack(reinterpret_cast(from)); + API_END(); +} + +void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor) { + (*(dltensor->deleter))(dltensor); +} + int TVMArrayCopyFromBytes(TVMArrayHandle handle, void* data, size_t nbytes) { diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py new file mode 100644 index 000000000000..9a8ea34e69d5 --- /dev/null +++ b/tests/python/contrib/test_dlpack.py @@ -0,0 +1,44 @@ +import tvm +import numpy as np +from tvm.contrib.dlpack import to_pytorch_func + +def test(): + a = np.random.randn(1337) + tvm_a = tvm.nd.array(a) + np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).asnumpy(), a) + + try: + import torch + import torch.utils.dlpack + + x = torch.rand(56, 56) + tvm_x = tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(x)) + np.testing.assert_equal(x.numpy(), tvm_x.asnumpy()) + y = tvm.nd.from_dlpack(tvm_x.to_dlpack()) + np.testing.assert_equal(y.asnumpy(), tvm_x.asnumpy()) + np.testing.assert_equal(torch.utils.dlpack.from_dlpack(y.to_dlpack()).numpy(), 
tvm_x.asnumpy()) + + n = tvm.convert(137) + xx = torch.rand(137,137) + yy = torch.rand(137,137) + zz2 = torch.empty(137,137) + zz = xx.mm(yy) + XX = tvm.placeholder((n,n), name='X') + YY = tvm.placeholder((n,n), name='Y') + + k = tvm.reduce_axis((0, n), name='k') + ZZ = tvm.compute((n,n), lambda i,j : tvm.sum(XX[i,k]*YY[k,j], axis=k)) + s = tvm.create_schedule(ZZ.op) + f = tvm.build(s, [XX, YY, ZZ], target_host='llvm', name='f') + + f_pytorch = to_pytorch_func(f) + zz2 = torch.empty(137,137) + f_pytorch(xx, yy, zz2) + np.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-6) + + except ImportError: + pass + + +if __name__ == '__main__': + test() From 7009295e538a1775b3b4b34d871232132cd0479b Mon Sep 17 00:00:00 2001 From: eqy Date: Fri, 10 Aug 2018 19:04:46 -0700 Subject: [PATCH 007/529] use phone EditText for numerical fields (#1587) --- apps/android_rpc/app/src/main/res/layout/content_main.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml index 0f2564833ecd..82be44d98451 100644 --- a/apps/android_rpc/app/src/main/res/layout/content_main.xml +++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml @@ -20,6 +20,7 @@ android:hint="@string/input_address" android:layout_width="wrap_content" android:layout_height="wrap_content" + android:inputType="phone" android:background="@android:drawable/editbox_background"/> @@ -37,6 +38,7 @@ android:minWidth="100dip" android:layout_width="wrap_content" android:layout_height="wrap_content" + android:inputType="phone" android:background="@android:drawable/editbox_background"/> From 77dc1c446832a8c70b005690c744e00ff9bcf00a Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 11 Aug 2018 09:15:05 -0700 Subject: [PATCH 008/529] [RUNTIME] Refactor to enable stackvm in runtime. 
(#1588) --- CMakeLists.txt | 12 +- Jenkinsfile | 1 + cmake/config.cmake | 3 + include/tvm/ir.h | 20 +-- include/tvm/runtime/util.h | 28 +++- python/tvm/module.py | 9 +- src/codegen/codegen.cc | 1 - src/codegen/stack_vm/stack_vm_module.cc | 71 ---------- .../codegen_stackvm.cc} | 23 +++- .../codegen_stackvm.h} | 10 +- .../stackvm/stackvm.cc} | 71 +++++++--- .../stack_vm.h => runtime/stackvm/stackvm.h} | 55 +++++--- src/runtime/stackvm/stackvm_module.cc | 128 ++++++++++++++++++ src/runtime/stackvm/stackvm_module.h | 27 ++++ tests/python/unittest/test_module_load.py | 22 ++- 15 files changed, 337 insertions(+), 144 deletions(-) delete mode 100644 src/codegen/stack_vm/stack_vm_module.cc rename src/codegen/{stack_vm/codegen_stack_vm.cc => stackvm/codegen_stackvm.cc} (95%) rename src/codegen/{stack_vm/codegen_stack_vm.h => stackvm/codegen_stackvm.h} (95%) rename src/{codegen/stack_vm/stack_vm.cc => runtime/stackvm/stackvm.cc} (90%) rename src/{codegen/stack_vm/stack_vm.h => runtime/stackvm/stackvm.h} (89%) create mode 100644 src/runtime/stackvm/stackvm_module.cc create mode 100644 src/runtime/stackvm/stackvm_module.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 39776d53d1f1..572f4aef1432 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ tvm_option(USE_ROCM "Build with ROCM" OFF) tvm_option(ROCM_PATH "The path to rocm" /opt/rocm) tvm_option(USE_RPC "Build with RPC" ON) tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF) +tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF) tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON) tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF) tvm_option(USE_RTTI "Build with RTTI" ON) @@ -97,7 +98,6 @@ file(GLOB COMPILER_SRCS src/arithmetic/*.cc src/autotvm/*.cc src/codegen/*.cc - src/codegen/stack_vm/*.cc src/lang/*.cc src/pass/*.cc src/op/*.cc @@ -135,6 +135,16 @@ if(USE_RPC) list(APPEND RUNTIME_SRCS ${RUNTIME_RPC_SRCS}) 
endif(USE_RPC) +file(GLOB STACKVM_RUNTIME_SRCS src/runtime/stackvm/*.cc) +file(GLOB STACKVM_CODEGEN_SRCS src/codegen/stackvm/*.cc) +list(APPEND COMPILER_SRCS ${STACKVM_CODEGEN_SRCS}) +if(USE_STACKVM_RUNTIME) + message(STATUS "Build with stackvm support in runtime...") + list(APPEND RUNTIME_SRCS ${STACKVM_RUNTIME_SRCS}) +else() + list(APPEND COMPILER_SRCS ${STACKVM_RUNTIME_SRCS}) +endif(USE_STACKVM_RUNTIME) + if(USE_GRAPH_RUNTIME) message(STATUS "Build with Graph runtime support...") file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc) diff --git a/Jenkinsfile b/Jenkinsfile index bec0d2be5df8..2ecf3c59f8aa 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -96,6 +96,7 @@ stage('Build') { echo set\\(USE_RPC ON\\) >> config.cmake echo set\\(USE_SORT ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake + echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake echo set\\(USE_BLAS openblas\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake diff --git a/cmake/config.cmake b/cmake/config.cmake index 85c5102169a9..c364a88cce11 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -65,6 +65,9 @@ set(USE_OPENGL OFF) # Whether enable RPC runtime set(USE_RPC ON) +# Whether embed stackvm into the runtime +set(USE_STACKVM_RUNTIME OFF) + # Whether enable tiny embedded graph runtime. set(USE_GRAPH_RUNTIME ON) diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 9ea16131188d..646824332902 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -12,6 +12,7 @@ #include #include "./base.h" #include "./expr.h" +#include "./runtime/util.h" namespace tvm { namespace ir { @@ -449,25 +450,6 @@ constexpr const char* tvm_global_barrier_kinit = "tvm_global_barrier_kinit"; */ constexpr const char* tvm_thread_allreduce = "tvm_thread_allreduce"; -/*! 
\brief The kind of structure field info */ -enum TVMStructFieldKind : int { - // array head address - kArrAddr, - kArrData, - kArrShape, - kArrStrides, - kArrNDim, - kArrTypeCode, - kArrTypeBits, - kArrTypeLanes, - kArrByteOffset, - kArrDeviceId, - kArrDeviceType, - kArrKindBound_, - // TVMValue field - kTVMValueContent, - kTVMValueKindBound_ -}; } // namespace intrinsic // Reuse IR node defintiion from HalideIR diff --git a/include/tvm/runtime/util.h b/include/tvm/runtime/util.h index 160642ffcc85..7fa62be912be 100644 --- a/include/tvm/runtime/util.h +++ b/include/tvm/runtime/util.h @@ -21,7 +21,33 @@ namespace runtime { inline bool TypeMatch(TVMType t, int code, int bits, int lanes = 1) { return t.code == code && t.bits == bits && t.lanes == lanes; } - } // namespace runtime } // namespace tvm +// Forward declare the intrinsic id we need +// in structure fetch to enable stackvm in runtime +namespace tvm { +namespace ir { +namespace intrinsic { +/*! \brief The kind of structure field info used in intrinsic */ +enum TVMStructFieldKind : int { + // array head address + kArrAddr, + kArrData, + kArrShape, + kArrStrides, + kArrNDim, + kArrTypeCode, + kArrTypeBits, + kArrTypeLanes, + kArrByteOffset, + kArrDeviceId, + kArrDeviceType, + kArrKindBound_, + // TVMValue field + kTVMValueContent, + kTVMValueKindBound_ +}; +} // namespace intrinsic +} // namespace ir +} // namespace tvm #endif // TVM_RUNTIME_UTIL_H_ diff --git a/python/tvm/module.py b/python/tvm/module.py index 1b83c9b26243..6cca6fb0f722 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -90,9 +90,12 @@ def export_library(self, kwargs : dict, optiona; Additional arguments passed to fcompile """ - if self.type_key == "stacktvm": - raise ValueError("Module[%s]: export_library requires llvm module," - " did you build with LLVM enabled?" % self.type_key) + if self.type_key == "stackvm": + if not file_name.endswith(".stackvm"): + raise ValueError("Module[%s]: can only be saved as stackvm format." 
+ "did you build with LLVM enabled?" % self.type_key) + self.save(file_name) + return if self.type_key != "llvm": raise ValueError("Module[%s]: Only llvm support export shared" % self.type_key) diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc index 8bc7d238a866..12570e5881a9 100644 --- a/src/codegen/codegen.cc +++ b/src/codegen/codegen.cc @@ -40,7 +40,6 @@ std::string PackImportsToC(const runtime::Module& mod, bool system_lib) { CHECK_EQ(im->imports().size(), 0U) << "Only support simply one-level hierarchy"; std::string tkey = im->type_key(); - std::string bin; stream->Write(tkey); im->SaveToBinary(stream); } diff --git a/src/codegen/stack_vm/stack_vm_module.cc b/src/codegen/stack_vm/stack_vm_module.cc deleted file mode 100644 index 731663deb448..000000000000 --- a/src/codegen/stack_vm/stack_vm_module.cc +++ /dev/null @@ -1,71 +0,0 @@ -/*! - * Copyright (c) 2017 by Contributors - * \file stack_vm_module.cc - */ -#include -#include -#include -#include "./codegen_stack_vm.h" - -namespace tvm { -namespace codegen { - -class StackVMModuleNode : public runtime::ModuleNode { - public: - const char* type_key() const { - return "stackvm"; - } - - PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) final { - if (name == runtime::symbol::tvm_module_main) { - return GetFunction(entry_func_, sptr_to_self); - } - auto it = fmap_.find(name); - if (it == fmap_.end()) return PackedFunc(); - const StackVM& vm = it->second; - // capture sptr_to_self to keep module node alive. 
- return PackedFunc([vm, sptr_to_self](TVMArgs args, TVMRetValue* rv) { - vm(args); - }); - } - - std::string GetSource(const std::string& format) final { - std::ostringstream os; - for (const auto& kv : fmap_) { - os << "Function: " << kv.first << '\n'; - os << kv.second; - } - return os.str(); - } - - static runtime::Module Build(const Array& funcs) { - CHECK_NE(funcs.size(), 0U); - std::shared_ptr n = - std::make_shared(); - for (LoweredFunc f : funcs) { - StackVM vm = codegen::CodeGenStackVM().Compile(f); - CHECK(!n->fmap_.count(f->name)) - << "Function name " << f->name << "already exist in list"; - vm.mod_ctx = n.get(); - n->fmap_[f->name] = std::move(vm); - } - n->entry_func_ = funcs[0]->name; - return runtime::Module(n); - } - - private: - // entry function. - std::string entry_func_; - // internal function map - std::unordered_map fmap_; -}; - -TVM_REGISTER_API("codegen.build_stackvm") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = StackVMModuleNode::Build(args[0]); - }); - -} // namespace codegen -} // namespace tvm diff --git a/src/codegen/stack_vm/codegen_stack_vm.cc b/src/codegen/stackvm/codegen_stackvm.cc similarity index 95% rename from src/codegen/stack_vm/codegen_stack_vm.cc rename to src/codegen/stackvm/codegen_stackvm.cc index 168e411fa6e2..517793ff14a3 100644 --- a/src/codegen/stack_vm/codegen_stack_vm.cc +++ b/src/codegen/stackvm/codegen_stackvm.cc @@ -1,11 +1,12 @@ /*! 
* Copyright (c) 2017 by Contributors - * \file codegen_stack_vm.cc + * \file codegen_stackvm.cc */ #include #include #include -#include "./codegen_stack_vm.h" +#include "./codegen_stackvm.h" +#include "../../runtime/stackvm/stackvm_module.h" namespace tvm { namespace codegen { @@ -19,6 +20,7 @@ StackVM CodeGenStackVM::Compile(LoweredFunc f) { CHECK_EQ(static_cast(vid), i); } this->Push(f->body); + vm_.InitCache(); return std::move(vm_); } @@ -486,5 +488,22 @@ void CodeGenStackVM::VisitExpr_(const Let *op) { this->PushOp(StackVM::STORE_HEAP, static_cast(vid)); this->Push(op->body); } + +runtime::Module BuildStackVM(const Array& funcs) { + CHECK_NE(funcs.size(), 0U); + std::unordered_map fmap; + for (LoweredFunc f : funcs) { + StackVM vm = codegen::CodeGenStackVM().Compile(f); + CHECK(!fmap.count(f->name)) + << "Function name " << f->name << "already exist in list"; + fmap[f->name] = std::move(vm); + } + return runtime::StackVMModuleCreate(fmap, funcs[0]->name); +} + +TVM_REGISTER_API("codegen.build_stackvm") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = BuildStackVM(args[0]); + }); } // namespace codegen } // namespace tvm diff --git a/src/codegen/stack_vm/codegen_stack_vm.h b/src/codegen/stackvm/codegen_stackvm.h similarity index 95% rename from src/codegen/stack_vm/codegen_stack_vm.h rename to src/codegen/stackvm/codegen_stackvm.h index 089284529242..23bd61dcb4c2 100644 --- a/src/codegen/stack_vm/codegen_stack_vm.h +++ b/src/codegen/stackvm/codegen_stackvm.h @@ -3,8 +3,8 @@ * \file codegen_stack_vm.h * \brief Codegen into Simple Stack VM. */ -#ifndef TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_ -#define TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_ +#ifndef TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_ +#define TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_ #include #include @@ -14,12 +14,14 @@ #include #include -#include "./stack_vm.h" +#include "../../runtime/stackvm/stackvm.h" namespace tvm { namespace codegen { using namespace ir; +using runtime::StackVM; + /*! 
* \brief A base class to generate a stack VM. * This module is used to generate host wrapper @@ -145,4 +147,4 @@ class CodeGenStackVM } // namespace codegen } // namespace tvm -#endif // TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_ +#endif // TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_ diff --git a/src/codegen/stack_vm/stack_vm.cc b/src/runtime/stackvm/stackvm.cc similarity index 90% rename from src/codegen/stack_vm/stack_vm.cc rename to src/runtime/stackvm/stackvm.cc index 95feeae3679e..f86bfec087e4 100644 --- a/src/codegen/stack_vm/stack_vm.cc +++ b/src/runtime/stackvm/stackvm.cc @@ -1,15 +1,16 @@ /*! * Copyright (c) 2017 by Contributors * Implementation stack VM. - * \file stack_vm.cc + * \file stackvm.cc */ #include -#include +#include #include -#include "./stack_vm.h" +#include +#include "./stackvm.h" namespace tvm { -namespace codegen { +namespace runtime { typedef dmlc::ThreadLocalStore StackVMStateStore; @@ -172,28 +173,64 @@ std::ostream& operator<<(std::ostream& os, const StackVM& vm) { // NOLINT(*) return os; } -void StackVM::operator()(const runtime::TVMArgs& args) const { +void StackVM::Run(const runtime::TVMArgs& args, + runtime::ModuleNode* mod_ctx) const { StackVM::State* s = StackVM::ThreadLocalState(); + if (s->heap.size() < heap_size) { + s->heap.resize(heap_size); + } s->sp = 0; s->pc = 0; - if (s->heap.size() < this->heap_size) { - s->heap.resize(this->heap_size); - } - + s->mod_ctx = mod_ctx; s->heap[0].v_handle = (void*)args.values; // NOLINT(*) s->heap[1].v_handle = (void*)args.type_codes; // NOLINT(*) s->heap[2].v_int64 = args.num_args; this->Run(s); } +void StackVM::InitCache() { + extern_func_cache_.clear(); + extern_func_cache_.resize( + extern_func_name.size(), PackedFunc(nullptr)); +} + +void StackVM::Save(dmlc::Stream* strm) const { + // to be endian invariant. 
+ std::vector code_copy(code.size()); + std::transform(code.begin(), code.end(), code_copy.begin(), [](Code c) { + return c.v_int; + }); + strm->Write(code_copy); + strm->Write(str_data); + strm->Write(extern_func_name); + strm->Write(heap_id_name); + strm->Write(heap_size); + strm->Write(stack_size); +} + +bool StackVM::Load(dmlc::Stream* strm) { + // to be endian invariant. + std::vector code_copy; + if (!strm->Read(&code_copy)) return false; + code.resize(code_copy.size()); + std::transform(code_copy.begin(), code_copy.end(), code.begin(), [](int v) { + Code code; code.v_int = v; return code; + }); + if (!strm->Read(&str_data)) return false; + if (!strm->Read(&extern_func_name)) return false; + if (!strm->Read(&heap_id_name)) return false; + if (!strm->Read(&heap_size)) return false; + if (!strm->Read(&stack_size)) return false; + this->InitCache(); + return true; +} + void StackVM::Run(State* s) const { int64_t sp = s->sp; int64_t pc = s->pc; int64_t alloca_sp = s->sp; std::vector& stack = s->stack; std::vector& heap = s->heap; - s->extern_func.clear(); - s->extern_func.resize(extern_func_name.size()); if (stack.size() < stack_size) { stack.resize(stack_size); } @@ -488,17 +525,19 @@ void StackVM::Run(State* s) const { } const PackedFunc& StackVM::GetExtern(State* s, int fid) const { - PackedFunc& f = s->extern_func[fid]; + CHECK_LT(static_cast(fid), extern_func_cache_.size()); + // allow race write in this, since write is idempotent + PackedFunc& f = extern_func_cache_[fid]; if (f == nullptr) { - CHECK(mod_ctx != nullptr) + CHECK(s->mod_ctx != nullptr) << "No local context is set in stackvm"; - const PackedFunc* pf = mod_ctx->GetFuncFromEnv(extern_func_name[fid]); + CHECK(s->mod_ctx != nullptr); + const PackedFunc* pf = s->mod_ctx->GetFuncFromEnv(extern_func_name[fid]); CHECK(pf != nullptr); f = *pf; - CHECK(f != nullptr); } return f; } -} // namespace codegen +} // namespace runtime } // namespace tvm diff --git a/src/codegen/stack_vm/stack_vm.h 
b/src/runtime/stackvm/stackvm.h similarity index 89% rename from src/codegen/stack_vm/stack_vm.h rename to src/runtime/stackvm/stackvm.h index 54972d39a5df..b2ce975b2c73 100644 --- a/src/codegen/stack_vm/stack_vm.h +++ b/src/runtime/stackvm/stackvm.h @@ -1,36 +1,36 @@ /*! * Copyright (c) 2016 by Contributors - * \file stack_vm.h + * \file stackvm.h * \brief A simple stack-based virtual machine. * * This can be used to interepret host side code * to setup calls into device functions * when only Runtime compilation for device is available(via NVRTC or OpenCL). */ -#ifndef TVM_CODEGEN_STACK_VM_STACK_VM_H_ -#define TVM_CODEGEN_STACK_VM_STACK_VM_H_ +#ifndef TVM_RUNTIME_STACKVM_STACKVM_H_ +#define TVM_RUNTIME_STACKVM_STACKVM_H_ #include #include #include -#include #include #include namespace tvm { -namespace codegen { +namespace runtime { using runtime::operator<<; /*! - * \brief A simple stack-based virtual machine. + * \brief A simple stack-based virtual machine program. */ class StackVM { public: /*! - * \brief Invoke the StackVM as PackedFunc + * \brief Invoke the StackVM program. * \param args The arguments to the StackVM. + * \param mod_ctx The module context used in running. */ - void operator()(const TVMArgs& args) const; + void Run(const TVMArgs& args, runtime::ModuleNode* mod_ctx) const; /*! * \brief The opcode of stack vm * \note Notation @@ -276,21 +276,25 @@ class StackVM { std::vector stack; /*! \brief The global heap space */ std::vector heap; - /*! \brief extern functions */ - std::vector extern_func; /*! \brief stack pointer */ int64_t sp{0}; /*! \brief program counter */ int64_t pc{0}; + /*! \brief The current module context of stackvm */ + runtime::ModuleNode* mod_ctx{nullptr}; }; - /*! \brief The external function entries. */ - struct ExternFuncEntry { - std::string name; - runtime::PackedFunc func; - }; - - /*! \brief execute the stack vm with given state */ - void Run(State* state) const; + /*! 
\brief Initialize local cache*/ + void InitCache(); + /*! + * \brief Save stackvm program to an output stream + * \param strm The output stream + */ + void Save(dmlc::Stream* strm) const; + /*! + * \brief Load stackvm program from output stream + * \param strm The output stream + */ + bool Load(dmlc::Stream* strm); /*! * \brief Print instruction at location pc * \param os The ostream @@ -300,12 +304,11 @@ class StackVM { int64_t PrintCode(std::ostream&os, int64_t pc) const; // NOLINT(*) /*! \brief Get thread local state of the stack VM */ static State* ThreadLocalState(); + // The code below are programs /*! \brief The instructions */ std::vector code; /*! \brief constant error messages */ std::vector str_data; - /*! \brief The current module context of stackvm */ - runtime::ModuleNode* mod_ctx{nullptr}; /*! \brief Extern functions */ std::vector extern_func_name; /*! \brief name of each heap id */ @@ -385,10 +388,18 @@ class StackVM { friend std::ostream& operator<<(std::ostream& os, const StackVM& vm); // NOLINT(*) private: + // execute the stack vm with given state + void Run(State* state) const; // get extern function. const PackedFunc& GetExtern(State* s, int fid) const; + // cached extern function + mutable std::vector extern_func_cache_; }; -} // namespace codegen +} // namespace runtime } // namespace tvm -#endif // TVM_CODEGEN_STACK_VM_STACK_VM_H_ + +namespace dmlc { +DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::StackVM, true); +} +#endif // TVM_RUNTIME_STACKVM_STACKVM_H_ diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc new file mode 100644 index 000000000000..71ca9ba6c09a --- /dev/null +++ b/src/runtime/stackvm/stackvm_module.cc @@ -0,0 +1,128 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * \file stackvm_module.cc + */ +#include +#include +#include +#include "./stackvm_module.h" +#include "../file_util.h" +#include "../module_util.h" + +namespace tvm { +namespace runtime { + +class StackVMModuleNode : public runtime::ModuleNode { + public: + const char* type_key() const { + return "stackvm"; + } + + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final { + if (name == runtime::symbol::tvm_module_main) { + return GetFunction(entry_func_, sptr_to_self); + } + auto it = fmap_.find(name); + if (it == fmap_.end()) return PackedFunc(); + const StackVM& vm = it->second; + // capture sptr_to_self to keep module node alive. + return PackedFunc([vm, sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + vm.Run(args, this); + }); + } + + std::string GetSource(const std::string& format) final { + std::ostringstream os; + for (const auto& kv : fmap_) { + os << "Function: " << kv.first << '\n'; + os << kv.second; + } + return os.str(); + } + + void SaveToFile(const std::string& file_name, + const std::string& format) final { + std::string data, mblob; + dmlc::MemoryStringStream writer(&data); + dmlc::Stream* strm = &writer; + strm->Write(fmap_); + strm->Write(entry_func_); + // also save imports + uint64_t num_imports = static_cast(imports_.size()); + strm->Write(num_imports); + + for (runtime::Module im : imports_) { + CHECK_EQ(im->imports().size(), 0U) + << "Only support simply one-level hierarchy"; + std::string tkey = im->type_key(); + strm->Write(tkey); + LOG(INFO) << "save " << tkey; + im->SaveToBinary(strm); + LOG(INFO) << "FInish save " << tkey; + } + SaveBinaryToFile(file_name, data); + } + + static Module Create(std::unordered_map fmap, + std::string entry_func) { + std::shared_ptr n = + std::make_shared(); + n->fmap_ = std::move(fmap); + n->entry_func_ = std::move(entry_func); + return Module(n); + } + + static Module Load(dmlc::Stream* strm) { + std::unordered_map fmap; 
+ std::string entry_func, data; + strm->Read(&fmap); + strm->Read(&entry_func); + std::shared_ptr n = + std::make_shared(); + n->fmap_ = std::move(fmap); + n->entry_func_ = std::move(entry_func); + uint64_t num_imports; + strm->Read(&num_imports); + for (uint64_t i = 0; i < num_imports; ++i) { + std::string tkey; + CHECK(strm->Read(&tkey)); + std::string fkey = "module.loadbinary_" + tkey; + const PackedFunc* f = Registry::Get(fkey); + CHECK(f != nullptr) + << "Loader of " << tkey << "(" + << fkey << ") is not presented."; + Module m = (*f)(static_cast(strm)); + n->imports_.emplace_back(std::move(m)); + } + return Module(n); + } + + static Module LoadFromFile(std::string file_name, + std::string format) { + std::string data; + LoadBinaryFromFile(file_name, &data); + dmlc::MemoryStringStream reader(&data); + return Load(&reader); + } + + private: + // internal function map + std::unordered_map fmap_; + // entry function. + std::string entry_func_; +}; + +Module StackVMModuleCreate(std::unordered_map fmap, + std::string entry_func) { + return StackVMModuleNode::Create(fmap, entry_func); +} + +TVM_REGISTER_GLOBAL("module.loadfile_stackvm") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = StackVMModuleNode::LoadFromFile(args[0], args[1]); + }); + +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/stackvm/stackvm_module.h b/src/runtime/stackvm/stackvm_module.h new file mode 100644 index 000000000000..fcd51a64f870 --- /dev/null +++ b/src/runtime/stackvm/stackvm_module.h @@ -0,0 +1,27 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file stackvm_module.h + * \brief StackVM module + */ +#ifndef TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_ +#define TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_ + +#include +#include +#include "./stackvm.h" + +namespace tvm { +namespace runtime { +/*! + * \brief create a stackvm module + * + * \param fmap The map from name to function + * \param entry_func The entry function name. 
+ * \return The created module + */ +Module StackVMModuleCreate(std::unordered_map fmap, + std::string entry_func); + +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_ diff --git a/tests/python/unittest/test_module_load.py b/tests/python/unittest/test_module_load.py index 1b239a357f66..8ee3ea5e06c0 100644 --- a/tests/python/unittest/test_module_load.py +++ b/tests/python/unittest/test_module_load.py @@ -109,11 +109,25 @@ def check_device(device): f2[name](a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) - check_device("cuda") - check_device("vulkan") - check_device("opencl") - check_device("metal") + def check_stackvm(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + temp = util.tempdir() + name = "myadd_%s" % device + f = tvm.build(s, [A, B], device, "stackvm", name=name) + path_dso = temp.relpath("dev_lib.stackvm") + #f.export_library(path_dso) + #f1 = tvm.module.load(path_dso) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + f(a, b) + np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) + for device in ["cuda", "vulkan", "opencl", "metal"]: + check_device(device) + check_stackvm(device) def test_combine_module_llvm(): """Test combine multiple module into one shared lib.""" From 5e5aec0a310de0646cd7c76c930456aeaad81f0e Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 11 Aug 2018 15:31:15 -0700 Subject: [PATCH 009/529] [DLPACK] Enable cython support (#1589) --- HalideIR | 2 +- include/tvm/runtime/c_runtime_api.h | 2 +- python/tvm/_ffi/_ctypes/ndarray.py | 49 ++++++++++++++++++++++++++++- python/tvm/_ffi/_cython/base.pxi | 13 +++++++- python/tvm/_ffi/_cython/ndarray.pxi | 38 ++++++++++++++++++++++ python/tvm/_ffi/ndarray.py | 48 +++------------------------- tests/scripts/task_python_nnvm.sh | 4 +++ tests/scripts/task_python_topi.sh | 4 +++ 8 files changed, 
113 insertions(+), 47 deletions(-) diff --git a/HalideIR b/HalideIR index a5a80bdc8232..a0b9563f4571 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit a5a80bdc8232c9dbfe508bb5c46e8f58cdf7ec20 +Subproject commit a0b9563f45719553adf4d39fe3c14db1af0e1f40 diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index dca0d5ed4a30..32d574340052 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -467,7 +467,7 @@ TVM_DLL int TVMArrayToDLPack(TVMArrayHandle from, /*! * \brief Delete (free) a DLManagedTensor's data. - * \param dltensor Pointer to the DLManagedTensor. + * \param dltensor Pointer to the DLManagedTensor. */ TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor); diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py index df877679fc7d..8b88e7dc98ea 100644 --- a/python/tvm/_ffi/_ctypes/ndarray.py +++ b/python/tvm/_ffi/_ctypes/ndarray.py @@ -1,11 +1,47 @@ +# pylint: disable=invalid-name """Runtime NDArray api""" from __future__ import absolute_import import ctypes -from ..base import _LIB, check_call +from ..base import _LIB, check_call, c_str from ..runtime_ctypes import TVMArrayHandle from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _return_handle + +TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p) +_c_str_dltensor = c_str('dltensor') +_c_str_used_dltensor = c_str('used_dltensor') + + +# used for PyCapsule manipulation +if hasattr(ctypes, 'pythonapi'): + ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p + ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p + ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object + + +def _from_dlpack(dltensor): + dltensor = ctypes.py_object(dltensor) + if ctypes.pythonapi.PyCapsule_IsValid(dltensor, _c_str_dltensor): + ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, _c_str_dltensor) + handle = TVMArrayHandle() + 
check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle))) + ctypes.pythonapi.PyCapsule_SetName(dltensor, _c_str_used_dltensor) + ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) + return _make_array(handle, False) + raise ValueError("Expect a dltensor field, PyCapsule can only be consumed once") + + +def _dlpack_deleter(pycapsule): + pycapsule = ctypes.cast(pycapsule, ctypes.py_object) + if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor): + ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor) + _LIB.TVMDLManagedTensorCallDeleter(ptr) + ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) + +_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter) + + class NDArrayBase(object): """A simple Device/CPU Array object in runtime.""" __slots__ = ["handle", "is_view"] @@ -29,6 +65,17 @@ def __del__(self): def _tvm_handle(self): return ctypes.cast(self.handle, ctypes.c_void_p).value + def to_dlpack(self): + """Produce an array from a DLPack Tensor without copying memory + + Returns + ------- + dlpack : DLPack tensor view of the array data + """ + handle = ctypes.c_void_p() + check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle))) + return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter) + def _make_array(handle, is_view): handle = ctypes.cast(handle, TVMArrayHandle) diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index 50a99245f793..00173c431bb7 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -1,6 +1,7 @@ from ..base import TVMError from libcpp.vector cimport vector from cpython.version cimport PY_MAJOR_VERSION +from cpython cimport pycapsule from libc.stdint cimport int64_t, uint64_t, uint8_t, uint16_t import ctypes @@ -40,6 +41,11 @@ cdef extern from "tvm/runtime/c_runtime_api.h": int64_t* strides uint64_t byte_offset + ctypedef struct DLManagedTensor: + DLTensor 
dl_tensor + void* manager_ctx + void (*deleter)(DLManagedTensor* self) + ctypedef struct TVMValue: int64_t v_int64 double v_float64 @@ -49,7 +55,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h": DLContext v_ctx ctypedef int64_t tvm_index_t -ctypedef void* DLTensorHandle +ctypedef DLTensor* DLTensorHandle ctypedef void* TVMStreamHandle ctypedef void* TVMRetValueHandle ctypedef void* TVMFunctionHandle @@ -92,6 +98,11 @@ cdef extern from "tvm/runtime/c_runtime_api.h": int TVMArrayCopyFromTo(DLTensorHandle src, DLTensorHandle to, TVMStreamHandle stream) + int TVMArrayFromDLPack(DLManagedTensor* arr_from, + DLTensorHandle* out) + int TVMArrayToDLPack(DLTensorHandle arr_from, + DLManagedTensor** out) + void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor) cdef extern from "tvm/c_dsl_api.h": int TVMNodeFree(NodeHandle handle) diff --git a/python/tvm/_ffi/_cython/ndarray.pxi b/python/tvm/_ffi/_cython/ndarray.pxi index 44b0a544609d..0a507affec1c 100644 --- a/python/tvm/_ffi/_cython/ndarray.pxi +++ b/python/tvm/_ffi/_cython/ndarray.pxi @@ -1,5 +1,29 @@ from ..runtime_ctypes import TVMArrayHandle +cdef const char* _c_str_dltensor = "dltensor" +cdef const char* _c_str_used_dltensor = "used_dltensor" + + +cdef void _c_dlpack_deleter(object pycaps): + cdef DLManagedTensor* dltensor + if pycapsule.PyCapsule_IsValid(pycaps, _c_str_dltensor): + dltensor = pycapsule.PyCapsule_GetPointer(pycaps, _c_str_dltensor) + TVMDLManagedTensorCallDeleter(dltensor) + + +def _from_dlpack(object dltensor): + cdef DLManagedTensor* ptr + cdef DLTensorHandle chandle + if pycapsule.PyCapsule_IsValid(dltensor, _c_str_dltensor): + ptr = pycapsule.PyCapsule_GetPointer(dltensor, _c_str_dltensor) + CALL(TVMArrayFromDLPack(ptr, &chandle)) + # set name and destructor to be empty + pycapsule.PyCapsule_SetDestructor(dltensor, NULL) + pycapsule.PyCapsule_SetName(dltensor, _c_str_used_dltensor) + return c_make_array(chandle, 0) + raise ValueError("Expect a dltensor field, pycapsule.PyCapsule can only 
be consumed once") + + cdef class NDArrayBase: cdef DLTensor* chandle cdef int c_is_view @@ -35,12 +59,26 @@ cdef class NDArrayBase: if self.c_is_view == 0: CALL(TVMArrayFree(self.chandle)) + def to_dlpack(self): + """Produce an array from a DLPack Tensor without copying memory + + Returns + ------- + dlpack : DLPack tensor view of the array data + """ + cdef DLManagedTensor* dltensor + if self.c_is_view != 0: + raise ValueError("to_dlpack do not work with memory views") + CALL(TVMArrayToDLPack(self.chandle, &dltensor)) + return pycapsule.PyCapsule_New(dltensor, _c_str_dltensor, _c_dlpack_deleter) + cdef c_make_array(void* chandle, is_view): ret = _CLASS_NDARRAY(None, is_view) (ret).chandle = chandle return ret + cdef _TVM_COMPATS = () cdef _TVM_EXT_RET = {} diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py index d994d7c2e4a5..e49c3b62f473 100644 --- a/python/tvm/_ffi/ndarray.py +++ b/python/tvm/_ffi/ndarray.py @@ -17,28 +17,17 @@ if _FFI_MODE == "ctypes": raise ImportError() if sys.version_info >= (3, 0): - from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array + from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack from ._cy3.core import NDArrayBase as _NDArrayBase else: - from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array + from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack from ._cy2.core import NDArrayBase as _NDArrayBase except IMPORT_EXCEPT: # pylint: disable=wrong-import-position - from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array + from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack from ._ctypes.ndarray import NDArrayBase as _NDArrayBase -TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p) -_c_str_dltensor = c_str('dltensor') - - -# used for PyCapsule manipulation -if hasattr(ctypes, 'pythonapi'): - ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p 
- ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p - ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object - - def context(dev_type, dev_id=0): """Construct a TVM context with given device type and id. @@ -134,30 +123,14 @@ def from_dlpack(dltensor): Parameters ---------- dltensor : DLPack tensor + Input DLManagedTensor, can only be consumed once. Returns ------- arr: tvm.nd.NDArray The array view of the tensor data. """ - dltensor = ctypes.py_object(dltensor) - name = ctypes.pythonapi.PyCapsule_GetName(dltensor) - ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, name) - handle = TVMArrayHandle() - check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle))) - ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, None) - return _make_array(handle, False) - - -def _dlpack_deleter(pycapsule): - pycapsule = ctypes.py_object(pycapsule) - if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor): - ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor) - _LIB.TVMDLManagedTensorCallDeleter(ptr) - ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) - - -_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter) + return _from_dlpack(dltensor) class NDArrayBase(_NDArrayBase): @@ -308,17 +281,6 @@ def copyto(self, target): raise ValueError("Unsupported target type %s" % str(type(target))) return target - def to_dlpack(self): - """Produce an array from a DLPack Tensor without copying memory - - Returns - ------- - dlpack : DLPack tensor view of the array data - """ - handle = ctypes.c_void_p() - check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle))) - return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter) - def free_extension_handle(handle, type_code): """Free c++ extension type handle diff --git a/tests/scripts/task_python_nnvm.sh b/tests/scripts/task_python_nnvm.sh index 790073a2fe8b..cf6039d58416 100755 --- a/tests/scripts/task_python_nnvm.sh +++ 
b/tests/scripts/task_python_nnvm.sh @@ -4,6 +4,10 @@ export PYTHONPATH=nnvm/python:python:topi/python # to avoid openblas threading error export OMP_NUM_THREADS=1 +# Rebuild cython +make cython || exit -1 +make cython3 || exit -1 + echo "Running unittest..." python -m nose -v nnvm/tests/python/unittest || exit -1 python3 -m nose -v nnvm/tests/python/unittest || exit -1 diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh index 13a324d79b1f..6842ddaae13a 100755 --- a/tests/scripts/task_python_topi.sh +++ b/tests/scripts/task_python_topi.sh @@ -1,4 +1,8 @@ export PYTHONPATH=python:topi/python +# Rebuild cython +make cython || exit -1 +make cython3 || exit -1 + python -m nose -v topi/tests/python || exit -1 python3 -m nose -v topi/tests/python || exit -1 From 0cdc7b3f2421378746f6a68b8db4ddf0f18ebf01 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 13 Aug 2018 09:35:46 -0700 Subject: [PATCH 010/529] Fixed bugs for SSD sorting and multbox detection (#1578) --- topi/python/topi/cuda/nms.py | 480 ++++++++++++++++++++------ topi/python/topi/cuda/ssd/multibox.py | 225 ++++++++---- 2 files changed, 534 insertions(+), 171 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 4d4e402de5c2..361208bf1cfb 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -7,19 +7,155 @@ from topi.vision import nms -def sort_ir(data, index, output, axis, is_descend): - """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU. +def sort_pre_ir(index, sizes_out, axis_mul_before, axis_mul_after): + """Low level IR routing subfunction 1/4 for computing segments' staring locatons. + + Parameters + ---------- + index : Buffer + Buffer of number of valid output boxes. + + sizes_out : Buffer + Output buffer of start locations of each sorting segment. + + axis_mul_before : int + The multiplication result of axis dimensions before axis. 
+ + axis_mul_after : int + The multiplication result of axis dimensions after axis. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib = tvm.ir_builder.create() + p_index = ib.buffer_ptr(index) + dshape = sizes_out.shape + sizes = ib.buffer_ptr(sizes_out) + nthread_tx = max_threads + nthread_bx = dshape[0] // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + sizes[tid] = p_index[tid] + + # scan + with ib.if_scope(tid < 1): + with ib.for_range(0, axis_mul_before * axis_mul_after - 1, name="k") as k: + sizes[k + 1] += sizes[k] + body = ib.get() + return body + + +def sort_pre_ir_data(data, index, sizes_in, data_out, index_out, \ + axis, axis_mul_before, axis_mul_after): + """Low level IR routing subfunction 2/4 for flattening data and indices into segmented format. Parameters ---------- data: Buffer - 2D Buffer of input boxes' score with shape [batch_size, num_anchors]. + Buffer of output boxes with class and score. index : Buffer - Buffer of number of valid number of boxes. + Buffer of number of valid output boxes. - output : Buffer - Output buffer of indicies of sorted tensor. + sizes_in : Buffer + Buffer of start locations of each sorting segment. + + data_out : Buffer + Buffer of flattened segmented data. + + index_out : Buffer + Buffer of flattened segmented indices. + + axis : int + The axis used for sorting. + + axis_mul_before : int + The multiplication result of axis dimensions before axis. + + axis_mul_after : int + The multiplication result of axis dimensions after axis. + + Returns + ------- + stmt : Stmt + The result IR statement. 
+ """ + ib = tvm.ir_builder.create() + sizes = ib.buffer_ptr(sizes_in) + p_index = ib.buffer_ptr(index) + p_data = ib.buffer_ptr(data) + data_new = ib.buffer_ptr(data_out) + index_new = ib.buffer_ptr(index_out) + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + dshape = tvm.max(sizes_in.shape[0], p_index[0]) + nthread_tx = max_threads + nthread_bx = dshape // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(axis_mul_before * axis_mul_after > 1): + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + i = tid / axis_mul_after + j = tid % axis_mul_after + current_sort_num = p_index[tid] + base_idx = i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, current_sort_num, name="k") as k: + full_idx = base_idx + k * axis_mul_after + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + index_new[start + k] = k + data_new[start + k] = p_data[full_idx] + with ib.else_scope(): + with ib.if_scope(tid == 0): + with ib.for_range(0, p_index[0], name="k") as k: + index_new[k] = k + + body = ib.get() + return body + +def sort_oet_ir(data, index, new_data, new_index, loc, out_index, axis_mul_before, \ + axis_mul_after, axis, is_descend): + """Low level IR routing subfunction 3/4 for Odd-Even-Transposition sorting. + + Parameters + ---------- + data: Buffer + Buffer of output boxes with class and score. + + index : Buffer + Buffer of number of valid output boxes. + + new_data : Buffer + Buffer of flattened segmented data. + + new_index : Buffer + Buffer of flattened segmented indices. + + loc : Buffer + Buffer of start locations of each sorting segment. + + out_index : Buffer + Output buffer of output box indexes sorted by score in a flattened segmented format. 
+ + axis_mul_before : int + The multiplication result of axis dimensions before axis. + + axis_mul_after : int + The multiplication result of axis dimensions after axis. axis : int The axis used for sorting. @@ -32,15 +168,197 @@ def sort_ir(data, index, output, axis, is_descend): stmt : Stmt The result IR statement. """ - max_threads = int( tvm.target.current_target(allow_none=False).max_num_threads) tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") ib = tvm.ir_builder.create() + dshape = loc.shape + fshape = data.shape[axis] * dshape[0] + temp_data = ib.allocate( + "float32", dshape, name="temp_data", scope="local") p_data = ib.buffer_ptr(data) p_index = ib.buffer_ptr(index) + data_new = ib.buffer_ptr(new_data) + index_new = ib.buffer_ptr(new_index) + index_out = ib.buffer_ptr(out_index) + sizes = ib.buffer_ptr(loc) + nthread_tx = max_threads + nthread_bx = fshape // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + + with ib.if_scope(axis_mul_before * axis_mul_after > 1): + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + # OddEvenTransposeSort + with ib.for_range(0, p_index[tid], name="k") as k: + with ib.for_range(0, p_index[tid] - 1, name="i") as i: + with ib.if_scope(i % 2 == k % 2): + with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) == is_descend)): + temp_data[tid] = data_new[i+start] + data_new[i+start] = data_new[i+start+1] + data_new[i+start+1] = temp_data[tid] + index_out[tid] = index_new[i+start] + index_new[i+start] = index_new[i+start+1] + index_new[i+start+1] = index_out[tid] + with ib.if_scope(tid < 1): + with ib.for_range(0, sizes[dshape[0] - 1], name="i") as i: + index_out[i] = index_new[i] + with ib.else_scope(): + with ib.for_range(0, fshape, name="k", for_type="unroll") as k: + with ib.if_scope(tvm.all(k % 2 == tid % 
2, tid < fshape)): + with ib.if_scope(k % 2 == 0): + with ib.if_scope(tvm.all(tid + 1 < fshape, (p_data[tid] < p_data[tid+1]) \ + == is_descend)): + data_new[tid] = p_data[tid+1] + index_out[tid] = index_new[tid+1] + with ib.else_scope(): + data_new[tid] = p_data[tid] + index_out[tid] = index_new[tid] + with ib.else_scope(): + with ib.if_scope(tvm.all(tid + 1 < fshape, (data_new[tid] < data_new[tid+1]) \ + == is_descend)): + p_data[tid] = data_new[tid+1] + index_new[tid] = index_out[tid+1] + with ib.else_scope(): + p_data[tid] = data_new[tid] + index_new[tid] = index_out[tid] + with ib.if_scope(tvm.all(k % 2 != tid % 2, tid < fshape)): + with ib.if_scope(k % 2 == 0): + with ib.if_scope(tvm.all(tid > 0, (p_data[tid-1] < p_data[tid]) == is_descend)): + data_new[tid] = p_data[tid-1] + index_out[tid] = index_new[tid-1] + with ib.else_scope(): + data_new[tid] = p_data[tid] + index_out[tid] = index_new[tid] + with ib.else_scope(): + with ib.if_scope(tvm.all(tid > 0, (data_new[tid-1] < data_new[tid]) \ + == is_descend)): + p_data[tid] = data_new[tid-1] + index_new[tid] = index_out[tid-1] + with ib.else_scope(): + p_data[tid] = data_new[tid] + index_new[tid] = index_out[tid] + with ib.if_scope(fshape % 2 == 1): + with ib.if_scope(tid < 1): + with ib.for_range(0, fshape, name="k") as k: + index_out[tid] = index_new[tid] + body = ib.get() + return body + + +def sort_ir_out(data, index, new_index, loc, output, axis_mul_before, axis_mul_after, axis): + """Low level IR routing subfunction 4/4 for writing sorted indices to output format. + + Parameters + ---------- + data: Buffer + Buffer of output boxes with class and score. + + index : Buffer + Buffer of number of valid output boxes. + + new_index : Buffer + Buffer of sorted indices in a flatten format. + + loc : Buffer + Buffer of start locations of each sorting segment. + + output : Buffer + Output buffer of output box indexes sorted by score. 
+ + axis_mul_before : int + The multiplication result of axis dimensions before axis. + + axis_mul_after : int + The multiplication result of axis dimensions after axis. + + axis : int + The axis used for sorting. + + is_descend : bool + If the sorted data is in descending order. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib = tvm.ir_builder.create() + dshape = tvm.max(loc.shape[0], data.shape[axis]) + p_index = ib.buffer_ptr(index) + index_new = ib.buffer_ptr(new_index) + sizes = ib.buffer_ptr(loc) p_out = ib.buffer_ptr(output) + nthread_tx = max_threads + nthread_bx = dshape // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + + with ib.if_scope(axis_mul_before * axis_mul_after > 1): + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + i = tid / axis_mul_after + j = tid % axis_mul_after + base_idx = i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, data.shape[axis], name="k") as k: + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + p_out[base_idx + k * axis_mul_after] = tvm.select( + k < p_index[tid], index_new[k+start], k) + with ib.else_scope(): + with ib.if_scope(tid < data.shape[axis]): + p_out[tid] = tvm.select(tid < p_index[0], index_new[tid], tid) + + body = ib.get() + return body + + +def sort_gpu(data, data_buf, index, index_buf, output_buf, axis, is_descend): + """Function to generate low level IR to do sorting on the GPU, use it by calling sort_gpu. + + Parameters + ---------- + data: tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + The last dimension should be in format of + [class_id, score, box_left, box_top, box_right, box_bottom]. 
+ + data_buf: Buffer + 2D Buffer of input boxes' score with shape [batch_size, num_anchors]. + + index : tvm.Tensor + 1-D tensor for valid number of boxes. + + index_buf : Buffer + Buffer of number of valid number of boxes. + + output_buf : Buffer + Output buffer of indicies of sorted tensor. + + axis : int + The axis used for sorting. + + is_descend : bool + If the sorted data is in descending order. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors]. + """ + ndim = len(data.shape) assert data.dtype == "float32", "Currently only supports input dtype to be float32" assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim @@ -55,89 +373,60 @@ def sort_ir(data, index, output, axis, is_descend): elif i > axis: axis_mul_after *= data.shape[i] - dshape = 0 - for i in range(0, len(index.shape)): - dshape += index.shape[i] - dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape, - axis_mul_before*axis_mul_after) - - sizes_temp = ib.allocate( - "int32", dshape, name="sizes_temp", scope="global") - sizes = ib.allocate("int32", dshape, name="sizes", scope="global") - temp_index = ib.allocate("int32", dshape, name="temp_index", scope="local") - temp_data = ib.allocate("float32", dshape, name="temp_data", scope="local") - data_new = ib.allocate("float32", dshape, name="data_new", scope="global") - index_new = ib.allocate("int32", dshape, name="index_new", scope="global") - nthread_tx = max_threads - nthread_bx = dshape // max_threads + 1 - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) - tid = bx * max_threads + tx - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - sizes[tid] = p_index[tid] - sizes_temp[tid] = p_index[tid] - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after) \ - .astype("float32"))) + 1, name="k") as k: - with ib.if_scope(tid - (tvm.const(1, "int32") 
<< k) >= 0): - with ib.if_scope(k % 2 == 0): - sizes[tid] += sizes_temp[tid - ( - tvm.const(1, "int32") << k)] - sizes_temp[tid] = sizes[tid] - with ib.else_scope(): - sizes_temp[tid] += sizes[tid - ( - tvm.const(1, "int32") << k)] - sizes[tid] = sizes_temp[tid] - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - i = tid / axis_mul_after - j = tid % axis_mul_after - current_sort_num = p_index[tid] - base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, current_sort_num, name="k") as k: - full_idx = base_idx + k * axis_mul_after - with ib.if_scope(tid == 0): - start = 0 - with ib.else_scope(): - start = sizes[tid-1] - index_new[start + k] = k - data_new[start + k] = p_data[full_idx] - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - with ib.if_scope(tid == 0): - start = 0 - with ib.else_scope(): - start = sizes[tid-1] - # OddEvenTransposeSort - with ib.for_range(0, p_index[tid], name="k") as k: - with ib.for_range(0, p_index[tid] - 1, name="i") as i: - with ib.if_scope(i % 2 == (k & 1)): - with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^ - is_descend) == False): - temp_data[tid] = data_new[i+start] - data_new[i+start] = data_new[i+start+1] - data_new[i+start+1] = temp_data[tid] - temp_index[tid] = index_new[i+start] - index_new[i+start] = index_new[i+start+1] - index_new[i+start+1] = temp_index[tid] - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - i = tid / axis_mul_after - j = tid % axis_mul_after - current_sort_num = p_index[tid] - base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, data.shape[axis], name="k") as k: - with ib.if_scope(tid == 0): - start = 0 - with ib.else_scope(): - start = sizes[tid-1] - p_out[base_idx + k * axis_mul_after] = tvm.select( - k < current_sort_num, - index_new[k+start], k) - body = ib.get() - return body + dshape = axis_mul_before*axis_mul_after + fshape = data.shape[axis] * dshape + + loc_buf = api.decl_buffer(dshape, index.dtype, 
"sizes", data_alignment=8) + new_index_buf = api.decl_buffer( + fshape, index.dtype, "index_new", data_alignment=8) + out_index_buf = api.decl_buffer( + fshape, index.dtype, "index_out", data_alignment=8) + new_data_buf = api.decl_buffer( + dshape, data.dtype, "data_new", data_alignment=8) + + loc = \ + tvm.extern([(dshape,)], + [index], + lambda ins, outs: sort_pre_ir( + ins[0], outs[0], axis_mul_before, axis_mul_after), + dtype=[index.dtype], + in_buffers=index_buf, + out_buffers=[loc_buf], + tag="sorting_prepare") + + data_new, index_new = \ + tvm.extern([(dshape,), (fshape,)], + [data, index, loc], + lambda ins, outs: sort_pre_ir_data( + ins[0], ins[1], ins[2], outs[0], outs[1], axis, + axis_mul_before, axis_mul_after), + dtype=[data.dtype, index.dtype], + in_buffers=[data_buf, index_buf, loc_buf], + out_buffers=[new_data_buf, new_index_buf], + tag="sorting_data") + + index_out = \ + tvm.extern([(fshape,)], + [data, index, data_new, index_new, loc], + lambda ins, outs: sort_oet_ir( + ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], + axis_mul_before, axis_mul_after, axis, is_descend), + dtype=[index.dtype], + in_buffers=[data_buf, index_buf, + new_data_buf, new_index_buf, loc_buf], + out_buffers=[out_index_buf], + tag="sorting_oet") + out = \ + tvm.extern([data.shape], + [data, index, index_out, loc], + lambda ins, outs: sort_ir_out( + ins[0], ins[1], ins[2], ins[3], outs[0], + axis_mul_before, axis_mul_after, axis), + dtype=[index.dtype], + in_buffers=[data_buf, index_buf, out_index_buf, loc_buf], + out_buffers=output_buf, + tag="sorting_output") + return out def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): @@ -333,15 +622,8 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype, "sort_tensor_buf", data_alignment=8) - sort_tensor = \ - tvm.extern(score_shape, - [score_tensor, valid_count], - lambda ins, outs: sort_ir( - ins[0], 
ins[1], outs[0], score_axis, True), - dtype=sort_tensor_dtype, - in_buffers=[score_tensor_buf, valid_count_buf], - out_buffers=sort_tensor_buf, - name="nms_sort") + sort_tensor = sort_gpu(score_tensor, score_tensor_buf, valid_count, + valid_count_buf, sort_tensor_buf, score_axis, True) out = \ tvm.extern(data.shape, [data, sort_tensor, valid_count], diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index c22e7a513d7d..3c013c4d1605 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, too-many-function-args """SSD multibox operators""" from __future__ import absolute_import as _abs import math @@ -13,6 +13,7 @@ from topi.vision.ssd import multibox_transform_loc from ..nms import nms + def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): """Low level IR routing for multibox_prior operator. @@ -41,7 +42,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): stmt : Stmt The result IR statement. 
""" - max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + max_threads = int(math.sqrt( + tvm.target.current_target(allow_none=False).max_num_threads)) tx = tvm.thread_axis("threadIdx.x") ty = tvm.thread_axis("threadIdx.y") bx = tvm.thread_axis("blockIdx.x") @@ -76,7 +78,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): for k in range(num_sizes + num_ratios - 1): w = tvm.select(k < num_sizes, - size_ratio_concat[k] * in_height / in_width / 2.0, + size_ratio_concat[ + k] * in_height / in_width / 2.0, size_ratio_concat[0] * in_height / in_width * math.sqrt(size_ratio_concat[k + 1]) / 2.0) h = tvm.select(k < num_sizes, size_ratio_concat[k] / 2.0, @@ -93,7 +96,7 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): @multibox_prior.register(["cuda", "gpu"]) -def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \ +def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. @@ -124,31 +127,114 @@ def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \ """ num_sizes = len(sizes) num_ratios = len(ratios) - oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) + oshape = ( + 1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) out = tvm.extern(oshape, [data], lambda ins, outs: - multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets), + multibox_prior_ir( + ins[0], outs[0], sizes, ratios, steps, offsets), tag="multibox_prior") if clip: out = topi.clip(out, 0, 1) return out -def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances): - """Low level IR routing for transform location in multibox_detection operator. +def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, threshold): + """Low level IR routing for transform location data preparation. 
Parameters ---------- cls_prob : Buffer Buffer of class probabilities. + valid_count : Buffer + Buffer of number of valid output boxes. + + temp_flag : Buffer + Output intermediate result buffer + + temp_id : Buffer + Output intermediate result buffer + + temp_score_out : Buffer + Output buffer + + threshold : float + Threshold to be a positive prediction. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + batch_size = cls_prob.shape[0] + num_classes = cls_prob.shape[1] + num_anchors = cls_prob.shape[2] + + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) + ib = tvm.ir_builder.create() + score = ib.buffer_ptr(temp_score_out) + cls_id = ib.buffer_ptr(temp_id) + flag = ib.buffer_ptr(temp_flag) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + nthread_tx = max_threads + nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + p_cls_prob = ib.buffer_ptr(cls_prob) + p_valid_count = ib.buffer_ptr(valid_count) + + with ib.if_scope(tid < batch_size * num_anchors): + n = tid / num_anchors # number of batches + i = tid % num_anchors # number of anchors + score[i] = -1.0 + cls_id[i] = 0 + p_valid_count[n] = 0 + with ib.for_range(0, num_classes-1, name="k") as k: + temp = p_cls_prob[n * num_anchors * num_classes + (k + 1) * num_anchors + i] + with ib.if_scope(temp > score[i]): + cls_id[i] = k + 1 + score[i] = temp + with ib.if_scope(tvm.all(cls_id[i] > 0, score[i] < threshold)): + cls_id[i] = 0 + with ib.if_scope(cls_id[i] > 0): + flag[i] = 1 + with ib.else_scope(): + flag[i] = 0 + + with ib.if_scope(tid < batch_size): + with ib.for_range(0, num_anchors, name="k") as k: + with ib.if_scope(k > 0): + flag[tid * num_anchors + + k] += flag[tid * num_anchors + k - 1] + p_valid_count[n] = flag[tid * num_anchors + num_anchors - 1] + + body = ib.get() + return 
body + + +def transform_loc_ir(loc_pred, anchor, temp_flag, temp_id, temp_score_in, \ + out, clip, variances, batch_size, num_classes, num_anchors): + """Low level IR routing for transform location in multibox_detection operator. + + Parameters + ---------- loc_pred : Buffer Buffer of location regression predictions. anchor : Buffer Buffer of prior anchor boxes. - valid_count : Buffer - Buffer of number of valid output boxes. + temp_flag : Buffer + Intermediate result buffer. + + temp_id : Buffer + Intermediate result buffer. + + temp_score_in : Buffer + Input buffer which stores intermediate results. out : Buffer Output buffer. @@ -156,12 +242,18 @@ def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, thresho clip : boolean Whether to clip out-of-boundary boxes. - threshold : float - Threshold to be a positive prediction. - variances : tuple of float Variances to be decoded from box regression output. + batch_size : int + Batch size + + num_classes : int + Number of classes + + num_anchors : int + Number of anchors + Returns ------- stmt : Stmt @@ -187,21 +279,16 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, ow = tvm.exp(pw * vw) * aw / 2.0 oh = tvm.exp(ph * vh) * ah / 2.0 return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \ - tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \ - tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \ - tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh) - - batch_size = cls_prob.shape[0] - num_classes = cls_prob.shape[1] - num_anchors = cls_prob.shape[2] + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh) + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) ib = tvm.ir_builder.create() - temp_score = 
ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, \ - ), name="temp_score", scope="global") - score = ib.allocate('float32', (batch_size * num_anchors, ), name="score", scope="local") - cls_id = ib.allocate('int32', (batch_size * num_anchors, ), name="id", scope="local") - flag = ib.allocate('int32', (batch_size * num_anchors, ), name="flag", scope="global") - max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + score = ib.buffer_ptr(temp_score_in) + cls_id = ib.buffer_ptr(temp_id) + flag = ib.buffer_ptr(temp_flag) tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") nthread_tx = max_threads @@ -209,42 +296,13 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - p_cls_prob = ib.buffer_ptr(cls_prob) p_loc_pred = ib.buffer_ptr(loc_pred) p_anchor = ib.buffer_ptr(anchor) - p_valid_count = ib.buffer_ptr(valid_count) p_out = ib.buffer_ptr(out) - with ib.if_scope(tid < batch_size * num_anchors * num_classes): - n = tid / (num_anchors * num_classes) - j = (tid % (num_anchors * num_classes)) / num_anchors - i = tid % num_anchors - with ib.if_scope(j > 0): - temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = \ - p_cls_prob[tid] - p_valid_count[n] = 0 - with ib.if_scope(tid < batch_size * num_anchors): - n = tid / num_anchors - i = tid % num_anchors - score[tid] = -1.0 - cls_id[tid] = 0 - with ib.for_range(0, num_classes-1, name="k") as k: - temp = temp_score[tid * (num_classes-1) + k] - cls_id[tid] = tvm.select(temp > score[tid], k + 1, cls_id[tid]) - score[tid] = tvm.make.Max(temp, score[tid]) - with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)): - cls_id[tid] = 0 - with ib.if_scope(cls_id[tid] > 0): - flag[tid] = 1 - with ib.else_scope(): - flag[tid] = 0 - with ib.if_scope(tid < batch_size): - with ib.for_range(0, 
num_anchors, name="k") as k: - with ib.if_scope(k > 0): - flag[tid * num_anchors + k] += flag[tid * num_anchors + k - 1] - p_valid_count[tid] = flag[tid * num_anchors + num_anchors - 1] + with ib.if_scope(tid < batch_size * num_anchors): - n = tid / num_anchors - i = tid % num_anchors + n = tid / num_anchors # number of batches + i = tid % num_anchors # number of anchors with ib.if_scope(cls_id[tid] > 0): with ib.if_scope(tid == 0): out_base_idx = n * num_anchors * 6 @@ -253,17 +311,17 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, p_out[out_base_idx] = cls_id[tid] - 1.0 p_out[out_base_idx + 1] = score[tid] p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ - p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, p_anchor, i*4, - clip, variances[0], variances[1], - variances[2], variances[3]) + p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, + p_anchor, i*4, clip, variances[0], + variances[1], variances[2], variances[3]) body = ib.get() return body @multibox_transform_loc.register(["cuda", "gpu"]) -def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, - variances=(0.1, 0.1, 0.2, 0.2)): +def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \ + threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)): """Location transformation for multibox detection Parameters @@ -297,20 +355,42 @@ def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold= 1-D tensor with shape (batch_size,), number of valid anchor boxes. 
""" batch_size = cls_prob.shape[0] - num_anchors = anchor.shape[1] + num_classes = cls_prob.shape[1] + num_anchors = cls_prob.shape[2] oshape = (batch_size, num_anchors, 6) # Define data alignment for intermediate buffer valid_count_dtype = "int32" valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype, "valid_count_buf", data_alignment=4) - out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8) - valid_count, out = \ - tvm.extern([(batch_size,), oshape], - [cls_prob, loc_pred, anchor], + out_buf = api.decl_buffer( + oshape, cls_prob.dtype, "out_buf", data_alignment=8) + size = num_anchors + temp_flag_buf = api.decl_buffer( + (size,), valid_count_dtype, "flag", data_alignment=8) + temp_id_buf = api.decl_buffer( + (size,), valid_count_dtype, "cls_id", data_alignment=8) + temp_score_buf = api.decl_buffer( + (size,), cls_prob.dtype, "score", data_alignment=8) + + valid_count, temp_flag, temp_id, temp_score = \ + tvm.extern([(batch_size,), (size,), (size,), (size,)], + [cls_prob], + lambda ins, outs: transform_loc_pre( + ins[0], outs[0], outs[1], outs[2], outs[3], threshold), + dtype=[valid_count_dtype, + valid_count_dtype, valid_count_dtype, cls_prob.dtype], + out_buffers=[valid_count_buf, + temp_flag_buf, temp_id_buf, temp_score_buf], + tag="multibox_transform_loc_first_step") + + out = \ + tvm.extern([oshape], + [loc_pred, anchor, temp_flag, temp_id, temp_score], lambda ins, outs: transform_loc_ir( - ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances), - dtype=[valid_count_dtype, cls_prob.dtype], - out_buffers=[valid_count_buf, out_buf], + ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, \ + variances, batch_size, num_classes, num_anchors), + dtype=[cls_prob.dtype], + out_buffers=[out_buf], tag="multibox_transform_loc") return [out, valid_count] @@ -356,5 +436,6 @@ def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 """ inter_out = multibox_transform_loc(cls_prob, loc_pred, 
anchor, clip, threshold, variances) - out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) + out = nms( + inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) return out From 3900392ccaa881c88398acd64844d36c794cd0bd Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Tue, 14 Aug 2018 01:43:09 +0530 Subject: [PATCH 011/529] Split_indices negative axis added (#1595) --- topi/include/topi/transform.h | 5 +++++ topi/tests/python_cpp/test_topi_transform.py | 1 + 2 files changed, 6 insertions(+) diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 09af612b957b..245b38cfb63d 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -475,6 +475,11 @@ inline Array split_sections(const Tensor& x, int axis, std::string name = "tensor", std::string tag = kInjective) { + if (axis < 0) { + axis += static_cast(x->shape.size()); + } + CHECK_LT(axis, x->shape.size()) << "axis out of bounds"; + auto src_axis_size = static_cast(GetConstInt(x->shape[axis])); CHECK_GT(num_sections, 0) << "Slice count must be > 0"; diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py index c8b7c3906caa..3f7bdbfdd499 100644 --- a/topi/tests/python_cpp/test_topi_transform.py +++ b/topi/tests/python_cpp/test_topi_transform.py @@ -340,6 +340,7 @@ def test_concatenate(): def test_split(): verify_split((2, 12, 3), 3, 1) + verify_split((2, 12, 3), 3, -1) verify_split((2, 12, 3), [2, 4], 1) verify_split((10, 12, 24), [5, 7, 9], -1) From 4daa9ee8ada2b99688ccb01f7a9257f58ae483ee Mon Sep 17 00:00:00 2001 From: Albin Joy Date: Tue, 14 Aug 2018 02:44:26 +0530 Subject: [PATCH 012/529] [FRONTEND][TENSORFLOW] Optimized tensorflow testcases (#1546) * [NNVM][TENSORFLOW] Optimized tensorflow testcases * Replace Constants with Placeholder * Review comment fix --- .../frontend/tensorflow/test_forward.py | 457 ++++++------------ 1 file changed, 136 insertions(+), 321 
deletions(-) diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index 495852f9e5d6..64c57c126f8d 100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -83,6 +83,34 @@ def run_tf_graph(sess, input_data, input_node, output_node): output_data = sess.run(tensor, input_dict) return output_data + +def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False): + """Generic function to generate and compare tensorflow and TVM output""" + + out_node = out_name.split(':')[0] if ":" in out_name else out_name + + if isinstance(in_name, list): + in_node = [0]*len(in_name) + for i in range(len(in_name)): + in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i] + else: + in_node = in_name.split(':')[0] if ":" in in_name else in_name + + with tf.Session() as sess: + if init_global_variables: + sess.run(variables.global_variables_initializer()) + final_graph_def = tf.graph_util.convert_variables_to_constants( + sess, + sess.graph.as_graph_def(add_shapes=True), + [out_node], + ) + + tf_output = run_tf_graph(sess, in_data, in_name, out_name) + tvm_output = run_tvm_graph(final_graph_def, in_data, + in_node, tf_output.shape, tf_output.dtype) + np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) + sess.close() + ####################################################################### # Pooling # ------- @@ -93,31 +121,15 @@ def _test_pooling(input_shape, **kwargs): np.prod(input_shape), dtype=np.float32).reshape(input_shape) - 1 with tf.Graph().as_default(): - in_data = constant_op.constant(x, shape=input_shape, dtype='float32') - # pylint: disable=unused-variable - pool = nn_ops.pool(in_data, **kwargs) - # pylint: enable=unused-variable + in_data = array_ops.placeholder(shape=input_shape, dtype='float32') + nn_ops.pool(in_data, **kwargs) if kwargs['pooling_type'] == 'MAX': - out_node 
= 'max_pool' out_name = 'max_pool:0' else: - out_node = 'avg_pool' out_name = 'avg_pool:0' - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - [out_node], - ) - - tf_output = run_tf_graph(sess, x, 'Const:0', out_name) - tvm_output = run_tvm_graph(graph_def, x.astype('float32'), - "Const", tf_output.shape, 'float32') - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) - - sess.close() + compare_tf_with_tvm(x, 'Placeholder:0', out_name) def test_forward_pooling(): """ Pooling """ @@ -195,35 +207,19 @@ def _test_convolution(tensor_in_sizes, filter_in_sizes, filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)] with tf.Graph().as_default(): - in_data = constant_op.constant(data_array, shape=tensor_in_sizes, dtype='float32') + in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype='float32') in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype='float32') strides = [1] + strides + [1] dilations = [1] + dilations + [1] - # pylint: disable=unused-variable - conv = nn_ops.conv2d(in_data, - in_filter, - strides=strides, - padding=padding, - data_format=data_format) - # pylint: enable=unused-variable + nn_ops.conv2d(in_data, + in_filter, + strides=strides, + padding=padding, + data_format=data_format) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['Conv2D'], - ) - - tf_output = run_tf_graph(sess, np.reshape(data_array, tensor_in_sizes), - 'Const:0', 'Conv2D:0') - tvm_output = run_tvm_graph(graph_def, - np.reshape(data_array, tensor_in_sizes).astype('float32'), - "Const", tf_output.shape, 'float32') - - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) - - sess.close() + compare_tf_with_tvm(np.reshape(data_array, tensor_in_sizes).astype('float32'), + 'Placeholder:0', 'Conv2D:0') def test_forward_convolution(): 
_test_convolution([4, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1], 'SAME', 'NHWC') @@ -239,28 +235,10 @@ def _test_reshape(data, out_shape): """ One iteration of reshape operation with given data and out shape """ with tf.Graph().as_default(): - in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype) - - # pylint: disable=unused-variable - reshape_out = array_ops.reshape(in_data, out_shape) - # pylint: enable=unused-variable + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) + array_ops.reshape(in_data, out_shape) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['Reshape'], - ) - - tf_output = run_tf_graph(sess, data, - 'Const:0', 'Reshape:0') - tvm_output = run_tvm_graph(graph_def, - data, - "Const", tf_output.shape, data.dtype) - - np.testing.assert_allclose(tf_output, tvm_output) - - sess.close() + compare_tf_with_tvm(data, 'Placeholder:0', 'Reshape:0') def test_forward_reshape(): _test_reshape(np.arange(6.0), [2, 3]) @@ -279,31 +257,14 @@ def _test_squeeze(data, squeeze_dims=None): squeeze_dims = [] with tf.Graph().as_default(): - in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype) + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) - # pylint: disable=unused-variable if squeeze_dims: - squeeze_out = array_ops.squeeze(in_data, squeeze_dims) + array_ops.squeeze(in_data, squeeze_dims) else: - squeeze_out = array_ops.squeeze(in_data) - # pylint: enable=unused-variable - - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['Squeeze'], - ) - - tf_output = run_tf_graph(sess, data, - 'Const:0', 'Squeeze:0') - tvm_output = run_tvm_graph(graph_def, - data, - "Const", tf_output.shape, data.dtype) + array_ops.squeeze(in_data) - np.testing.assert_allclose(tf_output, tvm_output) - - sess.close() + 
compare_tf_with_tvm(data, 'Placeholder:0', 'Squeeze:0') def test_forward_squeeze(): """ Squeeze """ @@ -336,28 +297,10 @@ def _test_concat_v2(data, dim): """ One iteration of ConcatV2 """ with tf.Graph().as_default(): + gen_array_ops._concat_v2(data, dim) - # pylint: disable=unused-variable - concat_out = gen_array_ops._concat_v2(data, dim) - # pylint: enable=unused-variable - - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['ConcatV2'], - ) - - tf_output = run_tf_graph(sess, data, - ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'], 'ConcatV2:0') - tvm_output = run_tvm_graph(graph_def, - data, - ["ConcatV2/values_0", 'ConcatV2/values_1'], - tf_output.shape, tf_output.dtype) - - np.testing.assert_allclose(tf_output, tvm_output) - - sess.close() + compare_tf_with_tvm(data, ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'], + 'ConcatV2:0') def _test_forward_concat_v2(): t1 = np.array([]) @@ -377,28 +320,10 @@ def _test_sigmoid(data): """ One iteration of sigmoid """ with tf.Graph().as_default(): - in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype) - - # pylint: disable=unused-variable + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) sigmoid_out = math_ops.sigmoid(in_data) - # pylint: enable=unused-variable - - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['Sigmoid'], - ) - - tf_output = run_tf_graph(sess, data, - 'Const:0', 'Sigmoid:0') - tvm_output = run_tvm_graph(graph_def, - data, - "Const", tf_output.shape, data.dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) - - sess.close() + compare_tf_with_tvm(data, 'Placeholder:0', 'Sigmoid:0') def test_forward_sigmoid(): """ Sigmoid """ @@ -412,24 +337,10 @@ def test_forward_sigmoid(): def _test_argx(func, data, **kwargs): with tf.Graph().as_default(): - inp = 
constant_op.constant(data, shape=data.shape, dtype=data.dtype, name="c0") - - # pylint: disable=unused-variable - out = func(inp, name="argx0", **kwargs) - # pylint: enable=unused-variable - - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess=sess, - input_graph_def=sess.graph.as_graph_def(add_shapes=True), - output_node_names=["argx0"]) - - tf_output = run_tf_graph(sess, data, input_node="c0:0", output_node="argx0:0") - tvm_output = run_tvm_graph(graph_def, data, "c0", tf_output.shape, output_dtype='int32') - - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) + inp = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="c0") + func(inp, name="argx0", **kwargs, output_type=tf.int32) - sess.close() + compare_tf_with_tvm(data, 'c0:0', 'argx0:0') def test_argmin_argmax(): for axis in [None,0,1,2]: @@ -442,6 +353,8 @@ def test_argmin_argmax(): # -------- def _test_variable(data): + """ One iteration of a variable """ + tf.reset_default_graph() input_op = array_ops.placeholder(shape=data.shape, dtype=data.dtype) input_tensor = array_ops.reshape(input_op, data.shape) @@ -450,84 +363,15 @@ def _test_variable(data): with variable_scope.variable_scope("linear", reuse=None): w = variable_scope.get_variable( "w", shape=[size, size], dtype=input_tensor.dtype) - # pylint: disable=unused-variable - output_op = math_ops.matmul(input_tensor, w) - # pylint: enable=unused-variable - - with tf.Session() as sess: - sess.run(variables.global_variables_initializer()) - final_graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['MatMul'], - ) - - tf_output = run_tf_graph(sess, data, 'Placeholder:0', 'MatMul:0') - tvm_output = run_tvm_graph(final_graph_def, data, - "Placeholder", tf_output.shape, data.dtype) + math_ops.matmul(input_tensor, w) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) - sess.close() + 
compare_tf_with_tvm(data, 'Placeholder:0', 'MatMul:0', init_global_variables=True) def test_forward_variable(): """Variable type op test""" _test_variable(np.random.uniform(size=(32, 100)).astype('float32')) -####################################################################### -# LSTM -# ---- -def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype): - tf.reset_default_graph() - input_size = num_hidden - input_data = np.full((batch_size, input_size), 1., dtype=dtype) - in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype) - in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype) - - def _get_tensorflow_output(): - with tf.Session() as sess: - with variable_scope.variable_scope( - "root", initializer=init_ops.constant_initializer(0.5)): - m0 = array_ops.zeros([batch_size, num_hidden]) - m1 = array_ops.zeros([batch_size, num_hidden]) - x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype) - g, ((out_m0, out_m1)) = \ - tf.contrib.rnn.LSTMBlockCell(num_hidden, - forget_bias=forget_bias)(x, ((m0, m1))) - sess.run([variables.global_variables_initializer()]) - res = sess.run([g, out_m0, out_m1], { - x.name: np.array([[1., 1.]]), - m0.name: 0.1 * np.ones([batch_size, num_hidden]), - m1.name: 0.1 * np.ones([batch_size, num_hidden]), - }) - graph_def = sess.graph.as_graph_def(add_shapes=True) - final_graph_def = graph_util.convert_variables_to_constants( - sess, - graph_def, - ['root/lstm_cell/LSTMBlockCell']) - return final_graph_def, res - - graph_def, tf_out = _get_tensorflow_output() - tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h], - ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c', - 'root/lstm_cell/LSTMBlockCell_h'], - [tf_out[0].shape, (2, batch_size, num_hidden)], - [tf_out[0].dtype, tf_out[1].dtype]) - - if isinstance(tvm_output, list): - out = tvm_output[0] - out_state = tvm_output[1] - out_state_tup = np.split(out_state, indices_or_sections=2, axis=0) - 
out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden)) - out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden)) - tvm_out = [out, out_state_c, out_state_h] - np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3) - -def test_forward_lstm(): - '''test LSTM block cell''' - _test_lstm_cell(1, 2, 1, 0.0, 'float32') - - ####################################################################### # StridedSlice # ------------ @@ -535,6 +379,8 @@ def test_forward_lstm(): def _test_stridedslice(ip_shape, begin, end, stride, dtype, begin_mask=0, end_mask=0, new_axis_mask=0, shrink_axis_mask=0, ellipsis_mask=0): + """ One iteration of a Stridedslice """ + tf.reset_default_graph() in_data = tf.placeholder(dtype, ip_shape, name="in_data") tf.strided_slice(in_data, begin, end, stride, begin_mask=begin_mask, @@ -543,17 +389,7 @@ def _test_stridedslice(ip_shape, begin, end, stride, dtype, ellipsis_mask=ellipsis_mask, name="strided_slice") np_data = np.random.uniform(size=ip_shape).astype(dtype) - with tf.Session() as sess: - final_graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['strided_slice']) - tf_output = run_tf_graph(sess, np_data, - 'in_data:0', 'strided_slice:0') - tvm_output = run_tvm_graph(final_graph_def, np_data, - "in_data", tf_output.shape, np_data.dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) - sess.close() + compare_tf_with_tvm(np_data, 'in_data:0', 'strided_slice:0') def test_forward_stridedslice(): '''test StridedSlice''' @@ -586,6 +422,8 @@ def test_forward_stridedslice(): # ------ def _test_gather(ip_shape, indice_shape, indice_value, axis, dtype): + """ One iteration of a Gather """ + tf.reset_default_graph() in_data = tf.placeholder(dtype, ip_shape, name="in_data") indices = tf.placeholder("int32", indice_shape, name="indices") @@ -601,17 +439,7 @@ def _fill_indices(indice_value): return indices np_indices = 
_fill_indices(indice_value) - with tf.Session() as sess: - final_graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['GatherV2']) - tf_output = run_tf_graph(sess, [np_data, np_indices], ['in_data:0', - 'indices:0'], 'GatherV2:0') - tvm_output = run_tvm_graph(final_graph_def, [np_data, np_indices], - ['in_data', 'indices'], tf_output.shape, dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) - sess.close() + compare_tf_with_tvm([np_data, np_indices], ['in_data:0', 'indices:0'], 'GatherV2:0') def test_forward_gather(): '''test gather layer''' @@ -640,28 +468,11 @@ def test_forward_multi_input(): out1 = tf.add(in1, in2, name='out1') out2 = tf.subtract(in3, in4, name='out2') - out = tf.multiply(out1, out2, name='out') + in_data = np.arange(9, dtype='int32').reshape([3, 3]) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['out'], - ) - - in_data = np.arange(9, dtype='int32').reshape([3, 3]) - - tf_output = run_tf_graph(sess, [in_data, in_data, in_data, in_data ], - ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0') - tvm_output = run_tvm_graph(graph_def, - [in_data, in_data, in_data, in_data ], - ['in1', 'in2', 'in3', 'in4'], - tf_output.shape, tf_output.dtype) - - np.testing.assert_allclose(tf_output, tvm_output) - - sess.close() + compare_tf_with_tvm([in_data, in_data, in_data, in_data], + ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0') ####################################################################### # Resize Bilinear @@ -674,36 +485,75 @@ def _test_resize_bilinear(in_shape, to_shape, align_corners): shape_data = np.array(to_shape).astype('int32') with tf.Graph().as_default(): - in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype) + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) shape_data = constant_op.constant(shape_data, 
shape=shape_data.shape, dtype=shape_data.dtype) + tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners) - # pylint: disable=unused-variable - resize_out = tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners) - # pylint: enable=unused-variable + compare_tf_with_tvm(data, 'Placeholder:0', 'ResizeBilinear:0') - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['ResizeBilinear'], - ) +def test_forward_resize_bilinear(): + """ Resize Bilinear """ - tf_output = run_tf_graph(sess, data, - 'Const:0', 'ResizeBilinear:0') + _test_resize_bilinear((4, 16, 32, 32), [50, 50], False) + _test_resize_bilinear((6, 32, 64, 64), [20, 20], True) - tvm_output = run_tvm_graph(graph_def, - data, - "Const", tf_output.shape, data.dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) +####################################################################### +# LSTM +# ---- - sess.close() +def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype): + """ One iteration of a LSTM cell """ -def test_forward_resize_bilinear(): - """ Resize Bilinear """ + tf.reset_default_graph() + input_size = num_hidden + input_data = np.full((batch_size, input_size), 1., dtype=dtype) + in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype) + in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype) - _test_resize_bilinear((4, 16, 32, 32), [50, 50], False) - _test_resize_bilinear((6, 32, 64, 64), [20, 20], True) + def _get_tensorflow_output(): + with tf.Session() as sess: + with variable_scope.variable_scope( + "root", initializer=init_ops.constant_initializer(0.5)): + m0 = array_ops.zeros([batch_size, num_hidden]) + m1 = array_ops.zeros([batch_size, num_hidden]) + x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype) + g, ((out_m0, out_m1)) = \ + tf.contrib.rnn.LSTMBlockCell(num_hidden, + 
forget_bias=forget_bias)(x, ((m0, m1))) + sess.run([variables.global_variables_initializer()]) + res = sess.run([g, out_m0, out_m1], { + x.name: np.array([[1., 1.]]), + m0.name: 0.1 * np.ones([batch_size, num_hidden]), + m1.name: 0.1 * np.ones([batch_size, num_hidden]), + }) + graph_def = sess.graph.as_graph_def(add_shapes=True) + final_graph_def = graph_util.convert_variables_to_constants( + sess, + graph_def, + ['root/lstm_cell/LSTMBlockCell']) + return final_graph_def, res + + graph_def, tf_out = _get_tensorflow_output() + tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h], + ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c', + 'root/lstm_cell/LSTMBlockCell_h'], + [tf_out[0].shape, (2, batch_size, num_hidden)], + [tf_out[0].dtype, tf_out[1].dtype]) + assert isinstance(tvm_output, list) + + out = tvm_output[0] + out_state = tvm_output[1] + out_state_tup = np.split(out_state, indices_or_sections=2, axis=0) + out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden)) + out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden)) + tvm_out = [out, out_state_c, out_state_h] + np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3) + +def test_forward_lstm(): + '''test LSTM block cell''' + + _test_lstm_cell(1, 2, 1, 0.0, 'float32') ####################################################################### # Pad @@ -714,30 +564,17 @@ def _test_pad(input_shape, paddings, mode, **kwargs): x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) with tf.Graph().as_default(): - in_data = constant_op.constant(x, shape=input_shape, dtype='float32') + in_data = array_ops.placeholder(shape=input_shape, dtype='float32') pad_values = constant_op.constant(paddings) pad = tf.pad(in_data, paddings=pad_values, mode=mode, **kwargs) if mode == 'CONSTANT': if 'constant_values' in kwargs: - out_node = 'PadV2' out_name = 'PadV2:0' else: - out_node = 'Pad' out_name = 'Pad:0' - with tf.Session() as sess: - graph_def = 
tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - [out_node], - ) - - tf_output = run_tf_graph(sess, x, 'Const:0', out_name) - tvm_output = run_tvm_graph(graph_def, x.astype('float32'), - "Const", tf_output.shape, 'float32') - np.testing.assert_allclose(tf_output, tvm_output) - sess.close() + compare_tf_with_tvm(x, 'Placeholder:0', out_name) def test_forward_pad(): """ Pad """ @@ -944,17 +781,7 @@ def _test_lrn(ishape, size, axis, bias, alpha, beta): alpha=alpha, beta=beta) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['lrn'],) - tf_output = run_tf_graph(sess, inp_array, 'lrn0_data:0', 'lrn:0') - tvm_output = run_tvm_graph(graph_def, - inp_array, - "lrn0_data", tf_output.shape, tf_output.dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) - sess.close() + compare_tf_with_tvm(inp_array, 'lrn0_data:0', 'lrn:0') def test_forward_lrn(): _test_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5) @@ -962,38 +789,26 @@ def test_forward_lrn(): ####################################################################### # l2_normalize # ------------ + def _test_l2_normalize(ishape, eps, axis): """ testing l2 normalize (uses max, sum, square, sqrt frontend operators)""" inp_array = np.random.uniform(size=ishape).astype(np.float32) - inp_array.fill(1) with tf.Graph().as_default(): - in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype, name="Placeholder") + in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype) nn.l2_normalize(in1, axis=axis, epsilon=eps, name=None, dim=None) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['l2_normalize'], - ) - tf_output = run_tf_graph(sess, inp_array, 'Placeholder:0', 'Placeholder:0') - tvm_output = run_tvm_graph(graph_def, - inp_array, - "Placeholder", - 
tf_output.shape, - tf_output.dtype) - - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) - sess.close() + compare_tf_with_tvm(inp_array, 'Placeholder:0', 'l2_normalize:0') + def test_forward_l2_normalize(): _test_l2_normalize((1, 3, 20, 20), 0.001, (0,)) + ####################################################################### # Main # ---- @@ -1011,7 +826,7 @@ def test_forward_l2_normalize(): test_forward_mobilenet() test_forward_variable() test_forward_resize_bilinear() - test_forward_pad() + test_forward_pad() test_forward_lstm() test_forward_stridedslice() test_forward_gather() From f2814fc14eeaa12886810c90b29c9c834cb5f102 Mon Sep 17 00:00:00 2001 From: Siva Date: Tue, 14 Aug 2018 21:28:34 +0530 Subject: [PATCH 013/529] [NNVM][DOC] Update NNVM symbol documentation to latest. Ref. 1591 (#1599) --- docs/nnvm_top.rst | 79 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst index 96a37b779e1e..927990647a69 100644 --- a/docs/nnvm_top.rst +++ b/docs/nnvm_top.rst @@ -29,6 +29,7 @@ This level enables fully connected multi-layer perceptron. nnvm.symbol.dense nnvm.symbol.relu + nnvm.symbol.prelu nnvm.symbol.tanh nnvm.symbol.sigmoid nnvm.symbol.exp @@ -39,6 +40,8 @@ This level enables fully connected multi-layer perceptron. nnvm.symbol.elemwise_mul nnvm.symbol.elemwise_div nnvm.symbol.elemwise_sum + nnvm.symbol.elemwise_mod + nnvm.symbol.elemwise_pow nnvm.symbol.flatten nnvm.symbol.concatenate nnvm.symbol.expand_dims @@ -50,6 +53,14 @@ This level enables fully connected multi-layer perceptron. nnvm.symbol.log_softmax nnvm.symbol.pad nnvm.symbol.block_grad + nnvm.symbol.matmul + nnvm.symbol.resize + nnvm.symbol.upsampling + nnvm.symbol.take + nnvm.symbol.l2_normalize + nnvm.symbol.flip + nnvm.symbol.lrn + nnvm.symbol.where **Level 2: Convolutions** @@ -92,6 +103,7 @@ This level enables typical convnet models. 
nnvm.symbol.__lshift_scalar__ nnvm.symbol.__rshift_scalar__ + **Level 4: Broadcast and Reductions** .. autosummary:: @@ -117,11 +129,43 @@ This level enables typical convnet models. nnvm.symbol.ones_like nnvm.symbol.zeros nnvm.symbol.zeros_like + nnvm.symbol.slice_like + nnvm.symbol.strided_slice + nnvm.symbol.argmax + nnvm.symbol.argmin + nnvm.symbol.collapse_sum + nnvm.symbol.broadcast_equal + nnvm.symbol.broadcast_greater_equal + nnvm.symbol.broadcast_greater_equal + nnvm.symbol.broadcast_greater + nnvm.symbol.broadcast_left_shift + nnvm.symbol.broadcast_less_equal + nnvm.symbol.broadcast_less_equal + nnvm.symbol.broadcast_less + nnvm.symbol.broadcast_max + nnvm.symbol.broadcast_min + nnvm.symbol.broadcast_mod + nnvm.symbol.broadcast_not_equal + nnvm.symbol.broadcast_pow + nnvm.symbol.broadcast_right_shift + + +**Level 5: Vision Operators** + +.. autosummary:: + :nosignatures: + + nnvm.symbol.multibox_prior + nnvm.symbol.multibox_transform_loc + nnvm.symbol.nms + nnvm.symbol.yolo_region + nnvm.symbol.yolo_reorg Detailed Definitions -------------------- .. autofunction:: nnvm.symbol.dense .. autofunction:: nnvm.symbol.relu +.. autofunction:: nnvm.symbol.prelu .. autofunction:: nnvm.symbol.tanh .. autofunction:: nnvm.symbol.sigmoid .. autofunction:: nnvm.symbol.exp @@ -132,6 +176,8 @@ Detailed Definitions .. autofunction:: nnvm.symbol.elemwise_mul .. autofunction:: nnvm.symbol.elemwise_div .. autofunction:: nnvm.symbol.elemwise_sum +.. autofunction:: nnvm.symbol.elemwise_mod +.. autofunction:: nnvm.symbol.elemwise_pow .. autofunction:: nnvm.symbol.flatten .. autofunction:: nnvm.symbol.concatenate .. autofunction:: nnvm.symbol.expand_dims @@ -143,6 +189,14 @@ Detailed Definitions .. autofunction:: nnvm.symbol.log_softmax .. autofunction:: nnvm.symbol.pad .. autofunction:: nnvm.symbol.block_grad +.. autofunction:: nnvm.symbol.matmul +.. autofunction:: nnvm.symbol.resize +.. autofunction:: nnvm.symbol.upsampling +.. autofunction:: nnvm.symbol.take +.. 
autofunction:: nnvm.symbol.l2_normalize +.. autofunction:: nnvm.symbol.flip +.. autofunction:: nnvm.symbol.lrn +.. autofunction:: nnvm.symbol.where .. autofunction:: nnvm.symbol.conv2d .. autofunction:: nnvm.symbol.conv2d_transpose @@ -191,3 +245,28 @@ Detailed Definitions .. autofunction:: nnvm.symbol.ones_like .. autofunction:: nnvm.symbol.zeros .. autofunction:: nnvm.symbol.zeros_like +.. autofunction:: nnvm.symbol.slice_like +.. autofunction:: nnvm.symbol.strided_slice +.. autofunction:: nnvm.symbol.argmax +.. autofunction:: nnvm.symbol.argmin +.. autofunction:: nnvm.symbol.collapse_sum +.. autofunction:: nnvm.symbol.broadcast_equal +.. autofunction:: nnvm.symbol.broadcast_greater_equal +.. autofunction:: nnvm.symbol.broadcast_greater_equal +.. autofunction:: nnvm.symbol.broadcast_greater +.. autofunction:: nnvm.symbol.broadcast_left_shift +.. autofunction:: nnvm.symbol.broadcast_less_equal +.. autofunction:: nnvm.symbol.broadcast_less_equal +.. autofunction:: nnvm.symbol.broadcast_less +.. autofunction:: nnvm.symbol.broadcast_max +.. autofunction:: nnvm.symbol.broadcast_min +.. autofunction:: nnvm.symbol.broadcast_mod +.. autofunction:: nnvm.symbol.broadcast_not_equal +.. autofunction:: nnvm.symbol.broadcast_pow +.. autofunction:: nnvm.symbol.broadcast_right_shift + +.. autofunction:: nnvm.symbol.multibox_prior +.. autofunction:: nnvm.symbol.multibox_transform_loc +.. autofunction:: nnvm.symbol.nms +.. autofunction:: nnvm.symbol.yolo_region +.. autofunction:: nnvm.symbol.yolo_reorg From e9f942ab3aa2b386304ba05f5e72affd8e5b2260 Mon Sep 17 00:00:00 2001 From: Siva Date: Tue, 14 Aug 2018 22:14:33 +0530 Subject: [PATCH 014/529] [NNVM][POOL] bug fix. Remove the hardcode. 
(#1600) --- nnvm/src/top/nn/pooling.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nnvm/src/top/nn/pooling.cc b/nnvm/src/top/nn/pooling.cc index cccd5b1c710b..8b9b7a64aa0d 100644 --- a/nnvm/src/top/nn/pooling.cc +++ b/nnvm/src/top/nn/pooling.cc @@ -77,7 +77,7 @@ inline bool Pool2DInferShape(const nnvm::NodeAttrs& attrs, } else { oshape[hidx] = ((dshape[hidx] + pad_h - param.pool_size[0] + param.strides[0] - 1) / param.strides[0]) + 1; - oshape[widx] = ((dshape[3] + pad_w - param.pool_size[1] + + oshape[widx] = ((dshape[widx] + pad_w - param.pool_size[1] + param.strides[1] - 1) / param.strides[1]) + 1; } NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape); From 9c8d9d2c8f367c8a2249b5e783213ab41db2cfd8 Mon Sep 17 00:00:00 2001 From: Siju Date: Tue, 14 Aug 2018 23:18:02 +0530 Subject: [PATCH 015/529] [FRONTEND][DARKNET]LSTM and GRU support (#1576) --- nnvm/python/nnvm/frontend/darknet.py | 140 ++++++++++++++++-- nnvm/python/nnvm/testing/darknet.py | 3 + .../python/frontend/darknet/test_forward.py | 44 +++++- 3 files changed, 175 insertions(+), 12 deletions(-) diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py index 3a197a416219..3aa36b7e7ef9 100644 --- a/nnvm/python/nnvm/frontend/darknet.py +++ b/nnvm/python/nnvm/frontend/darknet.py @@ -412,7 +412,12 @@ def __init__(self, net, dtype='float32'): self._sym_array = {} self._tvmparams = {} self._outs = [] - self._rnn_state_ctr = 0 + self._state_ctr = {} + self._state_ctr['rnn'] = 0 + self._state_ctr['crnn'] = 0 + self._state_ctr['lstm'] = 0 + self._state_ctr['cell_state'] = 0 + self._state_ctr['gru'] = 0 def _read_memory_buffer(self, shape, data): length = 1 @@ -623,16 +628,16 @@ def _get_opname(self, layer): """Returs the layer name.""" return layer.type - def _new_rnn_state_sym(self, state=None): + def _new_rnn_state_sym(self, state=None, name='rnn'): """Returs a symbol for state""" - name = "rnn%d_state" % (self._rnn_state_ctr) - self._rnn_state_ctr += 1 - 
return _sym.Variable(name=name, init=state) + sym_name = name + "%d_state" % self._state_ctr[name] + self._state_ctr[name] += 1 + return _sym.Variable(name=sym_name, init=state) - def _get_rnn_state_buffer(self, layer): + def _get_rnn_state_buffer(self, layer, name): """Get the state buffer for rnn.""" buffer = np.zeros((1, layer.outputs), self.dtype) - return self._new_rnn_state_sym(buffer) + return self._new_rnn_state_sym(buffer, name) def _get_darknet_rnn_attrs(self, layer, sym): """Get the rnn converted symbol from attributes.""" @@ -653,7 +658,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): attr.update({'batch' : layer.batch}) attr.update({'num_hidden' : str(layer.outputs)}) - state = self._get_rnn_state_buffer(layer) + state = self._get_rnn_state_buffer(layer, 'rnn') for _ in range(layer.steps): input_layer = layer.input_layer @@ -678,7 +683,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): attr.update({'batch' : layer.batch}) attr.update({'num_hidden' : str(layer.outputs)}) - state = self._get_rnn_state_buffer(layer) + state = self._get_rnn_state_buffer(layer, 'crnn') for _ in range(layer.steps): input_layer = layer.input_layer @@ -698,6 +703,123 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): self._sym_array[layer_num] = sym processed = True + elif LAYERTYPE.LSTM == layer.type: + if layer.steps > 1: + raise NotImplementedError("Currently support only single step GRU") + + op_name_add = 'elemwise_add' + op_name_mul = 'elemwise_mul' + attrs = {} + act_attr = {} + + h_state = self._get_rnn_state_buffer(layer, 'lstm') + c_state = self._get_rnn_state_buffer(layer, 'cell_state') + for _ in range(layer.steps): + sym_wf = self._get_darknet_rnn_attrs(layer.wf, h_state) + sym_wi = self._get_darknet_rnn_attrs(layer.wi, h_state) + sym_wg = self._get_darknet_rnn_attrs(layer.wg, h_state) + sym_wo = self._get_darknet_rnn_attrs(layer.wo, h_state) + + input_sym = sym + sym_uf = self._get_darknet_rnn_attrs(layer.uf, input_sym) + sym_ui = 
self._get_darknet_rnn_attrs(layer.ui, input_sym) + sym_ug = self._get_darknet_rnn_attrs(layer.ug, input_sym) + sym_uo = self._get_darknet_rnn_attrs(layer.uo, input_sym) + + new_inputs = _as_list([sym_wf, sym_uf]) + add_f = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_wi, sym_ui]) + add_i = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_wg, sym_ug]) + add_g = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_wo, sym_uo]) + add_o = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_f, _ = _darknet_activations(_as_list(add_f), act_attr) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_i, _ = _darknet_activations(_as_list(add_i), act_attr) + + act_attr['activation'] = ACTIVATION.TANH + act_g, _ = _darknet_activations(_as_list(add_g), act_attr) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_o, _ = _darknet_activations(_as_list(add_o), act_attr) + + new_inputs = _as_list([act_i, act_g]) + mul_t = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + + new_inputs = _as_list([act_f, c_state]) + c_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + + new_inputs = _as_list([mul_t, c_state]) + c_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + act_attr['activation'] = ACTIVATION.TANH + h_state, _ = _darknet_activations(_as_list(c_state), act_attr) + + new_inputs = _as_list([act_o, h_state]) + h_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + self._outs = self._outs + [c_state, h_state] + sym = h_state + self._sym_array[layer_num] = sym + processed = True + + elif LAYERTYPE.GRU == layer.type: + if layer.steps > 1: + raise NotImplementedError("Currently support only single step GRU") + + op_name_add = 'elemwise_add' + op_name_mul = 'elemwise_mul' + attrs = {} + act_attr = {} + + state = self._get_rnn_state_buffer(layer, "gru") + for _ 
in range(layer.steps): + sym_wz = self._get_darknet_rnn_attrs(layer.wz, state) + sym_wr = self._get_darknet_rnn_attrs(layer.wr, state) + + input_sym = sym + sym_uz = self._get_darknet_rnn_attrs(layer.uz, input_sym) + sym_ur = self._get_darknet_rnn_attrs(layer.ur, input_sym) + sym_uh = self._get_darknet_rnn_attrs(layer.uh, input_sym) + + new_inputs = _as_list([sym_uz, sym_wz]) + add_z = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_ur, sym_wr]) + add_r = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_z, _ = _darknet_activations(_as_list(add_z), act_attr) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_r, _ = _darknet_activations(_as_list(add_r), act_attr) + + new_inputs = _as_list([act_r, state]) + forgot = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + + sym_wh = self._get_darknet_rnn_attrs(layer.wh, forgot) + + new_inputs = _as_list([sym_uh, sym_wh]) + h_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + if layer.tanh == 1: + act_attr['activation'] = ACTIVATION.TANH + else: + act_attr['activation'] = ACTIVATION.LOGISTIC + h_state, _ = _darknet_activations(_as_list(h_state), act_attr) + + sym = act_z * state + (1 - act_z) * h_state + + self._outs = self._outs + [sym] + self._sym_array[layer_num] = sym + processed = True + return processed, sym def from_darknet(self): diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py index 362fd3058954..e3d110e9605e 100644 --- a/nnvm/python/nnvm/testing/darknet.py +++ b/nnvm/python/nnvm/testing/darknet.py @@ -491,6 +491,9 @@ class ACTIVATION(object): layer make_region_layer(int batch, int w, int h, int n, int classes, int coords); layer make_softmax_layer(int batch, int inputs, int groups); layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam); +layer make_crnn_layer(int batch, int h, int w, 
int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize); +layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); +layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); void free_network(network *net); """ ) diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py index e68aed085664..5fc71a86211e 100644 --- a/nnvm/tests/python/frontend/darknet/test_forward.py +++ b/nnvm/tests/python/frontend/darknet/test_forward.py @@ -306,7 +306,7 @@ def test_forward_softmax_temperature(): LIB.free_network(net) def test_forward_rnn(): - '''test softmax layer''' + '''test RNN layer''' net = LIB.make_network(1) batch = 1 inputs = 256 @@ -325,7 +325,7 @@ def test_forward_rnn(): LIB.free_network(net) def test_forward_crnn(): - '''test softmax layer''' + '''test CRNN layer''' net = LIB.make_network(1) batch = 1 c = 3 @@ -349,6 +349,42 @@ def test_forward_crnn(): test_forward(net) LIB.free_network(net) +def test_forward_lstm(): + '''test LSTM layer''' + net = LIB.make_network(1) + batch = 1 + inputs = 256 + outputs = 256 + steps = 1 + batch_normalize = 0 + adam = 0 + layer_1 = LIB.make_lstm_layer(batch, inputs, outputs, steps, batch_normalize, adam) + net.layers[0] = layer_1 + net.inputs = inputs + net.outputs = outputs + net.w = net.h = 0 + LIB.resize_network(net, net.w, net.h) + test_rnn_forward(net) + LIB.free_network(net) + +def test_forward_gru(): + '''test GRU layer''' + net = LIB.make_network(1) + batch = 1 + inputs = 256 + outputs = 256 + steps = 1 + batch_normalize = 0 + adam = 0 + layer_1 = LIB.make_gru_layer(batch, inputs, outputs, steps, batch_normalize, adam) + net.layers[0] = layer_1 + net.inputs = inputs + net.outputs = outputs + net.w = net.h = 0 + LIB.resize_network(net, net.w, net.h) + test_rnn_forward(net) + LIB.free_network(net) + def test_forward_activation_logistic(): 
'''test logistic activation layer''' net = LIB.make_network(1) @@ -395,4 +431,6 @@ def test_forward_activation_logistic(): test_forward_elu() test_forward_rnn() test_forward_crnn() - test_forward_activation_logistic() \ No newline at end of file + test_forward_lstm() + test_forward_gru() + test_forward_activation_logistic() From d05026a24a2682b11fbe4eaf58790ad9dcbed41a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 15 Aug 2018 15:34:05 -0700 Subject: [PATCH 016/529] [AUTOTVM] Fix GATuner and improve error message (#1605) --- include/tvm/operation.h | 2 ++ python/tvm/autotvm/measure/measure_methods.py | 2 ++ python/tvm/autotvm/task/nnvm_integration.py | 15 ++++++++++++++- python/tvm/autotvm/task/task.py | 2 +- python/tvm/autotvm/task/topi_integration.py | 2 +- python/tvm/autotvm/tuner/callback.py | 8 +++++++- python/tvm/autotvm/tuner/ga_tuner.py | 14 ++++++++++---- 7 files changed, 37 insertions(+), 8 deletions(-) diff --git a/include/tvm/operation.h b/include/tvm/operation.h index d13680531af9..ed8be6e4a7c0 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -366,6 +366,8 @@ class ExternOpNode : public OperationNode { v->Visit("tag", &tag); v->Visit("attrs", &attrs); v->Visit("inputs", &inputs); + v->Visit("input_placeholders", &input_placeholders); + v->Visit("output_placeholders", &output_placeholders); v->Visit("body", &body); } EXPORT static Operation make(std::string name, diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index e192ee26ee3e..d845cc1f88fd 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -394,6 +394,8 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat, msg = str(exc) if "Stack trace returned" in msg: msg = msg[:msg.index("Stack trace returned")] + if "CUDA Source" in msg: + msg = msg[:msg.index("CUDA Source")] costs = (RuntimeError(msg),) errno = 
MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index 338b46784a75..1b50869fc378 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -4,12 +4,16 @@ """ import warnings +import logging + from ... import tensor, placeholder, target as _target from ..util import get_const_tuple from .task import create, register +from .dispatcher import ApplyHistoryBest +logger = logging.getLogger('autotvm') def serialize_args(args): """serialize arguments of a topi function to a hashable tuple. @@ -176,8 +180,17 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): # run compiler to collect all TOPI calls during compilation env.reset() + + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True + + # use a dummy target to do a fake compile for collecting topi calls dummy_target = _target.create("opencl -device=dummy") - nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype) + with ApplyHistoryBest([], allow_fallback=True): + nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype) + + logger.disabled = old_state tasks = [] for task_name, args in env.get_tasks(): diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 7a386f1f9e67..f8923fca56e3 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -368,7 +368,7 @@ def traverse(ops): pass else: raise FlopCalculationError("Only support tvm.compute currently. 
" - "Other ops like tvm.scan is not supported") + "Other ops like tvm.scan/tvm.extern is not supported") return ret try: diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 012ca4a214e9..18f45f8d6708 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -62,7 +62,7 @@ def _decorator(f): for target_key in targets: if target_key not in _REGISTED_DISPATHCER: _REGISTED_DISPATHCER[target_key] = {} - if topi_compute not in _REGISTED_DISPATHCER: + if topi_compute not in _REGISTED_DISPATHCER[target_key]: @topi_compute.register(target_key) @dispatcher def config_dispatcher(*args, **kwargs): diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index 15d5ac1c9689..6f66871f671c 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -101,11 +101,17 @@ def __init__(self): self.total = total def __del__(self): - sys.stdout.write(' Done.\n') + if logger.level < logging.DEBUG: # only print progress bar in non-debug mode + sys.stdout.write(' Done.\n') ctx = _Context() tic = time.time() + if logger.level < logging.DEBUG: # only print progress bar in non-debug mode + sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) ' + '| %.2f s' % (prefix, 0, 0, 0, total, time.time() - tic)) + sys.stdout.flush() + def _callback(tuner, inputs, results): ctx.ct += len(inputs) diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py index 916bd4ee68c6..b92737ed5317 100644 --- a/python/tvm/autotvm/tuner/ga_tuner.py +++ b/python/tvm/autotvm/tuner/ga_tuner.py @@ -47,6 +47,7 @@ def __init__(self, task, pop_size, elite_num=3, mutation_prob=0.1): # random initialization self.pop_size = min(self.pop_size, len(self.space)) + self.elite_num = min(self.pop_size, self.elite_num) for _ in range(self.pop_size): tmp_gene = point2knob(np.random.randint(len(self.space)), 
self.dims) while knob2point(tmp_gene, self.dims) in self.visited: @@ -70,9 +71,9 @@ def update(self, inputs, results): y = inp.task.flop / np.mean(res.costs) self.scores.append(y) else: - self.scores.append(0) + self.scores.append(0.0) - if len(self.scores) >= len(self.genes): + if len(self.scores) >= len(self.genes) and len(self.visited) < len(self.space): genes = self.genes + self.elites scores = np.array(self.scores[:len(self.genes)] + self.elite_scores) @@ -85,8 +86,13 @@ def update(self, inputs, results): # cross over indices = np.arange(len(genes)) - scores /= np.max(scores) - probs = scores / np.sum(scores) + max_score = np.max(scores) + if max_score < 1e-8: + probs = np.empty_like(scores) + probs[:] = 1.0 / len(scores) + else: + scores /= max_score + probs = scores / np.sum(scores) tmp_genes = [] for _ in range(self.pop_size): p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs) From a1829b39655dafc814e67e83119677d464a3d279 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 15 Aug 2018 15:42:24 -0700 Subject: [PATCH 017/529] [NNVM] Add symbol for inception v3 (#1604) --- nnvm/python/nnvm/testing/__init__.py | 1 + nnvm/python/nnvm/testing/inception_v3.py | 255 ++++++++++++++++++ nnvm/python/nnvm/testing/squeezenet.py | 2 +- nnvm/src/compiler/graph_hash.cc | 2 +- .../frontend/mxnet/model_zoo/__init__.py | 9 +- .../frontend/mxnet/model_zoo/inception_v3.py | 170 ++++++++++++ .../tests/python/frontend/mxnet/test_graph.py | 11 +- 7 files changed, 442 insertions(+), 8 deletions(-) create mode 100644 nnvm/python/nnvm/testing/inception_v3.py create mode 100644 nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py index bff828d68280..4a879047ec7e 100644 --- a/nnvm/python/nnvm/testing/__init__.py +++ b/nnvm/python/nnvm/testing/__init__.py @@ -8,6 +8,7 @@ from . import resnet from . import vgg from . import squeezenet +from . import inception_v3 from . 
import dcgan from . import dqn from . import yolo2_detection diff --git a/nnvm/python/nnvm/testing/inception_v3.py b/nnvm/python/nnvm/testing/inception_v3.py new file mode 100644 index 000000000000..f14daa1ae656 --- /dev/null +++ b/nnvm/python/nnvm/testing/inception_v3.py @@ -0,0 +1,255 @@ +""" +Inception V3, suitable for images with around 299 x 299 + +Reference: +Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." +arXiv preprint arXiv:1512.00567 (2015). + +Adopted from https://github.com/apache/incubator-mxnet/blob/ + master/example/image-classification/symbols/inception-v3.py +""" +# pylint: disable=invalid-name,missing-docstring,unused-argument +from .. import symbol as sym +from .utils import create_workload + +def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''): + conv = sym.conv2d(data=data, channels=num_filter, kernel_size=kernel, + strides=stride, padding=pad, use_bias=False, + name='%s%s_conv2d' % (name, suffix)) + bn = sym.batch_norm(data=conv, name='%s%s_batchnorm' % (name, suffix), epsilon=2e-5) + act = sym.relu(data=bn, name='%s%s_relu' % (name, suffix)) + return act + +def Pooling(data, kernel, stride, pad, pool_type, name): + if pool_type == 'max': + return sym.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name) + elif pool_type == 'avg': + return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name, + count_include_pad=True) + else: + raise ValueError("Invalid pooling type: " + pool_type) + +def Inception7A(data, + num_1x1, + num_3x3_red, num_3x3_1, num_3x3_2, + num_5x5_red, num_5x5, + pool, proj, + name): + tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name)) + tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv') + tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), + suffix='_conv_1') + tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), 
suffix='_conv') + tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), + suffix='_conv_1') + tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), + suffix='_conv_2') + pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, + name=('%s_pool_%s_pool' % (pool, name))) + + cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv') + concat = sym.concatenate(*[tower_1x1, tower_5x5, tower_3x3, cproj], + name='ch_concat_%s_chconcat' % name) + return concat + +# First Downsample +def Inception7B(data, + num_3x3, + num_d3x3_red, num_d3x3_1, num_d3x3_2, + pool, + name): + tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), + name=('%s_conv' % name)) + tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), + name=('%s_tower' % name), suffix='_conv_1') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), + name=('%s_tower' % name), suffix='_conv_2') + pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0), pool_type="max", + name=('max_pool_%s_pool' % name)) + concat = sym.concatenate(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7C(data, + num_1x1, + num_d7_red, num_d7_1, num_d7_2, + num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), + name=('%s_tower' % name), suffix='_conv_1') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), + name=('%s_tower' % name), suffix='_conv_2') + tower_q7 = Conv(data=data, 
num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), + name=('%s_tower_1' % name), suffix='_conv_1') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), + name=('%s_tower_1' % name), suffix='_conv_2') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), + name=('%s_tower_1' % name), suffix='_conv_3') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), + name=('%s_tower_1' % name), suffix='_conv_4') + pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, + name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), + name=('%s_tower_2' % name), suffix='_conv') + # concat + concat = sym.concatenate(*[tower_1x1, tower_d7, tower_q7, cproj], + name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7D(data, + num_3x3_red, num_3x3, + num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3, + pool, + name): + tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), + suffix='_conv') + tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), + name=('%s_tower' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), + suffix='_conv') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), + name=('%s_tower_1' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), + name=('%s_tower_1' % name), suffix='_conv_2') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), + name=('%s_tower_1' % name), suffix='_conv_3') + pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, pad=(0, 0), + name=('%s_pool_%s_pool' % (pool, name))) + # concat + concat = 
sym.concatenate(*[tower_3x3, tower_d7_3x3, pooling], + name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7E(data, + num_1x1, + num_d3_red, num_d3_1, num_d3_2, + num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), + name=('%s_tower' % name), suffix='_mixed_conv') + tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), + name=('%s_tower' % name), suffix='_mixed_conv_1') + tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), + suffix='_conv') + tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), + name=('%s_tower_1' % name), suffix='_conv_1') + tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), + name=('%s_tower_1' % name), suffix='_mixed_conv') + tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), + name=('%s_tower_1' % name), suffix='_mixed_conv_1') + pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, + name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), + suffix='_conv') + # concat + concat = sym.concatenate( + *[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], + name='ch_concat_%s_chconcat' % name) + return concat + + +def get_symbol(num_classes=1000, **kwargs): + data = sym.Variable(name="data") + # stage 1 + conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv") + conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1") + conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2") + pool = Pooling(data=conv_2, kernel=(3, 3), 
stride=(2, 2), pool_type="max", pad=(0, 0), + name="pool") + # stage 2 + conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3") + conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4") + pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0), + name="pool1") + + # stage 3 + in3a = Inception7A(pool1, 64, + 64, 96, 96, + 48, 64, + "avg", 32, "mixed") + in3b = Inception7A(in3a, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_1") + in3c = Inception7A(in3b, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_2") + in3d = Inception7B(in3c, 384, + 64, 96, 96, + "max", "mixed_3") + # stage 4 + in4a = Inception7C(in3d, 192, + 128, 128, 192, + 128, 128, 128, 128, 192, + "avg", 192, "mixed_4") + in4b = Inception7C(in4a, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_5") + in4c = Inception7C(in4b, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_6") + in4d = Inception7C(in4c, 192, + 192, 192, 192, + 192, 192, 192, 192, 192, + "avg", 192, "mixed_7") + in4e = Inception7D(in4d, 192, 320, + 192, 192, 192, 192, + "max", "mixed_8") + # stage 5 + in5a = Inception7E(in4e, 320, + 384, 384, 384, + 448, 384, 384, 384, + "avg", 192, "mixed_9") + in5b = Inception7E(in5a, 320, + 384, 384, 384, + 448, 384, 384, 384, + "max", 192, "mixed_10") + # pool + pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", pad=(0, 0), + name="global_pool") + flatten = sym.flatten(data=pool, name="flatten") + fc1 = sym.dense(data=flatten, units=num_classes, name='fc1') + softmax = sym.softmax(data=fc1, name='softmax') + return softmax + +def get_workload(batch_size=1, num_classes=1000, + image_shape=(3, 299, 299), dtype="float32", **kwargs): + """Get benchmark workload for InceptionV3 + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of classes + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + 
kwargs : dict + Extra arguments + + Returns + ------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. + """ + net = get_symbol(num_classes=num_classes, **kwargs) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/python/nnvm/testing/squeezenet.py b/nnvm/python/nnvm/testing/squeezenet.py index a445e8cfb7da..eab2cf06fee6 100644 --- a/nnvm/python/nnvm/testing/squeezenet.py +++ b/nnvm/python/nnvm/testing/squeezenet.py @@ -98,7 +98,7 @@ def get_symbol(num_classes, version, **kwargs): def get_workload(batch_size=1, num_classes=1000, version='1.0', image_shape=(3, 224, 224), dtype="float32", **kwargs): - """Get benchmark workload for resnet + """Get benchmark workload for SqueezeNet Parameters ---------- diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc index d881130f72cc..ccd2e3ce433f 100644 --- a/nnvm/src/compiler/graph_hash.cc +++ b/nnvm/src/compiler/graph_hash.cc @@ -125,7 +125,7 @@ std::string GraphDeepCompare(const Graph& a, const IndexedGraph& idxb = b.indexed_graph(); std::ostringstream err; if (idxa.num_nodes() != idxb.num_nodes()) { - err << "Number of nodes mismatch"; + err << "Number of nodes mismatch (" << idxa.num_nodes() << " v.s " << idxb.num_nodes() << ")"; return err.str(); } if (idxa.num_node_entries() != idxb.num_node_entries()) { diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py index e3c9acdf23ef..66e743ad9c33 100644 --- a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py @@ -1,11 +1,8 @@ """MXNet and NNVM model zoo.""" from __future__ import absolute_import -from . import mlp, resnet, vgg, dqn, dcgan, squeezenet +from . 
import mlp, resnet, vgg, dqn, dcgan, squeezenet, inception_v3 import nnvm.testing -__all__ = ['mx_mlp', 'nnvm_mlp', 'mx_resnet', 'nnvm_resnet', 'mx_vgg', 'nnvm_vgg', - 'mx_squeezenet', 'nnvm_squeezenet'] - _num_class = 1000 # mlp fc @@ -35,6 +32,10 @@ mx_squeezenet[version] = squeezenet.get_symbol(version=version) nnvm_squeezenet[version] = nnvm.testing.squeezenet.get_workload(1, version=version)[0] +# inception +mx_inception_v3 = inception_v3.get_symbol() +nnvm_inception_v3 = nnvm.testing.inception_v3.get_workload(1)[0] + # dqn mx_dqn = dqn.get_symbol() nnvm_dqn = nnvm.testing.dqn.get_workload(1)[0] diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py new file mode 100644 index 000000000000..b8585bf05037 --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py @@ -0,0 +1,170 @@ +""" +Inception V3, suitable for images with around 299 x 299 + +Reference: +Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015). 
+ +Adopted from https://github.com/apache/incubator-mxnet/blob/ + master/example/image-classification/symbols/inception-v3.py +""" +import mxnet as mx +import numpy as np + +def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''): + conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix)) + bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix)) + act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix)) + return act + + +def Inception7A(data, + num_1x1, + num_3x3_red, num_3x3_1, num_3x3_2, + num_5x5_red, num_5x5, + pool, proj, + name): + tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name)) + tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv') + tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1') + tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') + tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv') + concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +# First Downsample +def Inception7B(data, + num_3x3, + num_d3x3_red, num_d3x3_1, num_d3x3_2, + pool, + name): + tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name)) + tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), 
name=('%s_tower' % name), suffix='_conv_1') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name)) + concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7C(data, + num_1x1, + num_d7_red, num_d7_1, num_d7_2, + num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2') + tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') + # concat + concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7D(data, 
+ num_3x3_red, num_3x3, + num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3, + pool, + name): + tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv') + tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + # concat + concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7E(data, + num_1x1, + num_d3_red, num_d3_1, num_d3_2, + num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv') + tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1') + tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') + tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, 
kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv') + tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') + # concat + concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def get_symbol(num_classes=1000, **kwargs): + data = mx.sym.Variable(name="data") + # stage 1 + conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv") + conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1") + conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2") + pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool") + # stage 2 + conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3") + conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4") + pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1") + + # # stage 3 + in3a = Inception7A(pool1, 64, + 64, 96, 96, + 48, 64, + "avg", 32, "mixed") + in3b = Inception7A(in3a, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_1") + in3c = Inception7A(in3b, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_2") + in3d = Inception7B(in3c, 384, + 64, 96, 96, + "max", "mixed_3") + # stage 4 + in4a = Inception7C(in3d, 192, + 128, 128, 192, + 128, 128, 128, 128, 192, + "avg", 192, "mixed_4") + in4b = Inception7C(in4a, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_5") + in4c = Inception7C(in4b, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_6") + in4d = Inception7C(in4c, 192, + 192, 192, 192, + 192, 192, 192, 192, 192, + "avg", 192, "mixed_7") + in4e = 
Inception7D(in4d, 192, 320, + 192, 192, 192, 192, + "max", "mixed_8") + # stage 5 + in5a = Inception7E(in4e, 320, + 384, 384, 384, + 448, 384, 384, 384, + "avg", 192, "mixed_9") + in5b = Inception7E(in5a, 320, + 384, 384, 384, + 448, 384, 384, 384, + "max", 192, "mixed_10") + # pool + pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool") + flatten = mx.sym.Flatten(data=pool, name="flatten") + fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False) + softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax') + return softmax diff --git a/nnvm/tests/python/frontend/mxnet/test_graph.py b/nnvm/tests/python/frontend/mxnet/test_graph.py index 18e124ad6ffc..e89224cd969e 100644 --- a/nnvm/tests/python/frontend/mxnet/test_graph.py +++ b/nnvm/tests/python/frontend/mxnet/test_graph.py @@ -39,17 +39,23 @@ def test_squeezenet(): nnvm_sym = model_zoo.nnvm_squeezenet[version] compare_graph(from_mx_sym, nnvm_sym) +def test_inception_v3(): + mx_sym = model_zoo.mx_inception_v3 + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = model_zoo.nnvm_inception_v3 + compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 3, 299, 299)) + def test_dqn(): mx_sym = model_zoo.mx_dqn from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) nnvm_sym = model_zoo.nnvm_dqn - compare_graph(from_mx_sym, nnvm_sym) + compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 4, 84, 84)) def test_dcgan(): mx_sym = model_zoo.mx_dcgan from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) nnvm_sym = model_zoo.nnvm_dcgan - compare_graph(from_mx_sym, nnvm_sym) + compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 100)) def test_multi_outputs(): def compose(F, **kwargs): @@ -70,3 +76,4 @@ def compose(F, **kwargs): test_dqn() test_dcgan() test_squeezenet() + test_inception_v3() From 834d6fe42d98332d7cc5ed1069c8cd674b286da9 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 15 Aug 2018 23:21:05 -0700 Subject: [PATCH 018/529] [TEAM] New reviewer: 
kevinthesun (#1606) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 2d571ba668ea..9db50b02b11a 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -26,6 +26,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909) - [Siva](https://github.com/srkreddy1238) - [Alex Weaver](https://github.com/alex-weaver) +- [Yao Wang](https://github.com/kevinthesun) - [Eddie Yan](https://github.com/eqy) - [Joshua Z. Zhang](https://github.com/zhreshold) From 2d7d220d06fd576802e5484264a1215d6f2f6af0 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 16 Aug 2018 14:05:08 -0700 Subject: [PATCH 019/529] [RUNTIME] Enable return NDArray in RPC (#1610) --- include/tvm/runtime/ndarray.h | 1 + src/api/api_base.cc | 8 +++ src/runtime/rpc/rpc_module.cc | 61 +++++++++++++++++++--- src/runtime/rpc/rpc_session.cc | 63 +++++++++++++++++++---- src/runtime/rpc/rpc_session.h | 1 + tests/python/unittest/test_runtime_rpc.py | 44 +++++++++++++++- 6 files changed, 160 insertions(+), 18 deletions(-) diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index d3ecce8ba9d0..c288ce5f3adb 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -246,6 +246,7 @@ struct NDArray::Container { private: friend class NDArray; + friend class RPCWrappedFunc; /*! * \brief The shape container, * can be used used for shape data. 
diff --git a/src/api/api_base.cc b/src/api/api_base.cc index 37970e69e24f..70301993ad3a 100644 --- a/src/api/api_base.cc +++ b/src/api/api_base.cc @@ -37,6 +37,14 @@ TVM_REGISTER_API("_nop") .set_body([](TVMArgs args, TVMRetValue *ret) { }); +// internal function used for debug and testing purposes +TVM_REGISTER_API("_ndarray_use_count") +.set_body([](TVMArgs args, TVMRetValue *ret) { + runtime::NDArray nd = args[0]; + // subtract the current one + *ret = (nd.use_count() - 1); + }); + TVM_REGISTER_API("_TVMSetStream") .set_body([](TVMArgs args, TVMRetValue *ret) { TVMSetStream(args[0], args[1], args[2]); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 251871bf0cc1..d6c56e1b7cf4 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -12,13 +12,13 @@ namespace tvm { namespace runtime { // Wrapped remote function to packed func. -struct RPCWrappedFunc { +class RPCWrappedFunc { public: RPCWrappedFunc(void* handle, std::shared_ptr sess) : handle_(handle), sess_(sess) { fwrap_ = PackedFunc([sess](TVMArgs args, TVMRetValue* rv) { - WrapRemote(sess, args.values[0].v_handle, args.type_codes[0], rv); + WrapRemote(sess, args, rv); }); } @@ -34,10 +34,47 @@ struct RPCWrappedFunc { } static void WrapRemote(std::shared_ptr sess, - void* handle, - int tcode, + TVMArgs args, TVMRetValue* rv); + // deleter of RPC remote array + static void RemoteNDArrayDeleter(NDArray::Container* ptr) { + RemoteSpace* space = static_cast(ptr->dl_tensor.data); + space->sess->CallRemote(RPCCode::kNDArrayFree, ptr->manager_ctx); + delete space; + delete ptr; + } + // wrap return value as remote NDArray. 
+ static NDArray WrapRemoteNDArray(std::shared_ptr sess, + DLTensor* tensor, + void* nd_handle) { + NDArray::Container* data = new NDArray::Container(); + data->manager_ctx = nd_handle; + data->deleter = RemoteNDArrayDeleter; + RemoteSpace* space = new RemoteSpace(); + space->sess = sess; + space->data = tensor->data; + data->dl_tensor.data = space; + NDArray ret(data); + // RAII now in effect + data->shape_ = std::vector( + tensor->shape, tensor->shape + tensor->ndim); + data->dl_tensor.shape = dmlc::BeginPtr(data->shape_); + data->dl_tensor.ndim = static_cast(data->shape_.size()); + // setup dtype + data->dl_tensor.dtype = tensor->dtype; + // setup ctx, encode as remote session + data->dl_tensor.ctx.device_id = tensor->ctx.device_id; + data->dl_tensor.ctx.device_type = static_cast( + static_cast(tensor->ctx.device_type) + + kRPCSessMask * (sess->table_index() + 1)); + // check strides. + CHECK(tensor->strides == nullptr); + // setup byteoffset + data->dl_tensor.byte_offset = tensor->byte_offset; + return ret; + } + private: PackedFunc fwrap_; void* handle_{nullptr}; @@ -126,20 +163,28 @@ class RPCModuleNode final : public ModuleNode { }; void RPCWrappedFunc::WrapRemote(std::shared_ptr sess, - void* handle, - int tcode, + TVMArgs args, TVMRetValue *rv) { + void* handle = args.values[0].v_handle; + int tcode = args.type_codes[0]; + if (handle == nullptr) return; if (tcode == kFuncHandle) { auto wf = std::make_shared(handle, sess); *rv = PackedFunc([wf](TVMArgs args, TVMRetValue* rv) { return wf->operator()(args, rv); }); - } else { - CHECK_EQ(tcode, kModuleHandle); + } else if (tcode == kModuleHandle) { std::shared_ptr n = std::make_shared(handle, sess); *rv = Module(n); + } else if (tcode == kArrayHandle || tcode == kNDArrayContainer) { + CHECK_EQ(args.size(), 2); + DLTensor* tensor = args[0]; + void* nd_handle = args[1]; + *rv = WrapRemoteNDArray(sess, tensor, nd_handle); + } else { + LOG(FATAL) << "Cannot wrap tcode=" << tcode; } } diff --git 
a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 21fff7b29882..6bb01b9bd459 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -130,19 +130,22 @@ class RPCSession::EventHandler : public dmlc::Stream { break; } case kReturnReceived: { - CHECK_EQ(arg_buf_->value.size(), 1U); + CHECK_GE(arg_buf_->value.size(), 1U); + TVMArgValue argv = arg_buf_->AsTVMArgs()[0]; if (argv.type_code() == kFuncHandle || - argv.type_code() == kModuleHandle) { + argv.type_code() == kModuleHandle || + argv.type_code() == kArrayHandle) { CHECK(fwrap != nullptr) << "function/module wrapper not available"; fwrap->CallPacked(arg_buf_->AsTVMArgs(), rv); } else { + CHECK_EQ(arg_buf_->value.size(), 1U); *rv = argv; } arg_buf_.reset(); this->SwitchToState(kRecvCode); std::swap(client_mode_, client_mode); - return RPCCode::kReturn; + return RPCCode::kReturn; } case kCopyAckReceived: { std::swap(client_mode_, client_mode); @@ -172,15 +175,22 @@ class RPCSession::EventHandler : public dmlc::Stream { ctx.device_type = static_cast(dev_type % kRPCSessMask); return ctx; } - // send Packed sequence to writer. - void SendPackedSeq(const TVMValue* arg_values, const int* type_codes, int n) { + // Send Packed sequence to writer. + // return_ndarray is a special flag to handle returning of ndarray + // In this case, we return the shape, context and data of the array, + // as well as a customized PackedFunc that handles deletion of + // the array in the remote. + void SendPackedSeq(const TVMValue* arg_values, + const int* type_codes, + int n, + bool return_ndarray = false) { this->Write(n); - // only handles . for (int i = 0; i < n; ++i) { int tcode = type_codes[i]; if (tcode == kNDArrayContainer) tcode = kArrayHandle; this->Write(tcode); } + // Argument packing. 
for (int i = 0; i < n; ++i) { int tcode = type_codes[i]; @@ -215,9 +225,23 @@ class RPCSession::EventHandler : public dmlc::Stream { case kNDArrayContainer: case kArrayHandle: { DLTensor* arr = static_cast(value.v_handle); - TVMContext ctx = StripSessMask(arr->ctx); - uint64_t data = reinterpret_cast( - static_cast(arr->data)->data); + TVMContext ctx; + uint64_t data; + if (!return_ndarray) { + // in the client mode + // ctx contains the remote table index + // the space is wrapped by an RemoteSpace + // that holds reference to the session. + ctx = StripSessMask(arr->ctx); + data = reinterpret_cast( + static_cast(arr->data)->data); + } else { + // When we return NDArray, we directly return + // the space and the context + // The client will be further wrapping + ctx = arr->ctx; + data = reinterpret_cast(arr->data); + } this->Write(data); this->Write(ctx); this->Write(arr->ndim); @@ -701,6 +725,21 @@ class RPCSession::EventHandler : public dmlc::Stream { << "Only server can send function and module handle back."; rv.MoveToCHost(&ret_value, &ret_tcode); SendPackedSeq(&ret_value, &ret_tcode, 1); + } else if (rv.type_code() == kNDArrayContainer) { + // always send handle in 64 bit. + CHECK(!client_mode_) + << "Only server can send NDArray back"; + // We follow a special protocol to return NDArray to client side + // The first pack value is the NDArray handle as DLTensor + // The second pack value is a customized deleter that deletes the NDArray. 
+ TVMValue ret_value_pack[2]; + int ret_tcode_pack[2]; + rv.MoveToCHost(&ret_value_pack[0], &ret_tcode_pack[0]); + + NDArray::Container* nd = static_cast(ret_value_pack[0].v_handle); + ret_value_pack[1].v_handle = nd; + ret_tcode_pack[1] = kHandle; + SendPackedSeq(ret_value_pack, ret_tcode_pack, 2, true); } else { ret_value = rv.value(); ret_tcode = rv.type_code(); @@ -1090,6 +1129,11 @@ void RPCModuleGetSource(TVMArgs args, TVMRetValue *rv) { *rv = (*static_cast(mhandle))->GetSource(fmt); } +void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) { + void* handle = args[0]; + static_cast(handle)->DecRef(); +} + void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) { PackedFunc *pf = static_cast(args[0].operator void*()); void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3])); @@ -1138,6 +1182,7 @@ void RPCSession::EventHandler::HandlePackedCall() { case RPCCode::kModuleFree: CallHandler(RPCModuleFree); break; case RPCCode::kModuleGetFunc: CallHandler(RPCModuleGetFunc); break; case RPCCode::kModuleGetSource: CallHandler(RPCModuleGetSource); break; + case RPCCode::kNDArrayFree: CallHandler(RPCNDArrayFree); break; default: LOG(FATAL) << "Unknown event " << static_cast(code_); } CHECK_EQ(state_, kRecvCode); diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 68f6763ae6db..4b736de0e041 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -48,6 +48,7 @@ enum class RPCCode : int { kModuleFree, kModuleGetFunc, kModuleGetSource, + kNDArrayFree }; /*! 
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index e7c0cc1bbabd..0de788068b6b 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -175,6 +175,7 @@ def test_rpc_return_func(): @tvm.register_func("rpc.test.remote_func") def addone(x): return lambda y: x+y + server = rpc.Server("localhost", key="x1") client = rpc.connect(server.host, server.port, key="x1") f1 = client.get_function("rpc.test.remote_func") @@ -182,6 +183,46 @@ def addone(x): assert fadd(12) == 22 +def test_rpc_return_ndarray(): + # Use closure to check the ref counter correctness + nd = tvm.nd.array(np.zeros(10).astype("float32")) + @tvm.register_func("rpc.test.remote_return_nd") + def my_module(name): + if name == "get_arr": + return lambda : nd + elif name == "ref_count": + return lambda : tvm._api_internal._ndarray_use_count(nd) + elif name == "get_elem": + return lambda idx: nd.asnumpy()[idx] + elif name == "get_arr_elem": + return lambda arr, idx: arr.asnumpy()[idx] + + # start server + server = rpc.Server("localhost", key="x1") + client = rpc.connect(server.host, server.port, key="x1") + m = client.get_function("rpc.test.remote_return_nd") + get_arr = m("get_arr") + ref_count = m("ref_count") + get_elem = m("get_elem") + get_arr_elem = m("get_arr_elem") + # array test + def run_arr_test(): + arr = get_arr() + assert ref_count() == 2 + arr2 = get_arr() + assert ref_count() == 3 + assert arr.context == client.cpu(0) + arr.copyfrom(np.ones(10).astype(arr.dtype)) + assert arr2.asnumpy()[0] == 1.0 + assert get_elem(0) == 1.0 + assert get_arr_elem(arr2, 0) == 1.0 + + assert ref_count() == 1 + run_arr_test() + # check recycle correctness + assert ref_count() == 1 + + def test_local_func(): @tvm.register_func("rpc.test.remote_func2") def addone(x): @@ -199,9 +240,10 @@ def addone(x): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) + test_rpc_return_ndarray() + 
test_rpc_return_func() test_bigendian_rpc() test_rpc_remote_module() - test_rpc_return_func() test_rpc_file_exchange() test_rpc_array() test_rpc_simple() From 5435def2bfa2594d704a2f5ad005880fc5c72779 Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Thu, 16 Aug 2018 16:35:23 -0700 Subject: [PATCH 020/529] [NNVM] Add ONNX upsample converter (#1591) --- nnvm/python/nnvm/frontend/onnx.py | 19 ++++++++ .../python/frontend/onnx/test_forward.py | 47 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py index fa26648b293a..f62202a37dff 100644 --- a/nnvm/python/nnvm/frontend/onnx.py +++ b/nnvm/python/nnvm/frontend/onnx.py @@ -406,6 +406,24 @@ def _impl(inputs, attr, params): return _impl +class Upsample(OnnxOpConverter): + """ Operator converter for Upsample (nearest mode). + """ + + @classmethod + def _impl_v7(cls, inputs, attr, params): + scales = attr.get('scales') + assert len(scales) == 4 and scales[0] == 1.0 and scales[1] == 1.0 and scales[2] == scales[3] + mode = attr.get('mode') + if mode == b'nearest': + method = "NEAREST_NEIGHBOR" + elif mode == b'linear': + method = "BILINEAR" + else: + raise ValueError("Invalid ONNX upsample mode: {}".format(mode)) + return _sym.upsampling(inputs[0], scale=int(scales[-1]), method=method, layout='NCHW') + + class Shape(OnnxOpConverter): """ Operator converter for Shape. 
""" @@ -540,6 +558,7 @@ def _get_convert_map(opset): # 'Crop' # 'Embedding' # 'Upsample' + 'Upsample' : Upsample.get_converter(opset), 'SpatialBN': BatchNorm.get_converter(opset), # defs/generator diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py index 9fb3aed2da10..3f2fbb144289 100644 --- a/nnvm/tests/python/frontend/onnx/test_forward.py +++ b/nnvm/tests/python/frontend/onnx/test_forward.py @@ -1,6 +1,8 @@ import numpy as np import math import nnvm +import topi +import topi.testing import tvm from tvm.contrib import graph_runtime from nnvm.testing.config import ctx_list @@ -380,6 +382,50 @@ def test_lrn(): verify_lrn((5, 5, 5, 5), 3, 'float32') verify_lrn((5, 5, 5, 5), 3, 'float32', alpha=0.0002, beta=0.5, bias=2.0) +def _test_upsample_nearest(): + scale = 2 + in_shape = (1, 1, 3, 3) + out_shape = (1, 1, 3*scale, 3*scale) + y = helper.make_node("Upsample", ['in'], ['out'], mode='nearest', scales=[1.0, 1.0, 2.0, 2.0]) + + in_array = np.random.uniform(size=in_shape).astype(np.float32) + out_array = topi.testing.upsampling_python(in_array, scale, "NCHW") + + graph = helper.make_graph([y], + 'upsample_nearest_test', + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='upsample_nearest_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32') + np.testing.assert_allclose(out_array, tvm_out) + +def _test_upsample_bilinear(): + scale = 2 + in_shape = (1, 1, 3, 3) + out_shape = (1, 1, 3*scale, 3*scale) + y = helper.make_node("Upsample", ['in'], ['out'], mode='linear', scales=[1.0, 1.0, 2.0, 2.0]) + + in_array = np.random.uniform(size=in_shape).astype(np.float32) + out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW") + + graph = 
helper.make_graph([y], + 'upsample_bilinear_test', + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='upsample_bilinear_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32') + np.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) + +def test_upsample(): + _test_upsample_nearest() + _test_upsample_bilinear() + if __name__ == '__main__': # verify_super_resolution_example() @@ -398,3 +444,4 @@ def test_lrn(): test_matmul() test_gather() test_lrn() + test_upsample() From 1d7ef11f577fa876d2823e52283f92e973ccbd4a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 16 Aug 2018 16:37:03 -0700 Subject: [PATCH 021/529] add -mattr=+neon for all arm cpu target (#1612) --- apps/benchmark/README.md | 11 +++++++---- python/tvm/target.py | 16 ++++++++-------- tutorials/nnvm/deploy_model_on_rasp.py | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md index e83e47c46eb7..ee22f90dc435 100644 --- a/apps/benchmark/README.md +++ b/apps/benchmark/README.md @@ -63,8 +63,11 @@ python3 -m tvm.exec.rpc_tracker python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro ``` - If your device has a same SoC of the above device, you can reuse these parameters - (e.g. use `llvm -device=arm_cpu -mode=rk3399 -target=aarch64-linux-gnu` as target). - Otherwise, you need to tune for your own device, please follow this - [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html). + If your device has a same or similar SoC of the above devices, you can reuse these parameters. 
+ For example, if your SoC is similar to rasp3b, use + ```bash + python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key your_custom_key + ``` + For other devices, to get the best performance, it is recommended that you tune your network by yourself. + Please follow this [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html). diff --git a/python/tvm/target.py b/python/tvm/target.py index fed20c3914c6..e2d780f75264 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -427,14 +427,14 @@ def arm_cpu(model='unknown', options=None): from . import autotvm trans_table = { - "pixel2": ["-model=snapdragon835", "-target=arm64-linux-android"], - "mate10": ["-model=kirin970", "-target=arm64-linux-android"], - "mate10pro": ["-model=kirin970", "-target=arm64-linux-android"], - "p20": ["-model=kirin970", "-target=arm64-linux-android"], - "p20pro": ["-model=kirin970", "-target=arm64-linux-android"], - "rasp3b": ["-model=bcm2837", "-target=armv7l-linux-gnueabihf"], - "rk3399": ["-model=rk3399", "-target=aarch64-linux-gnu"], - "pynq": ["-model=pynq", "-target=armv7a-linux-eabi"], + "pixel2": ["-model=snapdragon835", "-target=arm64-linux-android -mattr=+neon"], + "mate10": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"], + "mate10pro": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"], + "p20": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"], + "p20pro": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"], + "rasp3b": ["-model=bcm2837", "-target=armv7l-linux-gnueabihf -mattr=+neon"], + "rk3399": ["-model=rk3399", "-target=aarch64-linux-gnu -mattr=+neon"], + "pynq": ["-model=pynq", "-target=armv7a-linux-eabi -mattr=+neon"], } pre_defined_opt = trans_table.get(model, ["-model=%s" % model]) diff --git a/tutorials/nnvm/deploy_model_on_rasp.py b/tutorials/nnvm/deploy_model_on_rasp.py index c11f202c1251..40dbdaeb00ee 100644 --- a/tutorials/nnvm/deploy_model_on_rasp.py +++ 
b/tutorials/nnvm/deploy_model_on_rasp.py @@ -154,7 +154,7 @@ def transform_image(image): else: target = tvm.target.arm_cpu('rasp3b') # The above line is a simple form of - # target = tvm.target.create('llvm -devcie=arm_cpu -target=armv7l-linux-gnueabihf') + # target = tvm.target.create('llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon') with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']): graph, lib, params = nnvm.compiler.build( From 5c5ad860fdafb38a55555d8d0775b0d049adb0da Mon Sep 17 00:00:00 2001 From: Ashok Emani Date: Thu, 16 Aug 2018 19:19:43 -0700 Subject: [PATCH 022/529] fix output_shape in conv2d_nchw (#1613) --- topi/include/topi/nn.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h index 53b899796e37..4a537a646425 100644 --- a/topi/include/topi/nn.h +++ b/topi/include/topi/nn.h @@ -265,7 +265,7 @@ inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I, auto pW = I->shape[3]; tvm::Array output_shape{ I->shape[0], // B - W->shape[1], // O + W->shape[0], // O (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1, // H (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1 // W }; From 6bc2b63e757b39d0910aabf960cb242b6252308f Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 17 Aug 2018 11:37:56 +0900 Subject: [PATCH 023/529] [NNVM] Bug fix Prevent fusing convolution with injective op (#1608) --- nnvm/src/compiler/graph_fuse.cc | 31 +++++++++++++++++- nnvm/tests/python/compiler/test_op_fusion.py | 34 ++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc index 52a8ae44f8ee..f65312be1a29 100644 --- a/nnvm/src/compiler/graph_fuse.cc +++ b/nnvm/src/compiler/graph_fuse.cc @@ -63,12 +63,16 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { // Check if we can fuse to the master. 
int chosen_master = -1; bool ewise = inode.source->num_outputs() == 1; + bool mark_as_injective = false; for (const auto& e : inode.inputs) { if (fuse_vec[e.node_id] == FuseRule::kUknown) { TOpPattern ipt = pattern_vec[e.node_id]; if (ipt != kElemWise) ewise = false; - if (ipt <= kInjective) { + if (ipt <= kBroadcast) { + fuse_vec[e.node_id] = FuseRule::kFuseToMaster; + } else if (ipt == kInjective) { fuse_vec[e.node_id] = FuseRule::kFuseToMaster; + mark_as_injective = true; } else if (ipt == kOutEWiseFusable && chosen_master == -1 && shape_vec[idx.entry_id(nid, 0)] == shape_vec[idx.entry_id(e)]) { @@ -87,6 +91,8 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { master_vec[nid] = chosen_master; if (chosen_master != -1) { pt = kOutEWiseFusable; + } else if (mark_as_injective) { + pt = kInjective; } else { pt = ewise ? kElemWise : kBroadcast; } @@ -135,8 +141,31 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { if (group_vec[nid] == -1) { group_vec[nid] = nid; } + + // Check if injective op and out_ewise_fusable op (e.g. conv2d) are in the same group. + bool parent_out_ewise = false; + bool parent_injective = false; + for (const auto& e : inode.inputs) { + TOpPattern pt = pattern_vec[e.node_id]; + if (pt == kOutEWiseFusable) { + parent_out_ewise = true; + } else if (pt == kInjective) { + parent_injective = true; + } + } + // Change the master node from out_ewise_fusable op to itself + if (parent_injective && parent_out_ewise) master_vec[nid] = nid; + // Propagate the group id. 
for (const auto& e : inode.inputs) { + TOpPattern pt = pattern_vec[e.node_id]; + if (parent_out_ewise && parent_injective) { + if (pt == kOutEWiseFusable) { + continue; // Do not fuse out_ewise_fusable op + } else if (pt == kInjective) { + master_vec[e.node_id] = nid; + } + } if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) { CHECK(group_vec[e.node_id] == -1|| group_vec[e.node_id] == group_vec[nid]); diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py index 8d05ae02c579..5f4da3865a45 100644 --- a/nnvm/tests/python/compiler/test_op_fusion.py +++ b/nnvm/tests/python/compiler/test_op_fusion.py @@ -77,6 +77,39 @@ def test_injective_reduce_injective(): np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) +def test_injective_conv2d(): + channels = 16 + data = sym.Variable(name="data") + pool = sym.global_avg_pool2d(data=data) + weight = sym.reshape(pool, shape=[1, channels, 1, 1]) + residual = sym.conv2d(data=data, kernel_size=(3,3), channels=channels, padding=(1, 1), + layout="NCHW", kernel_layout="OIHW", use_bias=False, name="conv") + net = weight * data + residual + size = 56 + dtype="float32" + dshape = (1, channels, size, size) + kshape = (channels, channels, 3, 3) + oshape = dshape + shape_dict = {"data": dshape} + + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(net, target, shape_dict) + # data, global_avg_pool, conv weight, conv op, fused elemwise add + assert graph.index.num_nodes == 5 + + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + m = graph_runtime.create(graph, lib, ctx) + m.run(data=data, conv_weight=kernel) + # get output + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + residual = topi.testing.conv2d_nchw_python( + data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME') + weight = np.mean(data.asnumpy(), axis=(2, 3)) + c_np = weight[:, :, np.newaxis, np.newaxis] * data.asnumpy() + 
residual + np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + + def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2): with nnvm.compiler.build_config(opt_level=opt_level): graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params) @@ -123,3 +156,4 @@ def get_sym(out_channel): test_ewise_injective() test_conv_ewise_injective() test_fuse_conv2d_elu() + test_injective_conv2d() From e3d094d92f1402359fe9e639c09c89781a5c7475 Mon Sep 17 00:00:00 2001 From: Sergey Mironov Date: Sat, 18 Aug 2018 07:40:52 +0300 Subject: [PATCH 024/529] [NNVM] TF: Add Pack operation (#1570) --- nnvm/include/nnvm/top/tensor.h | 2 +- nnvm/python/nnvm/frontend/tensorflow.py | 9 ++++++ nnvm/src/top/tensor/transform.cc | 19 ++++++------ .../frontend/tensorflow/test_forward.py | 29 ++++++++++++++++++- 4 files changed, 48 insertions(+), 11 deletions(-) diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h index 22ee9d7118e6..53ed5b3b0a22 100644 --- a/nnvm/include/nnvm/top/tensor.h +++ b/nnvm/include/nnvm/top/tensor.h @@ -16,7 +16,7 @@ namespace top { struct ConcatenateParam : public dmlc::Parameter { int axis; DMLC_DECLARE_PARAMETER(ConcatenateParam) { - DMLC_DECLARE_FIELD(axis).set_lower_bound(0).set_default(1) + DMLC_DECLARE_FIELD(axis).set_default(1) .describe("the axis to be concated."); } }; diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index d761e34c7c59..092b8fa20219 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -339,6 +339,14 @@ def _impl(inputs, attr, params): extras={'axis': axis.asnumpy()[0]})(inputs, attr) return _impl +def _pack(): + def _impl(inputs, attr, params): + axis = int(attr["axis"]) + inputs_reshaped = [_sym.expand_dims(i, axis=axis, num_newaxis=1) for i in inputs] + return _sym.concatenate(*inputs_reshaped, axis=axis) + + return _impl + def _reshape(): def _impl(inputs, attr, 
params): try: @@ -673,6 +681,7 @@ def _impl(inputs, attr, params): 'Minimum' : _elemwise('min'), 'Sum' : _sum(), 'Square' : _square(), + 'Pack' : _pack(), 'Relu' : AttrCvt('relu'), 'Reshape' : _reshape(), 'ResizeBilinear' : _resize_bilinear(), diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 78255d20f040..52dca5654838 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -93,23 +93,24 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs, TShape dshape; dim_t size = 0; bool has_zero = false; + int axis = param.axis >= 0 ? param.axis : in_shape->at(0).ndim() + param.axis; for (size_t i = 0; i < in_shape->size(); ++i) { TShape tmp = (*in_shape)[i]; if (tmp.ndim()) { - CHECK_LT(static_cast(param.axis), tmp.ndim()) - << "concat dim " << param.axis << " out of range of input shape " << tmp; - has_zero = tmp[param.axis] == 0 || has_zero; - size += tmp[param.axis]; - tmp[param.axis] = 0; + CHECK_LT(static_cast(axis), tmp.ndim()) + << "concat dim " << axis << " out of range of input shape " << tmp; + has_zero = tmp[axis] == 0 || has_zero; + size += tmp[axis]; + tmp[axis] = 0; shape_assign(&dshape, tmp); } } TShape tmp = (*out_shape)[0]; if (tmp.ndim()) { - CHECK_LT(static_cast(param.axis), tmp.ndim()) - << "concat dim " << param.axis << " out of range of input shape " << tmp; - tmp[param.axis] = 0; + CHECK_LT(static_cast(axis), tmp.ndim()) + << "concat dim " << axis << " out of range of input shape " << tmp; + tmp[axis] = 0; shape_assign(&dshape, tmp); } @@ -119,7 +120,7 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs, NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, i, dshape); } - if (!has_zero) dshape[param.axis] = size; + if (!has_zero) dshape[axis] = size; NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, dshape); return dshape.Size() != 0; } diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index 
64c57c126f8d..6fa020a03444 100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -342,7 +342,7 @@ def _test_argx(func, data, **kwargs): compare_tf_with_tvm(data, 'c0:0', 'argx0:0') -def test_argmin_argmax(): +def test_forward_argminmax(): for axis in [None,0,1,2]: data = np.random.uniform(size=(8,4,9)).astype('float32') _test_argx(tf.argmax, data=data, axis=axis) @@ -555,6 +555,31 @@ def test_forward_lstm(): _test_lstm_cell(1, 2, 1, 0.0, 'float32') + + +####################################################################### +# Pack +# --- +def _test_pack(axis, shape, **kwargs): + + a = np.arange(np.prod(shape), dtype=np.float32).reshape(shape) + b = np.arange(np.prod(shape), dtype=np.float32).reshape(shape) + + with tf.Graph().as_default(): + tf_a = array_ops.placeholder(shape=shape, dtype='float32', name='pl_a') + tf_b = array_ops.placeholder(shape=shape, dtype='float32', name='pl_b') + tf_c = tf.stack([tf_a,tf_b], axis=axis, **kwargs) + assert tf_c.op.op_def.name == 'Pack', "tf.stack() is expected to produce 'Pack' operation" + + compare_tf_with_tvm([a,b], ['pl_a:0','pl_b:0'], 'stack:0') + +def test_forward_pack(): + for axis in range(-3,3): + _test_pack(axis, [3,2,1]) + for axis in range(-1,1): + _test_pack(axis, [3]) + _test_pack(0, []) + ####################################################################### # Pad # --- @@ -818,9 +843,11 @@ def test_forward_l2_normalize(): test_forward_reshape() test_forward_squeeze() test_forward_sigmoid() + test_forward_argminmax() if tf.__version__ == '1.4.1': _test_forward_concat_v2() test_forward_multi_input() + test_forward_pack() test_forward_inception_v3() test_forward_inception_v1() test_forward_mobilenet() From d7df07fffc2fbe5d8fc60d23d030749f3bce4094 Mon Sep 17 00:00:00 2001 From: xqdan Date: Sun, 19 Aug 2018 02:18:29 +0800 Subject: [PATCH 025/529] #1592 [PASS] Fix missing mem CHECK in storage_rewrite (#1616) --- src/pass/storage_rewrite.cc 
| 6 ++ .../unittest/test_pass_storage_rewrite.py | 63 ++++++++++++------- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 0170499e1491..877216ed7656 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -584,6 +584,12 @@ class StoragePlanRewriter : public IRMutator { e->new_alloc = Allocate::make( e->alloc_var, alloc_type, {combo_size}, const_true(), Evaluate::make(0)); + if (e->scope.tag.length() != 0) { + MemoryInfo info = GetMemoryInfo(e->scope.to_string()); + uint64_t total_elem = e->const_nbits / e->elem_type.bits(); + CHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) + << "Allocation exceed bound of memory tag " << e->scope.to_string(); + } } } } diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index 2bb02998982f..3c07a1f26aff 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -28,15 +28,30 @@ def verify(n): tvm.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 1 +def register_mem(scope_tb, max_bits): + #Register mem + @tvm.register_func("tvm.info.mem.%s" % scope_tb) + def mem_info_inp_buffer(): + return tvm.make.node("MemoryInfo", + unit_bits= 16, + max_simd_bits=32, + max_num_bits=max_bits, + head_address=None) + def test_alloc_seq(): + scope_tb = "local.L0A" + max_bits = 1024 * 1024 * 1024 + + register_mem(scope_tb, max_bits) + ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: - A = ib.allocate("float32", 200, name="A", scope="local.L0A") + A = ib.allocate("float32", 200, name="A", scope=scope_tb) A[j] = 1.2 with ib.for_range(0, 10, name="j") as j: - A = ib.allocate("float32", 200, name="B", scope="local.L0A") + A = ib.allocate("float32", 200, name="B", scope=scope_tb) A[j] = 1.3 body = ib.get() @@ -233,16 +248,9 @@ 
def test_parallel_alloc(): assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate)) -def test_inplace_rule2(): +def test_inplace_rule2(scope_tb = "local_TB2", max_bits = 1024 * 1024 * 1024): #Test Buffer - scope_tb = "local_TB2" - @tvm.register_func("tvm.info.mem.%s" % scope_tb) - def mem_info_inp_buffer(): - return tvm.make.node("MemoryInfo", - unit_bits= 16, - max_simd_bits=32, - max_num_bits=1024*1024*1024, - head_address=None) + register_mem(scope_tb, max_bits) m = 10 A = tvm.placeholder((m,), name='A') C = tvm.placeholder((m,), name='C') @@ -275,16 +283,23 @@ def verify(n): tvm.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 2 +def test_exceed_mem(): + max_bits = 639 + # The critical max_num_bits is between 639 and 640 + loc = -1 + try: + test_inplace_rule2("local_TEM", max_bits) + except Exception as e: + estr = str(e) + loc = estr.find('Allocation exceed bound of memory') + assert loc != -1 + def test_inplace_rule3(): #Test Buffer scope_tb = "local_TB3" - @tvm.register_func("tvm.info.mem.%s" % scope_tb) - def mem_info_inp_buffer(): - return tvm.make.node("MemoryInfo", - unit_bits= 16, - max_simd_bits=32, - max_num_bits=1024*1024*1024, - head_address=None) + max_bits=1024 * 1024 * 1024 + + register_mem(scope_tb, max_bits) m = 10 B0 = tvm.placeholder((m,), name='B0') B1 = tvm.placeholder((m,), name='B1') @@ -388,17 +403,22 @@ def verify(n): assert num_alloc[0] == 1 def test_alloc_seq_type2(): + scope_tb = "local.L0A2" + max_bits=1024 * 1024 * 1024 + + register_mem(scope_tb, max_bits) + ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: - A = ib.allocate("float32", 200, name="A", scope="local.L0A") + A = ib.allocate("float32", 200, name="A", scope=scope_tb) A[j] = 1.2 with ib.for_range(0, 20, name="j") as j: - B = ib.allocate("int16", 400, name="B", scope="local.L0A") + B = ib.allocate("int16", 400, name="B", scope=scope_tb) B[j] = tvm.const(1, "int16") with 
ib.for_range(0, 10, name="j") as j: - C = ib.allocate("float32", 200, name="C", scope="local.L0A") + C = ib.allocate("float32", 200, name="C", scope=scope_tb) C[j] = 1.2 body = ib.get() @@ -465,6 +485,7 @@ def test_replace_dataflow(): test_storage_combine() test_storage_share_gpu() test_inplace_rule2() + test_exceed_mem() test_inplace_rule3() test_alloc_seq_type() test_alloc_seq_type2() From 566f18544875ace6e2597693af7f44d4857aa30e Mon Sep 17 00:00:00 2001 From: Siju Date: Sat, 18 Aug 2018 23:50:58 +0530 Subject: [PATCH 026/529] =?UTF-8?q?[FRONTEND][COREML]MultiplyLayerParams?= =?UTF-8?q?=20L2NormalizeLayerParams=20and=20UpsampleLayerParams=20support?= =?UTF-8?q?=20=E2=80=A6=20(#1511)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nnvm/python/nnvm/frontend/coreml.py | 22 ++ .../python/frontend/coreml/test_forward.py | 190 +++++++++++++++++- 2 files changed, 211 insertions(+), 1 deletion(-) diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py index e80cfe23f220..3ca76bb0b20e 100644 --- a/nnvm/python/nnvm/frontend/coreml.py +++ b/nnvm/python/nnvm/frontend/coreml.py @@ -217,6 +217,16 @@ def AddLayerParams(op, insyms, symtab): ret = _sym.__add_scalar__(ret, scalar=op.alpha) return ret +def MultiplyLayerParams(op, insyms, symtab): + if not isinstance(insyms, list): + insyms = [insyms] + ret = insyms[0] + for i in range(1, len(insyms)): + ret = _sym.elemwise_mul(ret, insyms[i]) + if op.alpha != 1: + ret = _sym.__mul_scalar__(ret, scalar=op.alpha) + return ret + def ConcatLayerParams(op, insyms, symtab): if not isinstance(insyms, list): insyms = [insyms] @@ -249,6 +259,15 @@ def PermuteLayerParams(op, insym, symtab): axes = tuple(op.axis) return _sym.transpose(insym, axes=axes) +def UpsampleLayerParams(op, insym, symtab): + if op.scalingFactor[0] != op.scalingFactor[1]: + raise NotImplementedError("Upsampling only supported with same \ + height and width scaling factor.") + 
interpolationMode = 'NEAREST_NEIGHBOR' if op.mode == 0 else 'BILINEAR' + return _sym.upsampling(insym, scale=op.scalingFactor[0], method=interpolationMode) + +def L2NormalizeLayerParams(op, insym, symtab): + return _sym.l2_normalize(insym, eps=op.epsilon, axis=1) _convert_map = { 'NeuralNetworkMeanImage': NeuralNetworkMeanImage, @@ -261,10 +280,13 @@ def PermuteLayerParams(op, insym, symtab): 'SoftmaxLayerParams':SoftmaxLayerParams, 'InnerProductLayerParams':InnerProductLayerParams, 'AddLayerParams':AddLayerParams, + 'MultiplyLayerParams':MultiplyLayerParams, 'FlattenLayerParams':FlattenLayerParams, 'ConcatLayerParams':ConcatLayerParams, 'PaddingLayerParams':PaddingLayerParams, 'PermuteLayerParams':PermuteLayerParams, + 'UpsampleLayerParams':UpsampleLayerParams, + 'L2NormalizeLayerParams':L2NormalizeLayerParams } def coreml_op_to_nnvm(op, inname, outname, symtab): diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py index d5c460e56987..27ae28c20ab9 100644 --- a/nnvm/tests/python/frontend/coreml/test_forward.py +++ b/nnvm/tests/python/frontend/coreml/test_forward.py @@ -1,8 +1,12 @@ import numpy as np -import topi +from coremltools.models.neural_network import NeuralNetworkBuilder +from coremltools.models import datatypes + import tvm from tvm.contrib import graph_runtime +import topi +import topi.testing import nnvm.symbol as sym import nnvm.compiler from nnvm.testing.config import ctx_list @@ -40,6 +44,190 @@ def test_resnet50_checkonly(): model_file = model_zoo.get_resnet50() test_model_checkonly(model_file, 'resnet50') +def run_tvm_graph(graph_def, input_data, input_name, output_shape, output_dtype='float32'): + """ Generic function to compile on nnvm and execute on tvm """ + + sym, params = nnvm.frontend.from_coreml(graph_def) + target = 'llvm' + if isinstance(input_data, list): + shape_dict = {} + dtype_dict = {} + for i, e in enumerate(input_name): + shape_dict[e] = input_data[i].shape + 
dtype_dict[e] = input_data[i].dtype + else: + shape_dict = {input_name: input_data.shape} + dtype_dict = {input_name: input_data.dtype} + + graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, + dtype=dtype_dict, params=params) + + ctx = tvm.cpu(0) + from tvm.contrib import graph_runtime + m = graph_runtime.create(graph, lib, ctx) + # set inputs + if isinstance(input_data, list): + for i, e in enumerate(input_name): + m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype))) + else: + m.set_input(input_name, tvm.nd.array(input_data.astype(input_data.dtype))) + + m.set_input(**params) + # execute + m.run() + # get outputs + if isinstance(output_shape, list) and isinstance(output_dtype, list): + tvm_output_list = [] + for i, s in enumerate(output_shape): + tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i])) + tvm_output_list.append(tvm_output.asnumpy()) + return tvm_output_list + else: + tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype)) + return tvm_output.asnumpy() + +def verify_AddLayerParams(input_dim, alpha=2): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.add(a_np1, a_np2) + alpha + inputs = [('input1', datatypes.Array(*input_dim)), + ('input2', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Add', + alpha=alpha, + input_names=['input1', 'input2'], + output_name='output', + mode='ADD') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_AddLayerParams(): + verify_AddLayerParams((1, 2, 2), 0) + verify_AddLayerParams((1, 2, 2), 1) + verify_AddLayerParams((1, 3, 3), 2) + +def 
verify_MultiplyLayerParams(input_dim, alpha): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.multiply(a_np1, a_np2) * alpha + inputs = [('input1', datatypes.Array(*input_dim)), + ('input2', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Mul', + alpha=alpha, + input_names=['input1', 'input2'], + output_name='output', + mode='MULTIPLY') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_MultiplyLayerParams(): + verify_MultiplyLayerParams((1, 2, 2), 0) + verify_MultiplyLayerParams((1, 2, 2), 1) + verify_MultiplyLayerParams((1, 3, 3), 2) + +def verify_ConcatLayerParams(input1_dim, input2_dim): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input1_dim).astype(dtype) + a_np2 = np.random.uniform(size=input2_dim).astype(dtype) + + b_np = np.concatenate((a_np1, a_np2), axis=1) + inputs = [('input1', datatypes.Array(*input1_dim)), + ('input2', datatypes.Array(*input2_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Concate', + input_names=['input1', 'input2'], + output_name='output', + mode='CONCAT') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_ConcatLayerParams(): + verify_ConcatLayerParams((1, 1, 2, 2), (1, 2, 2, 2)) + verify_ConcatLayerParams((1, 2, 4, 4), (1, 3, 4, 4)) + +def verify_UpsampleLayerParams(input_dim, scale, mode): + dtype = "float32" + + a_np = np.full(input_dim, 
1, dtype=dtype) + if mode == 'NN': + b_np = topi.testing.upsampling_python(a_np, scale) + else: + new_h = input_dim[2] * scale + new_w = input_dim[3] * scale + b_np = topi.testing.bilinear_resize_python(a_np, (new_h, new_w), 'NCHW') + + input = [('input', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(input, output) + builder.add_upsample(name='Upsample', + scaling_factor_h=scale, + scaling_factor_w=scale, + mode=mode, + input_name='input', + output_name='output') + + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_UpsampleLayerParams(): + verify_UpsampleLayerParams((1, 16, 32, 32), 2, 'NN') + verify_UpsampleLayerParams((1, 4, 6, 6), 3, 'BILINEAR') + +def verify_l2_normalize(input_dim, eps): + dtype = "float32" + + a_np = np.random.uniform(size=input_dim).astype(dtype) + b_np = topi.testing.l2_normalize_python(a_np, eps, 1) + + input = [('input', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(input, output) + builder.add_l2_normalize(name='L2', epsilon=eps, input_name='input', output_name='output') + + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_l2_normalize(): + verify_l2_normalize((1, 3, 20, 20), 0.001) + if __name__ == '__main__': test_mobilenet_checkonly() test_resnet50_checkonly() + test_forward_AddLayerParams() + test_forward_ConcatLayerParams() + test_forward_MultiplyLayerParams() + test_forward_UpsampleLayerParams() + test_forward_l2_normalize() From 026c2626ce7200a5097205810f2db681d4cc9939 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 19 Aug 2018 19:29:35 -0700 Subject: [PATCH 027/529] fix 
import (#1621) --- nnvm/python/nnvm/testing/yolo2_detection.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nnvm/python/nnvm/testing/yolo2_detection.py b/nnvm/python/nnvm/testing/yolo2_detection.py index b7744c45cff4..0b229149b8ea 100644 --- a/nnvm/python/nnvm/testing/yolo2_detection.py +++ b/nnvm/python/nnvm/testing/yolo2_detection.py @@ -10,9 +10,6 @@ import math from collections import namedtuple import numpy as np -from PIL import Image -from PIL import ImageDraw -from PIL import ImageFont def _entry_index(batch, w, h, outputs, classes, coords, location, entry): n = int(location/(w*h)) @@ -186,6 +183,10 @@ def _draw_label(im, r, c, label, rgb): _set_pixel(im, i+c, j+r, k, val)#rgb[k] * val) def _get_label(labelstr, rgb): + from PIL import Image + from PIL import ImageDraw + from PIL import ImageFont + text = labelstr colorText = "black" testDraw = ImageDraw.Draw(Image.new('RGB', (1, 1))) From d6d97e8772e4e5b7982688eeffd8b930f5c4fcce Mon Sep 17 00:00:00 2001 From: masahi Date: Mon, 20 Aug 2018 11:30:01 +0900 Subject: [PATCH 028/529] Add missing check when deciding conv op and injective op are in the same group (#1622) --- nnvm/src/compiler/graph_fuse.cc | 1 + nnvm/tests/python/compiler/test_op_fusion.py | 34 ++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc index f65312be1a29..4999d93d1861 100644 --- a/nnvm/src/compiler/graph_fuse.cc +++ b/nnvm/src/compiler/graph_fuse.cc @@ -146,6 +146,7 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { bool parent_out_ewise = false; bool parent_injective = false; for (const auto& e : inode.inputs) { + if (fuse_vec[e.node_id] != FuseRule::kFuseToMaster) continue; TOpPattern pt = pattern_vec[e.node_id]; if (pt == kOutEWiseFusable) { parent_out_ewise = true; diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py index 5f4da3865a45..0c81ac890d55 100644 --- 
a/nnvm/tests/python/compiler/test_op_fusion.py +++ b/nnvm/tests/python/compiler/test_op_fusion.py @@ -110,6 +110,39 @@ def test_injective_conv2d(): np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) +def test_concatenate_conv2d(): + ch = 3 + size = 8 + data = sym.Variable(name="data") + concat = sym.concatenate(data, data, axis=1) + conv = sym.conv2d(data=concat, kernel_size=(1,1), channels=ch*2, use_bias=False, name="conv") + net = sym.elemwise_add(concat, conv) + + dtype="float32" + dshape = (1, ch, size, size) + kshape = (ch*2, ch*2, 1, 1) + oshape = (1, ch*2, size, size) + shape_dict = {"data": dshape} + + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(net, target, shape_dict) + # data, conv weight, conv op, concat + assert graph.index.num_nodes == 4 + + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + m = graph_runtime.create(graph, lib, ctx) + m.run(data=data, conv_weight=kernel) + # get output + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + + concat = np.concatenate((data.asnumpy(), data.asnumpy()), axis=1) + conv = topi.testing.conv2d_nchw_python( + concat, kernel.asnumpy(), (1,1), 'SAME') + ref = concat + conv + np.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5) + + def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2): with nnvm.compiler.build_config(opt_level=opt_level): graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params) @@ -157,3 +190,4 @@ def get_sym(out_channel): test_conv_ewise_injective() test_fuse_conv2d_elu() test_injective_conv2d() + test_concatenate_conv2d() From 826de7b833af5e455649dae1fd58015e89e0281d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 20 Aug 2018 09:20:19 -0700 Subject: [PATCH 029/529] [NODEREF] Introduce named attribute system. 
(#1618) --- include/tvm/attrs.h | 593 ++++++++++++++++++ include/tvm/runtime/packed_func.h | 8 + python/tvm/make.py | 11 + src/api/api_base.cc | 12 - src/api/api_test.cc | 46 ++ src/api/dsl_api.cc | 37 +- src/lang/attrs.cc | 45 ++ src/lang/reflection.cc | 42 +- tests/cpp/attrs_test.cc | 76 +++ tests/python/unittest/test_lang_reflection.py | 26 + 10 files changed, 861 insertions(+), 35 deletions(-) create mode 100644 include/tvm/attrs.h create mode 100644 src/api/api_test.cc create mode 100644 src/lang/attrs.cc create mode 100644 tests/cpp/attrs_test.cc diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h new file mode 100644 index 000000000000..aed6b1ff722f --- /dev/null +++ b/include/tvm/attrs.h @@ -0,0 +1,593 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file tvm/attrs.h + * \brief TVM attribute module + * + * This module enables declaration of named attributes + * which support default value setup and bound checking. + * + * \code + * struct MyAttrs : public tvm::AttrsNode { + * float learning_rate; + * int num_hidden; + * std::string name; + * // declare attribute fields in header file + * TVM_DECLARE_ATTRS(MyAttrs, "attrs.MyAttrs") { + * TVM_ATTR_FIELD(num_hidden).set_lower_bound(1); + * TVM_ATTR_FIELD(learning_rate).set_default(0.01f); + * TVM_ATTR_FIELD(name).set_default("hello"); + * } + * }; + * // register it in cc file + * TVM_REGISTER_NODE_TYPE(MyAttrs); + * \endcode + * + * \sa AttrsNode, TVM_DECLARE_ATTRS, TVM_ATTR_FIELD + */ +#ifndef TVM_ATTRS_H_ +#define TVM_ATTRS_H_ + +#include +#include +#include +#include +#include "./ir.h" +#include "./base.h" +#include "./packed_func_ext.h" + +namespace tvm { +/*! + * \brief Declare an attribute function. + * \param ClassName The name of the class. + * \param TypeKey The type key to be used by the TVM node system. 
+ */ +#define TVM_DECLARE_ATTRS(ClassName, TypeKey) \ + static constexpr const char* _type_key = TypeKey; \ + TVM_DECLARE_NODE_TYPE_INFO(ClassName, ::tvm::BaseAttrsNode); \ + template \ + void __VisitAttrs__(FVisit& __fvisit__) // NOLINT(*) + + +/*! + * \brief Declare an attribute field. + * \param FieldName The field name. + */ +#define TVM_ATTR_FIELD(FieldName) \ + __fvisit__(#FieldName, &FieldName) + + +/*! \brief Error thrown during attribute checking. */ +struct AttrError : public dmlc::Error { + /*! + * \brief constructor + * \param msg error message + */ + explicit AttrError(const std::string &msg) + : dmlc::Error(msg) {} +}; + +/*! + * \brief Information about attribute fields in string representations. + */ +struct AttrFieldInfo { + /*! \brief name of the field */ + std::string name; + /*! \brief type docstring information in str. */ + std::string type_info; + /*! \brief detailed description of the type */ + std::string description; +}; + +/*! + * \brief Base class of all attribute class + * \note Do not subclass AttrBaseNode directly, + * subclass AttrsNode instead. + * \sa AttrsNode + */ +class BaseAttrsNode : public Node { + public: + using TVMArgs = runtime::TVMArgs; + using TVMRetValue = runtime::TVMRetValue; + /*! + * \brief Initialize the attributes by sequence of arguments + * \param args The postional arguments in the form + * [key0, value0, key1, value1, ..., key_n, value_n] + */ + template + inline void InitBySeq(Args&& ...args); + /*! + * \brief Print readible docstring to ostream, add newline. + * \param os the stream to print the docstring to. + */ + inline void PrintDocString(std::ostream &os) const; // NOLINT(*) + /*! + * \brief Get the field information about the + * \note This function throws when the required a field is not present. + */ + TVM_DLL virtual std::vector ListFieldInfo() const = 0; + /*! + * \brief Initialize the attributes by arguments. + * \param kwargs The key value pairs for initialization. 
+ * [key0, value0, key1, value1, ..., key_n, value_n] + * \param allow_unknown Whether allow additional unknown fields. + * \note This function throws when the required a field is not present. + */ + TVM_DLL virtual void InitByPackedArgs(const TVMArgs& kwargs, bool allow_unknown = false) = 0; + + static constexpr const char* _type_key = "Attrs"; + TVM_DECLARE_BASE_NODE_INFO(BaseAttrsNode, Node); +}; + +/*! \brief Base attribute container for all attributes */ +class Attrs : public NodeRef { + public: + // normal constructor + Attrs() {} + // construct from shared ptr. + explicit Attrs(std::shared_ptr n) : NodeRef(n) {} + + /*! \return The attribute node */ + const BaseAttrsNode* operator->() const { + return ptr(); + } + /*! \brief specify container node */ + using ContainerType = BaseAttrsNode; + + private: + /*! \return the internal attribute node */ + const BaseAttrsNode* ptr() const { + return static_cast(node_.get()); + } +}; + +/*! + * \brief Specialized attribute type that is backed by a map. + * The DictAttrsNode implements the Attrs behavior, + * its fields are directly accessible via object.field_name + * like other normal nodes. + */ +class DictAttrsNode : public BaseAttrsNode { + public: + /*! \brief internal attrs map */ + Map dict; + /*! + * \brief Consruct a Attrs backed by DictAttrsNode. + * \param dict The attributes. + * \return The dict attributes. + */ + TVM_DLL static Attrs make(Map dict); + // implementations + void VisitAttrs(AttrVisitor* v) final; + void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final; + std::vector ListFieldInfo() const final; + // type info + static constexpr const char* _type_key = "DictAttrs"; + TVM_DECLARE_NODE_TYPE_INFO(DictAttrsNode, BaseAttrsNode); +}; + +// Namespace containing detail implementations +namespace detail { +using runtime::TVMArgValue; + +// helper entry that does nothing in set_default/bound/describe calls. 
+struct AttrNopEntry { + using TSelf = AttrNopEntry; + + TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) { + return *this; + } + template + TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) { + return *this; + } + template + TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) { + return *this; + } + template + TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) { + return *this; + } +}; + +// Wrapper for normal visitor. +class AttrNormalVisitor { + public: + explicit AttrNormalVisitor(AttrVisitor* visitor) + : visitor_(visitor) { + } + template + AttrNopEntry operator()(const char* key, T* value) { + visitor_->Visit(key, value); + return AttrNopEntry(); + } + + private: + AttrVisitor* visitor_; +}; + +// helper entry that does initialization, set default. +template +struct AttrInitEntry { + // The attributes + using TSelf = AttrInitEntry; + // The type key + const char* type_key_; + // field name + const char* key_; + // internal value. + T* value_; + // whether the value is missing. + bool value_missing_{true}; + // If the value is still missing in destruction time throw an error. + ~AttrInitEntry() DMLC_THROW_EXCEPTION { + if (value_missing_) { + std::ostringstream os; + os << type_key_ << ": Cannot find required field \'" << key_ + << "\' during initialization"; + throw AttrError(os.str()); + } + } + // override fields. + // This function sets the lower bound of the attribute + TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) { + if (this->value_missing_) return *this; + const T& val = *value_; + if (begin > val) { + std::ostringstream os; + os << type_key_ << "." 
<< key_ << ": " + << "value " << val + << " is smaller than the lower bound " << begin; + throw AttrError(os.str()); + } + return *this; + } + // This function sets the upper bound of the attribute + TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) { + if (this->value_missing_) return *this; + const T& val = *value_; + if (val > end) { + std::ostringstream os; + os << type_key_ << "." << key_ << ": " + << "value " << val + << " is bigger than the upper bound " << end; + throw AttrError(os.str()); + } + return *this; + } + // set default when + TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) { + if (!value_missing_) return *this; + *value_ = value; + value_missing_ = false; + return *this; + } + TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) { + return *this; + } +}; + +// Template function to allow smart conversion +// from Expr types into the constants. +template +inline void SetValue(T* ptr, const TVMArgValue& val) { + *ptr = val.operator T(); +} +template +inline void SetIntValue(T* ptr, const TVMArgValue& val) { + if (val.type_code() == kDLInt) { + *ptr = static_cast(val.value().v_int64); + } else { + Expr expr = val; + CHECK(expr.defined()); + if (const ir::IntImm* op = expr.as()) { + *ptr = static_cast(op->value); + } else if (const ir::UIntImm* op = expr.as()) { + *ptr = static_cast(op->value); + } else { + LOG(FATAL) << "Expect int value, but get " << expr->type_key(); + } + } +} +template<> +inline void SetValue(std::string* ptr, const TVMArgValue& val) { + if (val.type_code() == kStr) { + *ptr = val.operator std::string(); + } else { + Expr expr = val; + const ir::StringImm* op = expr.as(); + CHECK(op != nullptr); + *ptr = op->value; + } +} +template<> +inline void SetValue(double* ptr, const TVMArgValue& val) { + if (val.type_code() == kDLFloat || val.type_code() == kDLInt) { + *ptr = val.operator double(); + } else { + Expr expr = val; + CHECK(expr.defined()); + if (const ir::IntImm* op = expr.as()) { + *ptr = 
static_cast(op->value); + } else if (const ir::IntImm* op = expr.as()) { + *ptr = static_cast(op->value); + } else if (const ir::UIntImm* op = expr.as()) { + *ptr = static_cast(op->value); + } else { + LOG(FATAL) << "Expect float value, but get " << expr->type_key(); + } + } +} +template<> +inline void SetValue(int* ptr, const TVMArgValue& val) { + SetIntValue(ptr, val); +} +template<> +inline void SetValue(int64_t* ptr, const TVMArgValue& val) { + SetIntValue(ptr, val); +} +template<> +inline void SetValue(uint64_t* ptr, const TVMArgValue& val) { + SetIntValue(ptr, val); +} +template<> +inline void SetValue(bool* ptr, const TVMArgValue& val) { + SetIntValue(ptr, val); +} + +// Visitor for value initialization +template +class AttrInitVisitor { + public: + // Counter of number of matched attributes during visit. + // This is used to decide if there is additional unmatched attributes. + size_t hit_count_{0}; + // constructor + AttrInitVisitor(const char* type_key, FFind ffind) + : type_key_(type_key), ffind_(ffind) { + } + + template + AttrInitEntry operator()(const char* key, T* value) { + TVMArgValue val; + AttrInitEntry opt; + opt.type_key_ = type_key_; + opt.key_ = key; + opt.value_ = value; + if (ffind_(key, &val)) { + SetValue(value, val); + opt.value_missing_ = false; + ++hit_count_; + } else { + opt.value_missing_ = true; + } + return opt; + } + + private: + // the type key + const char* type_key_; + FFind ffind_; +}; + +template +inline AttrInitVisitor CreateInitVisitor( + const char* type_key, + FFind ffind) { + return AttrInitVisitor(type_key, ffind); +} + +/*! + * \brief Helper struct to get the type name known to tvm. + * \tparam T the type we are interested in. 
+ */ +template +struct TypeName { + static constexpr const char* value = T::ContainerType::_type_key; +}; + +template<> +struct TypeName { + static constexpr const char* value = "int"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "int64"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "uint64_t"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "Type"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "str"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "bool"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "handle"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "double"; +}; + +class AttrDocEntry { + public: + using TSelf = AttrDocEntry; + + explicit AttrDocEntry(AttrFieldInfo* info) + : info_(info) { + } + TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) { + info_->description = str; + return *this; + } + template + TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) { + std::ostringstream os; + os << info_->type_info << ", default=" << value; + info_->type_info = os.str(); + return *this; + } + template + TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) { + return *this; + } + template + TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) { + return *this; + } + + private: + AttrFieldInfo* info_; +}; + +class AttrDocVisitor { + public: + template + AttrDocEntry operator()(const char* key, T* v) { + AttrFieldInfo info; + info.name = key; + info.type_info = TypeName::value; + fields_.emplace_back(std::move(info)); + return AttrDocEntry(&(fields_.back())); + } + + std::vector fields_; +}; + +class AttrExistVisitor { + public: + std::string key_; + bool exist_{false}; + + template + AttrNopEntry operator()(const char* key, T* v) { + if (exist_) return AttrNopEntry(); + if (key == key_) exist_ = true; + return AttrNopEntry(); + } +}; +} // namespace detail + 
+/*! + * \brief The base class of the all the + * Use "curiously recurring template pattern". + * + * \tparam DerivedType The final attribute type. + */ +template +class AttrsNode : public BaseAttrsNode { + public: + void VisitAttrs(AttrVisitor* v) final { + detail::AttrNormalVisitor vis(v); + self()->__VisitAttrs__(vis); + } + + void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final { + CHECK_EQ(args.size() % 2, 0); + const int kLinearSearchBound = 16; + int hit_count = 0; + // applies two stratgies to lookup + if (args.size() < kLinearSearchBound) { + // linear search. + auto ffind = [&args](const char* key, runtime::TVMArgValue* val) { + for (int i = 0; i < args.size(); i += 2) { + CHECK_EQ(args.type_codes[i], kStr); + if (!std::strcmp(key, args.values[i].v_str)) { + *val = args[i + 1]; + return true; + } + } + return false; + }; + auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind); + self()->__VisitAttrs__(vis); + hit_count = vis.hit_count_; + } else { + // construct a map then do lookup. 
+ std::unordered_map kwargs; + for (int i = 0; i < args.size(); i += 2) { + CHECK_EQ(args.type_codes[i], kStr); + kwargs[args[i].operator std::string()] = args[i + 1]; + } + auto ffind = [&kwargs](const char *key, runtime::TVMArgValue* val) { + auto it = kwargs.find(key); + if (it != kwargs.end()) { + *val = it->second; + return true; + } + return false; + }; + auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind); + self()->__VisitAttrs__(vis); + hit_count = vis.hit_count_; + } + // error handling, slow path + if (hit_count * 2 != args.size() && !allow_unknown) { + for (int i = 0; i < args.size(); i += 2) { + detail::AttrExistVisitor visitor; + visitor.key_ = args[i].operator std::string(); + self()->__VisitAttrs__(visitor); + if (!visitor.exist_) { + std::ostringstream os; + os << DerivedType::_type_key + << ": does not have field \'" << visitor.key_ + << "\', Possible fields:\n"; + os << "----------------\n"; + this->PrintDocString(os); + throw AttrError(os.str()); + } + } + } + } + + std::vector ListFieldInfo() const final { + detail::AttrDocVisitor visitor; + self()->__VisitAttrs__(visitor); + return visitor.fields_; + } + + private: + DerivedType* self() const { + return const_cast( + static_cast(this)); + } +}; + + +template +inline void BaseAttrsNode::InitBySeq(Args&& ...args) { + runtime::PackedFunc pf([this](const TVMArgs& args, TVMRetValue *rv) { + this->InitByPackedArgs(args); + }); + pf(std::forward(args)...); +} + +inline void BaseAttrsNode::PrintDocString(std::ostream &os) const { // NOLINT(*) + std::vector entry = this->ListFieldInfo(); + for (AttrFieldInfo info : entry) { + os << info.name << " : " << info.type_info << '\n'; + if (info.description.length() != 0) { + os << " " << info.description << '\n'; + } + } +} + +} // namespace tvm +#endif // TVM_ATTRS_H_ diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 6d8df4a5e3d6..63e8ca7cd16b 100644 --- a/include/tvm/runtime/packed_func.h +++ 
b/include/tvm/runtime/packed_func.h @@ -223,6 +223,12 @@ class ExtTypeVTable { class TVMPODValue_ { public: operator double() const { + // Allow automatic conversion from int to float + // This avoids errors when user pass in int from + // the frontend while the API expects a float. + if (type_code_ == kDLInt) { + return static_cast(value_.v_int64); + } TVM_CHECK_TYPE_CODE(type_code_, kDLFloat); return value_.v_float64; } @@ -310,6 +316,8 @@ class TVMPODValue_ { */ class TVMArgValue : public TVMPODValue_ { public: + /*! \brief default constructor */ + TVMArgValue() {} /*! * \brief constructor * \param value of the function diff --git a/python/tvm/make.py b/python/tvm/make.py index 49f698f4f663..19949509778b 100644 --- a/python/tvm/make.py +++ b/python/tvm/make.py @@ -71,6 +71,17 @@ def node(type_key, **kwargs): **kwargs : dict The fields of the node. + Returns + ------- + node : Node + The corresponding DSL Node + + Note + ---- + If the created node is instance of AttrsNode, then + the creator function will also run bound checks and + default value setup as supported by Attrs. 
+ Example ------- The following code constructs a IntImm object diff --git a/src/api/api_base.cc b/src/api/api_base.cc index 70301993ad3a..3583f42a00c9 100644 --- a/src/api/api_base.cc +++ b/src/api/api_base.cc @@ -33,18 +33,6 @@ TVM_REGISTER_API("_load_json") *ret = LoadJSON(args[0]); }); -TVM_REGISTER_API("_nop") -.set_body([](TVMArgs args, TVMRetValue *ret) { - }); - -// internal fucntion used for debug and testing purposes -TVM_REGISTER_API("_ndarray_use_count") -.set_body([](TVMArgs args, TVMRetValue *ret) { - runtime::NDArray nd = args[0]; - // substract the current one - *ret = (nd.use_count() - 1); - }); - TVM_REGISTER_API("_TVMSetStream") .set_body([](TVMArgs args, TVMRetValue *ret) { TVMSetStream(args[0], args[1], args[2]); diff --git a/src/api/api_test.cc b/src/api/api_test.cc new file mode 100644 index 000000000000..1744267fdcd7 --- /dev/null +++ b/src/api/api_test.cc @@ -0,0 +1,46 @@ + /*! + * Copyright (c) 2018 by Contributors + * Code mainly used for test purposes. + * \file api_test.cc + */ +#include +#include +#include +#include + +namespace tvm { +// Attrs used to python API +struct TestAttrs : public AttrsNode { + int axis; + std::string name; + Array padding; + + TVM_DECLARE_ATTRS(TestAttrs, "attrs.TestAttrs") { + TVM_ATTR_FIELD(axis) + .set_default(10) + .set_lower_bound(1) + .set_upper_bound(10) + .describe("axis field"); + TVM_ATTR_FIELD(name) + .describe("name"); + TVM_ATTR_FIELD(padding) + .describe("padding of input") + .set_default(Array({0, 0})); + } +}; + +TVM_REGISTER_NODE_TYPE(TestAttrs); + +TVM_REGISTER_API("_nop") +.set_body([](TVMArgs args, TVMRetValue *ret) { + }); + +// internal fucntion used for debug and testing purposes +TVM_REGISTER_API("_ndarray_use_count") +.set_body([](TVMArgs args, TVMRetValue *ret) { + runtime::NDArray nd = args[0]; + // substract the current one + *ret = (nd.use_count() - 1); + }); + +} // namespace tvm diff --git a/src/api/dsl_api.cc b/src/api/dsl_api.cc index 80d7c3163e10..9157e62fda8a 100644 --- 
a/src/api/dsl_api.cc +++ b/src/api/dsl_api.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -124,22 +125,35 @@ class DSLAPIImpl : public DSLAPI { (*static_cast(handle))->type_index()); } void NodeGetAttr(NodeHandle handle, - const char* key, - TVMValue* ret_val, - int* ret_type_code, - int* ret_success) const final { + const char* key, + TVMValue* ret_val, + int* ret_type_code, + int* ret_success) const final { TVMRetValue rv; APIAttrGetter getter; + TVMAPINode* tnode = static_cast(handle); getter.skey = key; getter.ret = &rv; - TVMAPINode* tnode = static_cast(handle); if (getter.skey == "type_key") { ret_val->v_str = (*tnode)->type_key(); *ret_type_code = kStr; *ret_success = 1; - } else { + return; + } else if (!(*tnode)->is_type()) { (*tnode)->VisitAttrs(&getter); *ret_success = getter.found_ref_object || rv.type_code() != kNull; + } else { + // specially handle dict attr + DictAttrsNode* dnode = static_cast(tnode->get()); + auto it = dnode->dict.find(key); + if (it != dnode->dict.end()) { + *ret_success = 1; + rv = (*it).second; + } else { + *ret_success = 0; + } + } + if (*ret_success) { if (rv.type_code() == kStr || rv.type_code() == kTVMType) { TVMAPIThreadLocalEntry *e = TVMAPIThreadLocalStore::Get(); @@ -159,7 +173,16 @@ class DSLAPIImpl : public DSLAPI { TVMAPINode* tnode = static_cast(handle); APIAttrDir dir; dir.names = &(ret->ret_vec_str); - (*tnode)->VisitAttrs(&dir); + + if (!(*tnode)->is_type()) { + (*tnode)->VisitAttrs(&dir); + } else { + // specially handle dict attr + DictAttrsNode* dnode = static_cast(tnode->get()); + for (const auto& kv : dnode->dict) { + ret->ret_vec_str.push_back(kv.first); + } + } ret->ret_vec_charp.clear(); for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) { ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str()); diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc new file mode 100644 index 000000000000..49a91983e79d --- /dev/null +++ b/src/lang/attrs.cc @@ -0,0 +1,45 @@ +/*! 
+ * Copyright (c) 2018 by Contributors + * \file attrs.cc + */ +#include + +namespace tvm { + +void DictAttrsNode::VisitAttrs(AttrVisitor* v) { + v->Visit("__dict__", &dict); +} + +void DictAttrsNode::InitByPackedArgs( + const runtime::TVMArgs& args, bool allow_unknown) { + for (int i = 0; i < args.size(); i += 2) { + std::string key = args[i]; + runtime::TVMArgValue val = args[i + 1]; + if (val.type_code() == kNodeHandle) { + dict.Set(key, val.operator NodeRef()); + } else if (val.type_code() == kStr) { + dict.Set(key, Expr(val.operator std::string())); + } else { + dict.Set(key, val.operator Expr()); + } + } +} + +std::vector DictAttrsNode::ListFieldInfo() const { + return {}; +} + +Attrs DictAttrsNode::make(Map dict) { + std::shared_ptr n = std::make_shared(); + n->dict = std::move(dict); + return Attrs(n); +} + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const DictAttrsNode *op, IRPrinter *p) { + p->stream << op->dict; +}); + +TVM_REGISTER_NODE_TYPE(DictAttrsNode); + +} // namespace tvm diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc index 7c4e862f0abb..9fb9143aa7f4 100644 --- a/src/lang/reflection.cc +++ b/src/lang/reflection.cc @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include @@ -467,22 +468,15 @@ class NodeAttrSetter : public AttrVisitor { } }; -// API function to make node. 
-// args format: -// type_key, key1, value1, ..., key_n, value_n -void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) { + +void InitNodeByPackedArgs(Node* n, const TVMArgs& args) { NodeAttrSetter setter; - setter.type_key = args[0].operator std::string(); - CHECK_EQ(args.size() % 2, 1); - for (int i = 1; i < args.size(); i += 2) { - setter.attrs.emplace( - args[i].operator std::string(), - runtime::TVMArgValue(args.values[i + 1], args.type_codes[i + 1])); - } - auto* f = dmlc::Registry::Find(setter.type_key); - CHECK(f != nullptr) - << "Node type \'" << setter.type_key << "\' is not registered in TVM"; - std::shared_ptr n = f->body(); + setter.type_key = n->type_key(); + CHECK_EQ(args.size() % 2, 0); + for (int i = 0; i < args.size(); i += 2) { + setter.attrs.emplace(args[i].operator std::string(), + args[i + 1]); + } n->VisitAttrs(&setter); if (setter.attrs.size() != 0) { std::ostringstream os; @@ -492,10 +486,26 @@ void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) { } LOG(FATAL) << os.str(); } +} + +// API function to make node. 
+// args format: +// key1, value1, ..., key_n, value_n +void MakeNode(const TVMArgs& args, TVMRetValue* rv) { + std::string type_key = args[0]; + auto* f = dmlc::Registry::Find(type_key); + CHECK(f != nullptr) + << "Node type \'" << type_key << "\' is not registered in TVM"; + TVMArgs kwargs(args.values + 1, args.type_codes + 1, args.size() - 1); + std::shared_ptr n = f->body(); + if (n->derived_from()) { + static_cast(n.get())->InitByPackedArgs(kwargs); + } else { + InitNodeByPackedArgs(n.get(), kwargs); + } *rv = NodeRef(n); } TVM_REGISTER_GLOBAL("make._Node") .set_body(MakeNode); - } // namespace tvm diff --git a/tests/cpp/attrs_test.cc b/tests/cpp/attrs_test.cc new file mode 100644 index 000000000000..138e0b242e02 --- /dev/null +++ b/tests/cpp/attrs_test.cc @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +namespace tvm { +namespace test { +// test example usage docs +struct TestAttrs : public AttrsNode { + int axis; + std::string name; + Expr expr; + double learning_rate; + + TVM_DECLARE_ATTRS(TestAttrs, "attrs.cpptest.TestAttrs") { + TVM_ATTR_FIELD(axis) + .set_default(10) + .set_lower_bound(1) + .set_upper_bound(10) + .describe("axis field"); + TVM_ATTR_FIELD(name) + .describe("name of the field"); + TVM_ATTR_FIELD(expr) + .describe("expression field") + .set_default(make_const(Int(32), 1)); + TVM_ATTR_FIELD(learning_rate) + .describe("learning_rate") + .set_default(0.1); + } +}; +} +} + +TEST(Attrs, Basic) { + using namespace tvm; + using namespace tvm::test; + std::shared_ptr n = std::make_shared(); + try { + n->InitBySeq("axis", 10); + LOG(FATAL) << "bad"; + } catch (const tvm::AttrError& e) { + } + try { + n->InitBySeq("axis", 12, "name", "111"); + LOG(FATAL) << "bad"; + } catch (const tvm::AttrError& e) { + } + + try { + n->InitBySeq("axisx", 12, "name", "111"); + LOG(FATAL) << "bad"; + } catch (const tvm::AttrError& e) { + std::string what = e.what(); + CHECK(what.find("expr : Expr, default=1") != std::string::npos); + 
CHECK(what.find("axisx") != std::string::npos); + } + n->InitBySeq("learning_rate", Expr(1), "expr", 128, "name", "xx"); + CHECK_EQ(n->learning_rate, 1.0); + + n->InitBySeq("name", "xxx", "expr", 128); + CHECK_EQ(n->name, "xxx"); + CHECK_EQ(n->axis, 10); + CHECK_EQ(n->expr.as()->value, 128); + // Check docstring + std::ostringstream os; + n->PrintDocString(os); + LOG(INFO) << "docstring\n"<< os.str(); + CHECK(os.str().find("expr : Expr, default=1") != std::string::npos); +} + + +int main(int argc, char ** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py index fefb8771a812..2ba67b8d9c86 100644 --- a/tests/python/unittest/test_lang_reflection.py +++ b/tests/python/unittest/test_lang_reflection.py @@ -36,6 +36,31 @@ def test_make_node(): assert AA.op == A.op assert AA.value_index == A.value_index + +def test_make_attrs(): + try: + x = tvm.make.node("attrs.TestAttrs", unknown_key=1, name="xx") + assert False + except tvm.TVMError as e: + assert str(e).find("unknown_key") != -1 + + try: + x = tvm.make.node("attrs.TestAttrs", axis=100, name="xx") + assert False + except tvm.TVMError as e: + assert str(e).find("upper bound") != -1 + + x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3,4)) + assert x.name == "xx" + assert x.padding[0].value == 3 + assert x.padding[1].value == 4 + assert x.axis == 10 + + dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0)) + assert dattr.x.value == 1 + + + def test_make_sum(): A = tvm.placeholder((2, 10), name='A') k = tvm.reduce_axis((0,10), "k") @@ -46,6 +71,7 @@ def test_make_sum(): assert BB.op.body[0].combiner is not None if __name__ == "__main__": + test_make_attrs() test_make_node() test_make_smap() test_const_saveload_json() From b58698d1fd392af8c2cd412ba9ec850f4a575c52 Mon Sep 17 00:00:00 2001 From: Pariksheet 
Pinjari Date: Mon, 20 Aug 2018 22:43:35 +0530 Subject: [PATCH 030/529] [NNVM][DARKNET]Yolo and Upsample frontend support (#1501) * Yolo and Upsample frontend support * Lint fix * Mac support added * Code clean and trigger CI --- nnvm/python/nnvm/frontend/darknet.py | 32 +++++++- nnvm/python/nnvm/testing/darknet.py | 25 +++++- nnvm/python/nnvm/top/vision.py | 15 ++++ nnvm/src/top/vision/yolo/yolo.cc | 33 ++++++++ .../python/frontend/darknet/test_forward.py | 30 +++++++- tutorials/nnvm/from_darknet.py | 77 +++++++++---------- 6 files changed, 165 insertions(+), 47 deletions(-) create mode 100644 nnvm/src/top/vision/yolo/yolo.cc diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py index 3aa36b7e7ef9..7fb3e34750c8 100644 --- a/nnvm/python/nnvm/frontend/darknet.py +++ b/nnvm/python/nnvm/frontend/darknet.py @@ -32,8 +32,12 @@ class LAYERTYPE(object): NETWORK = 20 XNOR = 21 REGION = 22 - REORG = 23 - BLANK = 24 + YOLO = 23 + REORG = 24 + UPSAMPLE = 25 + LOGXENT = 26 + L2NORM = 27 + BLANK = 28 class ACTIVATION(object): """Darknet ACTIVATION Class constant.""" @@ -257,6 +261,12 @@ def _darknet_reshape(inputs, attrs): new_attrs['shape'] = _darknet_required_attr(attrs, 'shape') return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None +def _darknet_upsampling(inputs, attrs): + """Process the upsampling operation.""" + op_name, new_attrs = 'upsampling', {} + new_attrs['scale'] = attrs.get('scale', 1) + return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + def _darknet_softmax_output(inputs, attrs): """Process the softmax operation.""" temperature = attrs.get('temperature', 1) @@ -298,6 +308,15 @@ def _darknet_region(inputs, attrs): new_attrs['softmax'] = attrs.get('softmax', 0) return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None +def _darknet_yolo(inputs, attrs): + """Process the yolo operation.""" + op_name, new_attrs = 'yolov3_yolo', {} + if 'n' in attrs: + new_attrs['n'] = attrs.get('n', 1) + if 'classes' 
in attrs: + new_attrs['classes'] = attrs.get('classes', 1) + return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + def _darknet_activations(inputs, attrs): """Process the activation function.""" act = _darknet_required_attr(attrs, 'activation') @@ -350,6 +369,8 @@ def _darknet_op_not_support(inputs, attrs): LAYERTYPE.REORG : _darknet_reorg, LAYERTYPE.REGION : _darknet_region, LAYERTYPE.SHORTCUT : _darknet_shortcut, + LAYERTYPE.UPSAMPLE : _darknet_upsampling, + LAYERTYPE.YOLO : _darknet_yolo, LAYERTYPE.DETECTION : _darknet_op_not_support, LAYERTYPE.CROP : _darknet_op_not_support, LAYERTYPE.COST : _darknet_op_not_support, @@ -575,6 +596,13 @@ def _get_darknet_attrs(self, layer, layer_num): attr.update({'coords' : layer.coords}) attr.update({'background' : layer.background}) attr.update({'softmax' : layer.softmax}) + + elif LAYERTYPE.YOLO == layer.type: + attr.update({'n' : layer.n}) + attr.update({'classes' : layer.classes}) + + elif LAYERTYPE.UPSAMPLE == layer.type: + attr.update({'scale' : layer.stride}) else: err = "Darknet layer type {} is not supported in nnvm.".format(layer.type) raise NotImplementedError(err) diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py index e3d110e9605e..9a346e01b50b 100644 --- a/nnvm/python/nnvm/testing/darknet.py +++ b/nnvm/python/nnvm/testing/darknet.py @@ -115,8 +115,12 @@ class LAYERTYPE(object): NETWORK = 20 XNOR = 21 REGION = 22 - REORG = 23 - BLANK = 24 + YOLO = 23 + REORG = 24 + UPSAMPLE = 25 + LOGXENT = 26 + L2NORM = 27 + BLANK = 28 class ACTIVATION(object): """Darknet ACTIVATION Class constant.""" @@ -182,12 +186,16 @@ class ACTIVATION(object): NETWORK, XNOR, REGION, + YOLO, REORG, + UPSAMPLE, + LOGXENT, + L2NORM, BLANK } LAYERTYPE; typedef enum{ - SSE, MASKED, LONE, SEG, SMOOTH + SSE, MASKED, L1, SEG, SMOOTH, WGAN } COSTTYPE; @@ -241,18 +249,20 @@ class ACTIVATION(object): float shift; float ratio; float learning_rate_scale; + float clip; int softmax; int classes; int coords; 
int background; int rescore; int objectness; - int does_cost; int joint; int noadjust; int reorg; int log; int tanh; + int *mask; + int total; float alpha; float beta; @@ -265,13 +275,17 @@ class ACTIVATION(object): float class_scale; int bias_match; int random; + float ignore_thresh; + float truth_thresh; float thresh; + float focus; int classfix; int absolute; int onlyforward; int stopbackward; int dontload; + int dontsave; int dontloadscales; float temperature; @@ -309,6 +323,7 @@ class ACTIVATION(object): float * delta; float * output; + float * loss; float * squared; float * norms; @@ -462,6 +477,7 @@ class ACTIVATION(object): int train; int index; float *cost; + float clip; } network; @@ -491,6 +507,7 @@ class ACTIVATION(object): layer make_region_layer(int batch, int w, int h, int n, int classes, int coords); layer make_softmax_layer(int batch, int inputs, int groups); layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam); +layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes); layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize); layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index f2e12c0f367a..e59b2bdfe6d9 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -38,6 +38,21 @@ def schedule_region(attrs, outs, target): reg.register_pattern("yolo_region", OpPattern.OPAQUE) +@reg.register_compute("yolov3_yolo") +def compute_yolo(attrs, inputs, _): + """Compute definition of yolo""" + n = attrs.get_int("n") + classes = attrs.get_int("classes") + return topi.vision.yolo.yolo(inputs[0], n, classes) + 
+@reg.register_schedule("yolov3_yolo") +def schedule_yolo(attrs, outs, target): + """Schedule definition of yolo""" + with tvm.target.create(target): + return topi.generic.schedule_injective(outs) + +reg.register_pattern("yolov3_yolo", OpPattern.OPAQUE) + # multibox_prior @reg.register_schedule("multibox_prior") def schedule_multibox_prior(_, outs, target): diff --git a/nnvm/src/top/vision/yolo/yolo.cc b/nnvm/src/top/vision/yolo/yolo.cc new file mode 100644 index 000000000000..4800f4371f9d --- /dev/null +++ b/nnvm/src/top/vision/yolo/yolo.cc @@ -0,0 +1,33 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file yolo.cc + * \brief Property def of yolo operators. + */ +#include +#include +#include +#include +#include "../../elemwise_op_common.h" + +namespace nnvm { +namespace top { + +NNVM_REGISTER_OP(yolov3_yolo) +.describe(R"code(Yolo layer +)code" NNVM_ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_support_level(5) +.add_argument("data", "Tensor", "Input data") +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr( + "FInplaceOption", + [](const NodeAttrs &attrs) { + return std::vector>{{0, 0}, {1, 0}}; + }) +.set_attr("FGradient", [](const NodePtr &n, + const std::vector &ograds) { + return std::vector{ograds[0], ograds[0]}; +}); +} // namespace top +} // namespace nnvm diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py index 5fc71a86211e..3d7d06b48483 100644 --- a/nnvm/tests/python/frontend/darknet/test_forward.py +++ b/nnvm/tests/python/frontend/darknet/test_forward.py @@ -44,7 +44,7 @@ def _download(url, path, overwrite=False, sizecompare=False): except: urllib.urlretrieve(url, path) -DARKNET_LIB = 'libdarknet.so' +DARKNET_LIB = 'libdarknet2.0.so' DARKNETLIB_URL = 'https://github.com/siju-samuel/darknet/blob/master/lib/' \ + DARKNET_LIB + '?raw=true' _download(DARKNETLIB_URL, DARKNET_LIB) @@ -239,6 +239,8 @@ def test_forward_shortcut(): layer_2 = LIB.make_convolutional_layer(1, 
111, 111, 32, 32, 1, 1, 1, 0, 1, 0, 0, 0, 0) layer_3 = LIB.make_shortcut_layer(1, 0, 111, 111, 32, 111, 111, 32) layer_3.activation = 1 + layer_3.alpha = 1 + layer_3.beta = 1 net.layers[0] = layer_1 net.layers[1] = layer_2 net.layers[2] = layer_3 @@ -272,6 +274,30 @@ def test_forward_region(): test_forward(net) LIB.free_network(net) +def test_forward_yolo_op(): + '''test yolo layer''' + net = LIB.make_network(2) + layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 14, 1, 3, 2, 0, 1, 0, 0, 0, 0) + a = [] + layer_2 = LIB.make_yolo_layer(1, 111, 111, 2, 0, a, 2) + net.layers[0] = layer_1 + net.layers[1] = layer_2 + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + test_forward(net) + LIB.free_network(net) + +def test_forward_upsample(): + '''test upsample layer''' + net = LIB.make_network(1) + layer = LIB.make_upsample_layer(1, 19, 19, 3, 3) + layer.scale = 1 + net.layers[0] = layer + net.w = net.h = 19 + LIB.resize_network(net, 19, 19) + test_forward(net) + LIB.free_network(net) + def test_forward_elu(): '''test elu activation layer''' net = LIB.make_network(1) @@ -428,6 +454,8 @@ def test_forward_activation_logistic(): test_forward_rnn() test_forward_reorg() test_forward_region() + test_forward_yolo_op() + test_forward_upsample() test_forward_elu() test_forward_rnn() test_forward_crnn() diff --git a/tutorials/nnvm/from_darknet.py b/tutorials/nnvm/from_darknet.py index 883026f2af98..c6b70cf59413 100644 --- a/tutorials/nnvm/from_darknet.py +++ b/tutorials/nnvm/from_darknet.py @@ -22,54 +22,48 @@ import numpy as np import tvm import os +import sys from ctypes import * from tvm.contrib.download import download from nnvm.testing.darknet import __darknetffi__ -###################################################################### -# Set the parameters here. 
-# Supported models alexnet, resnet50, resnet152, extraction, yolo -# -model_name = 'yolo' -test_image = 'dog.jpg' -target = 'llvm' -ctx = tvm.cpu(0) +#Model name +MODEL_NAME = 'yolo' ###################################################################### -# Prepare cfg and weights file -# ---------------------------- -# Pretrained model available https://pjreddie.com/darknet/imagenet/ -# Download cfg and weights file first time. +# Download required files +# ----------------------- +# Download cfg and weights file if first time. +CFG_NAME = MODEL_NAME + '.cfg' +WEIGHTS_NAME = MODEL_NAME + '.weights' +REPO_URL = 'https://github.com/siju-samuel/darknet/blob/master/' +CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true' +WEIGHTS_URL = REPO_URL + 'weights/' + WEIGHTS_NAME + '?raw=true' + +download(CFG_URL, CFG_NAME) +download(WEIGHTS_URL, WEIGHTS_NAME) -cfg_name = model_name + '.cfg' -weights_name = model_name + '.weights' -cfg_url = 'https://github.com/siju-samuel/darknet/blob/master/cfg/' + \ - cfg_name + '?raw=true' -weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' - -download(cfg_url, cfg_name) -download(weights_url, weights_name) - -###################################################################### # Download and Load darknet library -# --------------------------------- - -darknet_lib = 'libdarknet.so' -darknetlib_url = 'https://github.com/siju-samuel/darknet/blob/master/lib/' + \ - darknet_lib + '?raw=true' -download(darknetlib_url, darknet_lib) - -#if the file doesnt exist, then exit normally. 
-if os.path.isfile('./' + darknet_lib) is False: - exit(0) - -darknet_lib = __darknetffi__.dlopen('./' + darknet_lib) -cfg = "./" + str(cfg_name) -weights = "./" + str(weights_name) -net = darknet_lib.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0) +if sys.platform in ['linux', 'linux2']: + DARKNET_LIB = 'libdarknet2.0.so' + DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true' +elif sys.platform == 'darwin': + DARKNET_LIB = 'libdarknet_mac2.0.so' + DARKNET_URL = REPO_URL + 'lib_osx/' + DARKNET_LIB + '?raw=true' +else: + err = "Darknet lib is not supported on {} platform".format(sys.platform) + raise NotImplementedError(err) + +download(DARKNET_URL, DARKNET_LIB) + +DARKNET_LIB = __darknetffi__.dlopen('./' + DARKNET_LIB) +cfg = "./" + str(CFG_NAME) +weights = "./" + str(WEIGHTS_NAME) +net = DARKNET_LIB.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0) dtype = 'float32' batch_size = 1 + print("Converting darknet to nnvm symbols...") sym, params = nnvm.frontend.darknet.from_darknet(net, dtype) @@ -77,7 +71,9 @@ # Compile the model on NNVM # ------------------------- # compile the model -data = np.empty([batch_size, net.c ,net.h, net.w], dtype); +target = 'llvm' +ctx = tvm.cpu(0) +data = np.empty([batch_size, net.c, net.h, net.w], dtype) shape = {'data': data.shape} print("Compiling the model...") with nnvm.compiler.build_config(opt_level=2): @@ -103,6 +99,7 @@ def save_lib(): ###################################################################### # Load a test image # -------------------------------------------------------------------- +test_image = 'dog.jpg' print("Loading the test image...") img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \ test_image +'?raw=true' @@ -134,7 +131,7 @@ def save_lib(): hier_thresh = 0.5 img = nnvm.testing.darknet.load_image_color(test_image) _, im_h, im_w = img.shape -probs= [] +probs = [] boxes = [] region_layer = net.layers[net.n - 1] boxes, probs = 
nnvm.testing.yolo2_detection.get_region_boxes(region_layer, im_w, im_h, net.w, net.h, @@ -157,5 +154,5 @@ def save_lib(): nnvm.testing.yolo2_detection.draw_detections(img, region_layer.w*region_layer.h*region_layer.n, thresh, boxes, probs, names, region_layer.classes) -plt.imshow(img.transpose(1,2,0)) +plt.imshow(img.transpose(1, 2, 0)) plt.show() From 447d7c1ece6120f161dabf8ff0265ca539497ece Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Mon, 20 Aug 2018 13:27:31 -0700 Subject: [PATCH 031/529] Improve x86 Inception (#1506) * Improve x86 pooling and concat * Fix * Fix test concatenate correct layout * Add conditional vectorize * Fix lint * Modify schedule for global pooling * Fix * Fix warning * Fix alter layout test * Remove vectorization for pooling when using 4D layout * Remove vectorization for 4D concat * Fix concatenate layout * Fix concatenate schedule * Fix concat * Fix lint * Fix concat * Simplify pooling logic * Update docstring * Fix test topi pooling * Small changes --- nnvm/python/nnvm/top/nn.py | 10 ++-- nnvm/python/nnvm/top/transform.py | 8 +++- nnvm/src/top/tensor/transform.cc | 24 ++++++++-- .../python/unittest/test_correct_layout.py | 21 +++++++-- topi/include/topi/nn/pooling.h | 12 ++--- topi/python/topi/cuda/pooling.py | 7 ++- topi/python/topi/generic/injective.py | 17 +++++++ topi/python/topi/generic/nn.py | 5 +- topi/python/topi/opengl/pooling.py | 7 ++- topi/python/topi/x86/injective.py | 46 +++++++++++++++++++ topi/python/topi/x86/pooling.py | 44 +++++++++++++++--- topi/tests/python/test_topi_pooling.py | 6 ++- 12 files changed, 174 insertions(+), 33 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index f9a2c2813a04..b452738123c3 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -280,20 +280,22 @@ def schedule_conv2d_transpose(attrs, outs, target): # max_pool2d @reg.register_schedule("max_pool2d") -def schedule_max_pool2d(_, outs, target): +def schedule_max_pool2d(attrs, outs, 
target): """Schedule definition of max_pool2d""" + layout = attrs["layout"] with tvm.target.create(target): - return topi.generic.schedule_pool(outs) + return topi.generic.schedule_pool(outs, layout) reg.register_pattern("max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool2d @reg.register_schedule("avg_pool2d") -def schedule_avg_pool2d(_, outs, target): +def schedule_avg_pool2d(attrs, outs, target): """Schedule definition of avg_pool2d""" + layout = attrs["layout"] with tvm.target.create(target): - return topi.generic.schedule_pool(outs) + return topi.generic.schedule_pool(outs, layout) reg.register_pattern("avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) diff --git a/nnvm/python/nnvm/top/transform.py b/nnvm/python/nnvm/top/transform.py index facb345c1abe..594007239d4a 100644 --- a/nnvm/python/nnvm/top/transform.py +++ b/nnvm/python/nnvm/top/transform.py @@ -2,6 +2,7 @@ """Tensor transformation ops""" from __future__ import absolute_import +import tvm import topi from .tensor import _fschedule_broadcast, _fschedule_injective from . 
import registry as reg @@ -58,8 +59,13 @@ def compute_reshape_like(attrs, inputs, out_info): reg.register_schedule("squeeze", _fschedule_injective) # concatenate +@reg.register_schedule("concatenate") +def schedule_concatenate(_, outs, target): + """Schedule definition of concatenate""" + with tvm.target.create(target): + return topi.generic.schedule_concatenate(outs) + reg.register_pattern("concatenate", OpPattern.INJECTIVE) -reg.register_schedule("concatenate", _fschedule_injective) # split reg.register_pattern("split", OpPattern.INJECTIVE) diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 52dca5654838..b1485438ca50 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -129,15 +129,31 @@ inline bool ConcatenateCorrectLayout(const NodeAttrs& attrs, std::vector *ilayouts, const std::vector *last_ilayouts, std::vector *olayouts) { + const ConcatenateParam& param = nnvm::get(attrs.parsed); CHECK_EQ(ilayouts->size(), last_ilayouts->size()); CHECK_EQ(olayouts->size(), 1U); - for (size_t i = 0; i < ilayouts->size(); ++i) { - const Layout& input = last_ilayouts->at(i).defined() ? 
- last_ilayouts->at(i) : ilayouts->at(i); - NNVM_ASSIGN_LAYOUT(*ilayouts, i, input); + Layout layout; + if (!ilayouts->at(0).defined()) { + layout = last_ilayouts->at(0); + } else if (param.axis >= static_cast(ilayouts->at(0).ndim())) { + CHECK(last_ilayouts->at(0).defined()) + << "Current input layout " << ilayouts->at(0) + << " is invalid but last input layout is not " + "defined for the first input."; + layout = last_ilayouts->at(0); + } else if (last_ilayouts->at(0).defined() + && ilayouts->at(0)[param.axis] + != last_ilayouts->at(0)[param.axis]) { + layout = last_ilayouts->at(0); + } else { + layout = ilayouts->at(0); } + for (size_t i = 0; i < ilayouts->size(); ++i) { + NNVM_ASSIGN_LAYOUT(*ilayouts, i, layout); + } + NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout); return true; } diff --git a/nnvm/tests/python/unittest/test_correct_layout.py b/nnvm/tests/python/unittest/test_correct_layout.py index 6176586284a7..8961498a579e 100644 --- a/nnvm/tests/python/unittest/test_correct_layout.py +++ b/nnvm/tests/python/unittest/test_correct_layout.py @@ -77,14 +77,25 @@ def test_concatenate(): g, ldict = correct_layout(z, {"x": "HW", "y": "HW"}) assert(ldict["x"][0] == "HW") assert(ldict["y"][0] == "HW") - assert(ldict["concat"][0] == "__undef__") + assert(ldict["concat"][0] == "HW") # second pass will insert layout transform _, ldict = correct_layout(g, {"x": "HW16w", "y": "HW16w"}) assert(ldict["x"][0] == "HW16w") assert(ldict["y"][0] == "HW16w") - assert(ldict["x_HW"][0] == "HW") - assert(ldict["y_HW"][0] == "HW") - assert(ldict["concat"][0] == "__undef__") + assert(ldict["concat"][0] == "HW16w") + + x1 = sym.Variable("x", shape=(10, 20, 60)) + x2 = sym.Variable("y", shape=(10, 20, 40)) + z = sym.concatenate(x1, x2, axis=2, name="concat") + g, ldict = correct_layout(z, {"x": "H20wW", "y": "H20wW"}) + assert(ldict["x"][0] == "H20wW") + assert(ldict["y"][0] == "H20wW") + assert(ldict["concat"][0] == "H20wW") + # second pass will insert layout transform + _, ldict = 
correct_layout(g, {"x": "HW", "y": "HW"}) + assert(ldict["x_H20wW"][0] == "H20wW") + assert(ldict["x_H20wW"][0] == "H20wW") + assert(ldict["concat"][0] == "H20wW") def test_expand_dims(): @@ -349,4 +360,4 @@ def test_reduce(): test_transpose() test_broadcast_to() test_broadcast_binary() - test_reduce() \ No newline at end of file + test_reduce() diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h index 26d61d42991d..ca318adfe6cb 100644 --- a/topi/include/topi/nn/pooling.h +++ b/topi/include/topi/nn/pooling.h @@ -112,18 +112,18 @@ inline Tensor pool_impl(const Tensor& x, }, "tensor", "pool_max"); } else if (pool_type == kAvgPool) { auto temp = do_pad ? pad(x, pad_before, pad_after, 0, "pad_temp") : x; - auto tsum = tvm::compute(out_shape, [&](const Array& output) { + auto tavg = [&](const Array& output, Expr divide_factor) { Array indices; for (const Var& var : output) indices.push_back(var); indices.Set(height_axis, output[height_axis] * stride_height + dheight); indices.Set(width_axis, output[width_axis] * stride_width + dwidth); - return tvm::sum(temp(indices), { dheight, dwidth }); - }, "tensor", "pool_avg"); + return tvm::sum(temp(indices) / divide_factor, { dheight, dwidth }); + }; return tvm::compute(out_shape, [&](const Array& output) { if (count_include_pad) { - return tsum(output) / (kernel_height * kernel_width); + return tavg(output, kernel_height * kernel_width); } else { Expr h_start = output[height_axis] * stride_height - pad_top; Expr w_start = output[width_axis] * stride_width - pad_left; @@ -133,9 +133,9 @@ inline Tensor pool_impl(const Tensor& x, w_start = ir::Max::make(w_start, make_const(Int(32), 0)); Expr divide_factor = ir::Max::make((h_end - h_start) * (w_end - w_start), make_const(Int(32), 1)); - return tsum(output) / divide_factor; + return tavg(output, divide_factor); } - }, "tensor", kElementWise); + }, "tensor", "pool_avg"); } else { LOG(ERROR) << "Unrecognized pool_type: " << pool_type; return x; diff --git 
a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py index 637f664fbd36..6b36e9a8743f 100644 --- a/topi/python/topi/cuda/pooling.py +++ b/topi/python/topi/cuda/pooling.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, unused-variable +# pylint: disable=invalid-name, unused-variable, unused-argument """Schedule for pooling operators""" import tvm from .. import tag @@ -70,7 +70,7 @@ def traverse(OP): @generic.schedule_pool.register(["cuda", "gpu"]) -def schedule_pool(outs): +def schedule_pool(outs, layout): """Schedule for pool. Parameters @@ -79,6 +79,9 @@ def schedule_pool(outs): The computation graph description of pool in the format of an array of tensors. + layout: str + Data layout. + Returns ------- s: Schedule diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index 0a9e394661af..975e4c11ea41 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -29,5 +29,22 @@ def schedule_injective(outs): s[x].fuse(s[x].op.axis) return s +@tvm.target.generic_func +def schedule_concatenate(outs): + """Schedule for concatenate op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return schedule_injective(outs) + schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 1e01adb899b7..874decc792ec 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -282,7 +282,7 @@ def schedule_dense(outs): @tvm.target.override_native_generic_func("schedule_pool") -def schedule_pool(outs): +def schedule_pool(outs, layout): """Schedule for pool Parameters @@ -291,6 +291,9 @@ def schedule_pool(outs): The computation graph description of pool in the format of an array of tensors. 
+ layout: str + Data layout. + Returns ------- sch: Schedule diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py index 8195ea91d8a6..d6dbf0eac5c2 100644 --- a/topi/python/topi/opengl/pooling.py +++ b/topi/python/topi/opengl/pooling.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, unused-variable +# pylint: disable=invalid-name, unused-variable, unused-argument """Schedule for pooling operators""" import tvm from .. import tag @@ -54,7 +54,7 @@ def traverse(OP): @generic.schedule_pool.register(["opengl"]) -def schedule_pool(outs): +def schedule_pool(outs, layout): """Schedule for pool. Parameters @@ -63,6 +63,9 @@ def schedule_pool(outs): The computation graph description of pool in the format of an array of tensors. + layout: str + Data layout. + Returns ------- s: Schedule diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py index b43ebb98b82f..ac552903ad7f 100644 --- a/topi/python/topi/x86/injective.py +++ b/topi/python/topi/x86/injective.py @@ -33,5 +33,51 @@ def schedule_injective(outs): s[x].parallel(s[x].op.axis[0]) return s +@generic.schedule_concatenate.register(["cpu"]) +def schedule_concatenate(outs): + """X86 schedule for concatenate op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of injective in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + def vectorize(sch, tensor, vectorize_limit): + """Internal vectorization function for concatenate.""" + inner_axis = s[tensor].op.axis[len(s[tensor].op.axis) - 1] + inner_length = tensor.shape[len(tensor.shape) - 1].value + if inner_length <= vectorize_limit: + sch[tensor].vectorize(inner_axis) + else: + split_factor = 1 + for i in range(vectorize_limit, 1, -1): + if inner_length % i == 0: + split_factor = i + break + if split_factor > 1: + _, inner_i = sch[tensor].split(inner_axis, split_factor) + sch[tensor].vectorize(inner_i) + + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + x = outs[0] + s = tvm.create_schedule([x.op for x in outs]) + tvm.schedule.AutoInlineInjective(s) + if len(s[x].op.axis) >= 5: + fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2]) + vectorize(s, x, 64) + s[x].parallel(fused) + elif len(s[x].op.axis) >= 3: + fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1]) + s[x].parallel(fused) + else: + s[x].parallel(s[x].op.axis[0]) + return s + schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py index 998edf7a0e16..5fce5f32afb6 100644 --- a/topi/python/topi/x86/pooling.py +++ b/topi/python/topi/x86/pooling.py @@ -4,19 +4,47 @@ from .. import generic from .. 
import tag -def _parallel_sch(sch): +def _parallel_sch(sch, oshape, do_vectorize=False): + def vectorize(fused_axis, num_parallel_axis, vectorize_limit=64): + """Internal vectorization utility function.""" + reorder_axis = [fused_axis] + for i in range(num_parallel_axis, len(sch.op.axis) - 1): + reorder_axis.append(sch.op.axis[i]) + kw, kh = sch.op.reduce_axis + fuse_k = sch.fuse(kw, kh) + c = sch.op.axis[len(sch.op.axis) - 1] + reorder_axis += [fuse_k, c] + sch.reorder(*reorder_axis) + inner_length = oshape[len(oshape) - 1].value + if inner_length <= vectorize_limit: + sch.vectorize(c) + else: + split_factor = 1 + for i in range(vectorize_limit, 1, -1): + if inner_length % i == 0: + split_factor = i + break + if split_factor > 1: + _, c_i = sch.split(c, split_factor) + sch.vectorize(c_i) + if len(sch.op.axis) >= 5: fused = sch.fuse(sch.op.axis[0], sch.op.axis[1], sch.op.axis[2]) - sch.parallel(fused) + if do_vectorize: + vectorize(fused, 3) + elif len(sch.op.axis) >= 3: fused = sch.fuse(sch.op.axis[0], sch.op.axis[1]) - sch.parallel(fused) + if do_vectorize: + vectorize(fused, 2) else: sch.parallel(sch.op.axis[0]) + return + sch.parallel(fused) @generic.schedule_pool.register(["cpu"]) -def schedule_pool(outs): +def schedule_pool(outs, layout): """Schedule for pool Parameters @@ -25,6 +53,9 @@ def schedule_pool(outs): The computation graph description of pool in the format of an array of tensors. + layout: str + Data layout. 
+ Returns ------- sch: Schedule @@ -37,7 +68,8 @@ def schedule_pool(outs): def _schedule(PaddedInput, Pool): if isinstance(PaddedInput.op, tvm.tensor.ComputeOp): s[PaddedInput].compute_inline() - _parallel_sch(s[Pool]) + do_vectorize = layout[-1] not in "HWhw" + _parallel_sch(s[Pool], outs[0].shape, do_vectorize) def traverse(OP): """Internal travserse function""" @@ -93,7 +125,7 @@ def traverse(OP): # schedule pool elif OP.tag.startswith('global_pool'): Pool = OP.output(0) - _parallel_sch(s[Pool]) + _parallel_sch(s[Pool], outs[0].shape) else: raise RuntimeError("Unsupported operator: %s" % OP.tag) diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py index c9f790146b4a..b87795743c4c 100644 --- a/topi/tests/python/test_topi_pooling.py +++ b/topi/tests/python/test_topi_pooling.py @@ -10,9 +10,11 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_ kw = kh sw = sh pt, pl, pb, pr = padding + layout = "NCHW" A = tvm.placeholder((n, ic, ih, iw), name='A') B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding, - pool_type=pool_type, ceil_mode=ceil_mode, count_include_pad=count_include_pad) + pool_type=pool_type, ceil_mode=ceil_mode, + layout="NCHW", count_include_pad=count_include_pad) B = topi.nn.relu(B) dtype = A.dtype @@ -54,7 +56,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool(B) + s = topi.generic.schedule_pool(B, layout) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) From 75bc72422b923c9f3da981c67b07b8d54eca5a93 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 20 Aug 2018 16:28:28 -0700 Subject: [PATCH 032/529] [VERSION] Update to 0.5.dev (#1623) * [VERSION] Update to 0.5.dev * Update the docs to include all intrins --- NEWS.md | 63 +++++++++++++++++++++++++++ conda/nnvm/meta.yaml | 2 +- conda/topi/meta.yaml | 2 +- 
conda/tvm-libs/meta.yaml | 2 +- conda/tvm/meta.yaml | 2 +- docs/api/python/intrin.rst | 6 +++ include/tvm/runtime/c_runtime_api.h | 2 +- python/tvm/_ffi/libinfo.py | 6 ++- python/update_version.py | 66 +++++++++++++++++++++++++++++ web/tvm_runtime.js | 2 +- 10 files changed, 145 insertions(+), 8 deletions(-) create mode 100644 python/update_version.py diff --git a/NEWS.md b/NEWS.md index 567aabf3fcbd..2c2f616cb2f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,69 @@ Refer to the Roadmap issue for complete list on on-going version features. If you check in something that is not reflected in Roadmap issue, please reply to that issue so it can get added. +## 0.4 + +This release features several major improvements. The high-level graph optimizer is now part of TVM repo. Some of the highlights are: Initial support of AutoTVM for automated optimization; customized accelerator backend VTA. + +- Tensor operator primitives + - Introduce attrs field to operator primitives(e.g. compute) to store additional metadata, the attrs can be used as hint for scheduling +- Enable embedding of asm micro-kernels +- Hybrid python programming model + - python AST based IR builder interface + - support GPU programs +- AutoTVM, Automated tuning, and scheduling + - basic autotvm infra + - GPU IR verifier + - basic autotuning tutorial + - topi integration +- ARM support + - winograd support + - initial support of ARM autotuning records +- TOPI Vision + - Generic GPU sort support(useful for vision) + - SSD operator support +- TOPI numpy consistency + - Rename all binary operators for numpy consistecy: broadcast_add-> add, broadcast_sub -> substract, broadcast_mul -> multiply, broadcast_div->divide + - New operators: slice, LRN, equal, not_equal, less, greater + - tutorials on topi +- Initial low-bit operator support support + - Optimized popcount generation on ARM + - general bit-serial convolution and GEMM + - optimized low bit kernels + - parallel optimization +- New topi backend optimization 
for intel graphics +- Adapt AVX schedules for SSE target +- VTA: customized accelerator backend + - custom hardware backend example + - tutorials on how to use customized accelerator +- Initial experimental support for HLS backend +- Bugfix in SPIRV code generator for vulkan +- libdevice support, enable NVPTX backend +- Introduce NDArrayContainer for managed NDarray +- RPC and Device API + - Support communication between big/small endian machines. + - RPC and device API protocol upgrade (this is a non-backward compatible change) to support big-small endian communication. This is a non-backward compatible change, need to use the latest version of TVM runtime with the RPC + - graduate rpc from contrib, tvm.contrib.rpc->tvm.rpc + -Support tracker in Android RPC, add fault tolerance for AutoTVM +- BIG.LITTLE aware threadpool +- tvm4j graph runtime that runs end to end workload in java +- DLPack support + - Support from_dlpack and to_dlpack + - Enables bridges to pytorch +- Enable link of stackvm in runtime +- Tensorflow graphdef frontend +- Keras frontend + - improved to support reuse layers, add activations +- ONNX + - gather, LRN +- CoreML frontend + - Support C-RNN and activation functions +- Fix grads for sum and expand_like +- Enhanced operator fusion for multiple elemwise branches +- Separate nnvm fusion and compilation pass +- Unified build system to cmake, customizable cmake path for vulkan, rocm, cuda + + ## 0.3 This release features numerous improvements in TOPI and backends. We make the first step toward object detection support in TOPI, featuring operators necessary for YOLO and SSDs. The topi now supports numpy-style API and operator overloading. RPC is significantly improved to support resource allocation and using a pool of devices. We are adding two new backends: WebGL for running GPUs on the browser, and Vulkan for running on next-generation graphics API. 
diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml index a8b47d0de118..9c045c177ff6 100644 --- a/conda/nnvm/meta.yaml +++ b/conda/nnvm/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.4.dev" %} +{% set version = "0.5.dev" %} package: name: nnvm diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml index af2fb4fd4228..4002f577863b 100644 --- a/conda/topi/meta.yaml +++ b/conda/topi/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.4.dev" %} +{% set version = "0.5.dev" %} package: name: topi diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml index dbdfd4a7701f..d6902c45a693 100644 --- a/conda/tvm-libs/meta.yaml +++ b/conda/tvm-libs/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.4.dev" %} +{% set version = "0.5.dev" %} package: name: tvm-libs diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml index 478e095322eb..fe53b7dd49d9 100644 --- a/conda/tvm/meta.yaml +++ b/conda/tvm/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.4.dev" %} +{% set version = "0.5.dev" %} package: name: tvm diff --git a/docs/api/python/intrin.rst b/docs/api/python/intrin.rst index 3942c57f1a04..59f695196ce8 100644 --- a/docs/api/python/intrin.rst +++ b/docs/api/python/intrin.rst @@ -6,7 +6,10 @@ tvm.intrin tvm.call_packed tvm.call_pure_intrin + tvm.call_intrin tvm.call_pure_extern + tvm.call_extern + tvm.call_llvm_intrin tvm.register_intrin_rule tvm.exp tvm.log @@ -18,7 +21,10 @@ tvm.intrin .. autofunction:: tvm.call_packed .. autofunction:: tvm.call_pure_intrin +.. autofunction:: tvm.call_intrin .. autofunction:: tvm.call_pure_extern +.. autofunction:: tvm.call_extern +.. autofunction:: tvm.call_llvm_intrin .. autofunction:: tvm.register_intrin_rule .. autofunction:: tvm.exp .. 
autofunction:: tvm.log diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 32d574340052..52499fb9186f 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -43,7 +43,7 @@ #endif // TVM version -#define TVM_VERSION "0.4.0" +#define TVM_VERSION "0.5.dev" // TVM Runtime is DLPack compatible. diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py index 390849f8536d..f911829d38b1 100644 --- a/python/tvm/_ffi/libinfo.py +++ b/python/tvm/_ffi/libinfo.py @@ -100,5 +100,7 @@ def find_lib_path(name=None, search_path=None, optional=False): # current version -# We use the version of the incoming release for code that is under development -__version__ = "0.4.0" +# We use the version of the incoming release for code +# that is under development. +# The following line is set by tvm/python/update_version.py +__version__ = "0.5.dev" diff --git a/python/update_version.py b/python/update_version.py new file mode 100644 index 000000000000..9e958f109479 --- /dev/null +++ b/python/update_version.py @@ -0,0 +1,66 @@ +""" +This is the global script that set the version information of TVM. 
+This script runs and update all the locations that related to versions + +List of affected files: +- tvm-root/python/tvm/_ffi/libinfo.py +- tvm-root/include/tvm/runtime/c_runtime_api.h +- tvm-root/web/tvm_runtime.js +- tvm-root/conda/tvm/meta.yaml +- tvm-root/conda/topi/meta.yaml +- tvm-root/conda/nnvm/meta.yaml +- tvm-root/conda/tvm-libs/meta.yaml +""" +import os +import re +# current version +# We use the version of the incoming release for code +# that is under development +__version__ = "0.5.dev" + +# Implementations +def update(file_name, pattern, repl): + update = [] + hit_counter = 0 + need_update = False + for l in open(file_name): + result = re.findall(pattern, l) + if result: + assert len(result) == 1 + hit_counter += 1 + if result[0] != repl: + l = re.sub(pattern, repl, l) + need_update = True + print("%s: %s->%s" % (file_name, result[0], repl)) + else: + print("%s: version is already %s" % (file_name, repl)) + + update.append(l) + if hit_counter != 1: + raise RuntimeError("Cannot find version in %s" % file_name) + + if need_update: + with open(file_name, "w") as output_file: + for l in update: + output_file.write(l) + + +def main(): + curr_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) + proj_root = os.path.abspath(os.path.join(curr_dir, "..")) + # python path + update(os.path.join(proj_root, "python", "tvm", "_ffi", "libinfo.py"), + r"(?<=__version__ = \")[.0-9a-z]+", __version__) + # C++ header + update(os.path.join(proj_root, "include", "tvm", "runtime", "c_runtime_api.h"), + "(?<=TVM_VERSION \")[.0-9a-z]+", __version__) + # conda + for path in ["tvm", "topi", "nnvm", "tvm-libs"]: + update(os.path.join(proj_root, "conda", path, "meta.yaml"), + "(?<=version = \")[.0-9a-z]+", __version__) + # web + update(os.path.join(proj_root, "web", "tvm_runtime.js"), + "(?<=@version )[.0-9a-z]+", __version__) + +if __name__ == "__main__": + main() diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index 786745d3ce88..2eab15093b72 100644 --- 
a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -2,7 +2,7 @@ * TVM Javascript web runtime library. * * @projectname tvm - * @version 0.1 + * @version 0.5.dev */ /* eslint no-unused-vars: "off" */ /* eslint no-unexpected-multiline: "off" */ From c9c7d186d9f51966857ac307a97df6a3cd1b7976 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 21 Aug 2018 12:40:23 -0500 Subject: [PATCH 033/529] Add int8 gemm recipe (#1614) --- topi/recipe/gemm/gemm_int8.py | 185 ++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 topi/recipe/gemm/gemm_int8.py diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py new file mode 100644 index 000000000000..61ef97d0a2bf --- /dev/null +++ b/topi/recipe/gemm/gemm_int8.py @@ -0,0 +1,185 @@ +"Example code to perform int8 GEMM" +import logging +import sys +import numpy as np +import tvm +from tvm import autotvm + +DO_TUNING = True +PRETUNED_INDEX = 75333 + +def intrin_dot(): + n = 4 # dp4a requires operands packed by 4 + x = tvm.placeholder((n,), name='x', dtype='int8') + y = tvm.placeholder((n,), name='y', dtype='int8') + k = tvm.reduce_axis((0, n), name='k') + + z = tvm.compute( + (1,), lambda _: tvm.sum( + x[k].astype('int32') * y[k].astype('int32'), axis=k)) + + def intrin_func(ins, outs): + xx, yy = ins + zz = outs[0] + ib = tvm.ir_builder.create() + + dp4a = zz.vstore(0, tvm.call_pure_extern('int32', '__dp4a', + xx.vload(0, dtype='int8x4'), + yy.vload(0, dtype='int8x4'), + zz.vload(0))) + ib.emit(dp4a) + + body = ib.get() + return body, zz.vstore(0, 0), body + + with tvm.build_config(data_alignment=4, offset_factor=1) as cfg: + binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name, + data_alignment=cfg.data_alignment, + offset_factor=cfg.offset_factor, + scope='local') for t in [x, y, z]} + return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds) + + +dot = intrin_dot() + + +@autotvm.template +def gemm_int8(n, m, l): + A = tvm.placeholder((n, l), name='A', dtype='int8') + B = 
tvm.placeholder((m, l), name='B', dtype='int8') + + k = tvm.reduce_axis((0, l), name='k') + C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype( + 'int32'), axis=k), name='C') + + cfg = autotvm.get_config() + s = tvm.create_schedule(C.op) + y, x = C.op.axis + + AA = s.cache_read(A, 'shared', [C]) + BB = s.cache_read(B, 'shared', [C]) + AL = s.cache_read(AA, 'local', [C]) + BL = s.cache_read(BB, 'local', [C]) + CC = s.cache_write(C, 'local') + + k = CC.op.reduce_axis[0] + + cfg.define_split('tile_k', cfg.axis(k), num_outputs=3, + filter=lambda entity: entity.size[2] == 4 and \ + entity.size[0] * 2 >= entity.size[1]) + + ko, kt, ki = cfg['tile_k'].apply(s, CC, k) + + s[CC].tensorize(ki, dot) + + block_x = tvm.thread_axis('blockIdx.x') + block_y = tvm.thread_axis('blockIdx.y') + thread_x = tvm.thread_axis('threadIdx.x') + thread_y = tvm.thread_axis('threadIdx.y') + + def block_size_filter(entity): + return entity.size[0] * 2 >= entity.size[1] * 2 and \ + entity.size[1] <= 16 and entity.size[3] <= 4 + cfg.define_split('tile_y', cfg.axis(y), num_outputs=4, filter=block_size_filter) + cfg.define_split('tile_x', cfg.axis(x), num_outputs=4, filter=block_size_filter) + by, tyz, ty, yi = cfg['tile_y'].apply(s, C, y) + bx, txz, tx, xi = cfg['tile_x'].apply(s, C, x) + + s[C].bind(by, block_y) + s[C].bind(bx, block_x) + s[C].bind(tyz, tvm.thread_axis('vthread')) + s[C].bind(txz, tvm.thread_axis('vthread')) + s[C].bind(ty, thread_y) + s[C].bind(tx, thread_x) + s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi) + + s[CC].compute_at(s[C], tx) + + yo, xo = CC.op.axis + s[CC].reorder(ko, kt, yo, xo, ki) + s[CC].unroll(kt) + + for stage in [AL, BL]: + s[stage].compute_at(s[CC], kt) + _, xi = s[stage].split(stage.op.axis[1], factor=4) + s[stage].vectorize(xi) + s[stage].double_buffer() + + cfg.define_knob('storage_align', [16, 48]) + for stage in [AA, BB]: + s[stage].storage_align(s[stage].op.axis[0], + cfg['storage_align'].val, 0) + 
s[stage].compute_at(s[CC], ko) + + fused = s[stage].fuse(*s[stage].op.axis) + ty, tx = s[stage].split(fused, nparts=cfg['tile_y'].size[2]) + tx, xi = s[stage].split(tx, nparts=cfg['tile_x'].size[2]) + _, xi = s[stage].split(xi, factor=16) + + s[stage].bind(ty, thread_y) + s[stage].bind(tx, thread_x) + s[stage].vectorize(xi) + + cfg.define_knob('auto_unroll_max_step', [512, 1500]) + s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + s[C].pragma(by, 'unroll_explicit', False) + + cfg.add_flop(n*m*l*2) + return s, [A, B, C] + + +if __name__ == '__main__': + N = 2048 + n = m = l = N + + logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) + task = autotvm.task.create(gemm_int8, args=(n, m, l), target='cuda') + print(task.config_space) + + measure_option = autotvm.measure_option( + measure_func='local', number=10, n_parallel=8, timeout=20) + log_name = 'gemm_int8.log' + if DO_TUNING: + tuner = autotvm.tuner.XGBTuner(task) + tuner.tune(n_trial=1000, measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file(log_name)]) + + dispatch_context = autotvm.apply_history_best(log_name) + best_config = dispatch_context.query(task.target, task.workload) + print('\nBest config:') + print(best_config) + else: + config = task.config_space.get(PRETUNED_INDEX) + dispatch_context = autotvm.task.ApplyConfig(config) + print("Using pretuned config:") + print(config) + + with dispatch_context: + with tvm.target.create('cuda'): + s, arg_bufs = gemm_int8(n, m, l) + f = tvm.build(s, arg_bufs, 'cuda', name='gemm_int8') + + ctx = tvm.context('cuda', 0) + + a_np = np.random.randint(size=(n, l), low=-128, high=127, dtype='int8') + b_np = np.random.randint(size=(m, l), low=-128, high=127, dtype='int8') + + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros((n, m), dtype='int32'), ctx) + f(a, b, c) + + np.testing.assert_allclose( + c.asnumpy(), + np.dot( + a_np.astype('int32'), + b_np.T.astype('int32')), + rtol=1e-5) + + 
num_ops = 2 * l * m * n + num_runs = 1000 + timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs) + t = timer_f(a, b, c).mean + GOPS = num_ops / (t * 1e3) / 1e6 + print("average time cost of %d runs = %g ms, %g GOPS." % + (num_runs, t * 1e3, GOPS)) From d879066af12d0ac09a875913675060413483b792 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 21 Aug 2018 16:35:59 -0700 Subject: [PATCH 034/529] [RUNTIME] Add TypedPackedFunc (#1626) --- include/tvm/runtime/packed_func.h | 251 ++++++++++++++++++++++++++++++ tests/cpp/packed_func_test.cc | 23 +++ 2 files changed, 274 insertions(+) diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 63e8ca7cd16b..758d03b5b18b 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -118,6 +118,163 @@ class PackedFunc { FType body_; }; +/*! + * \brief Please refer to \ref TypedPackedFuncAnchor "TypedPackedFunc" + */ +template +class TypedPackedFunc; + +/*! + * \anchor TypedPackedFuncAnchor + * \brief A PackedFunc wrapper to provide typed function signature. + * It is backed by a PackedFunc internally. + * + * TypedPackedFunc enables compile time type checking. + * TypedPackedFunc works with the runtime system: + * - It can be passed as an argument of PackedFunc. + * - It can be assigned to TVMRetValue. + * - It can be directly converted to a type-erased PackedFunc. + * + * Developers should prefer TypedPackedFunc over PackedFunc in C++ code + * as it enables compile time checking. + * We can construct a TypedPackedFunc from a lambda function + * with the same signature. + * + * \code + * // user defined lambda function. + * auto addone = [](int x)->int { + * return x + 1; + * }; + * // We can directly convert + * // lambda function to TypedPackedFunc + * TypedPackedFunc ftyped(addone); + * // invoke the function. 
+ * int y = ftyped(1); + * // Can be directly converted to PackedFunc + * PackedFunc packed = ftyped; + * \endcode + * \tparam R The return value of the function. + * \tparam Args The argument signature of the function. + */ +template +class TypedPackedFunc { + public: + /*! \brief short hand for this function type */ + using TSelf = TypedPackedFunc; + /*! \brief default constructor */ + TypedPackedFunc() {} + /*! + * \brief construct by wrapping a PackedFunc + * + * Example usage: + * \code + * PackedFunc packed([](TVMArgs args, TVMRetValue *rv) { + * int x = args[0]; + * *rv = x + 1; + * }); + * // construct from packed function + * TypedPackedFunc ftyped(packed); + * // call the typed version. + * CHECK_EQ(ftyped(1), 2); + * \endcode + * + * \param packed The packed function + */ + explicit TypedPackedFunc(PackedFunc packed) + : packed_(packed) { + } + /*! + * \brief construct from a lambda function with the same signature. + * + * Example usage: + * \code + * auto typed_lambda = [](int x)->int { return x + 1; }; + * // construct from the typed lambda + * TypedPackedFunc ftyped(typed_lambda); + * // call the typed version. + * CHECK_EQ(ftyped(1), 2); + * \endcode + * + * \param typed_lambda typed lambda function. + * \tparam FLambda the type of the lambda function. + */ + template + >::value>::type> + explicit TypedPackedFunc(const FLambda& typed_lambda) { + this->AssignTypedLambda(typed_lambda); + } + /*! + * \brief copy assignment operator from typed lambda + * + * Example usage: + * \code + * // default-construct, then assign a typed lambda + * TypedPackedFunc ftyped; + * ftyped = [](int x) { return x + 1; }; + * // call the typed version. + * CHECK_EQ(ftyped(1), 2); + * \endcode + * + * \param typed_lambda typed lambda function. + * \tparam FLambda the type of the lambda function. + * \returns reference to self. + */ + template + >::value>::type> + TSelf& operator=(FLambda typed_lambda) { // NOLINT(*) + this->AssignTypedLambda(typed_lambda); + return *this; + } + /*! 
+ * \brief copy assignment operator from PackedFunc. + * \param packed The packed function. + * \returns reference to self. + */ + TSelf& operator=(PackedFunc packed) { + packed_ = packed; + return *this; + } + /*! + * \brief Invoke the operator. + * \param args The arguments + * \returns The return value. + */ + inline R operator()(Args ...args) const; + /*! + * \brief convert to PackedFunc + * \return the internal PackedFunc + */ + operator PackedFunc() const { + return packed(); + } + /*! + * \return reference to the internal PackedFunc + */ + const PackedFunc& packed() const { + return packed_; + } + + private: + friend class TVMRetValue; + /*! \brief The internal packed function */ + PackedFunc packed_; + /*! + * \brief Assign the packed field using a typed lambda function. + * + * \param flambda The lambda function. + * \tparam FLambda The lambda function type. + * \note We capture the lambda when possible for maximum efficiency. + */ + template + inline void AssignTypedLambda(FLambda flambda); +}; + /*! \brief Arguments into TVM functions. 
*/ class TVMArgs { public: @@ -361,6 +518,10 @@ class TVMArgValue : public TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle); return *ptr(); } + template + operator TypedPackedFunc() const { + return TypedPackedFunc(operator PackedFunc()); + } operator Module() const { TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle); return *ptr(); @@ -446,6 +607,10 @@ class TVMRetValue : public TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle); return *ptr(); } + template + operator TypedPackedFunc() const { + return TypedPackedFunc(operator PackedFunc()); + } operator Module() const { TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle); return *ptr(); @@ -512,6 +677,10 @@ class TVMRetValue : public TVMPODValue_ { this->SwitchToClass(kFuncHandle, f); return *this; } + template + TVMRetValue& operator=(const TypedPackedFunc& f) { + return operator=(f.packed()); + } TVMRetValue& operator=(Module m) { this->SwitchToClass(kModuleHandle, m); return *this; @@ -847,6 +1016,10 @@ class TVMArgsSetter { values_[i].v_handle = const_cast(&value); type_codes_[i] = kFuncHandle; } + template + void operator()(size_t i, const TypedPackedFunc& value) const { // NOLINT(*) + operator()(i, value.packed()); + } void operator()(size_t i, const Module& value) const { // NOLINT(*) values_[i].v_handle = const_cast(&value); type_codes_[i] = kModuleHandle; @@ -894,6 +1067,84 @@ inline TVMRetValue PackedFunc::operator()(Args&& ...args) const { return rv; } +namespace detail { +template +struct unpack_call_dispatcher { + template + static void run(const F& f, + const TVMArgs& args_pack, + TVMRetValue* rv, + Args&&... unpacked_args) { + unpack_call_dispatcher + ::run(f, args_pack, rv, + std::forward(unpacked_args)..., + args_pack[index]); + } +}; + +template +struct unpack_call_dispatcher { + template + static void run(const F& f, + const TVMArgs& args_pack, + TVMRetValue* rv, + Args&&... 
unpacked_args) { + *rv = R(f(std::forward(unpacked_args)...)); + } +}; + +template +struct unpack_call_dispatcher { + template + static void run(const F& f, + const TVMArgs& args_pack, + TVMRetValue* rv, + Args&&... unpacked_args) { + f(std::forward(unpacked_args)...); + } +}; + +template +inline void unpack_call(const F& f, const TVMArgs& args, TVMRetValue* rv) { + unpack_call_dispatcher::run(f, args, rv); +} + +template +inline R call_packed(const PackedFunc& pf, Args&& ...args) { + return R(pf(std::forward(args)...)); +} + +template +struct typed_packed_call_dispatcher { + template + static inline R run(const PackedFunc& pf, Args&& ...args) { + return pf(std::forward(args)...); + } +}; + +template<> +struct typed_packed_call_dispatcher { + template + static inline void run(const PackedFunc& pf, Args&& ...args) { + pf(std::forward(args)...); + } +}; +} // namespace detail + +template +template +inline void TypedPackedFunc::AssignTypedLambda(FType flambda) { + packed_ = PackedFunc([flambda](const TVMArgs& args, TVMRetValue* rv) { + detail::unpack_call(flambda, args, rv); + }); +} + +template +inline R TypedPackedFunc::operator()(Args... 
args) const { + return detail::typed_packed_call_dispatcher + ::run(packed_, std::forward(args)...); +} + // extension and node type handling namespace detail { template diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index 9b2f1df73731..abe26fabe9ea 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -135,6 +135,29 @@ TEST(PackedFunc, Type) { CHECK(get_type2("float32x2").operator Type() == Float(32, 2)); } +TEST(TypedPackedFunc, HighOrder) { + using namespace tvm; + using namespace tvm::runtime; + using Int1Func = TypedPackedFunc; + using Int2Func = TypedPackedFunc; + using BindFunc = TypedPackedFunc; + BindFunc ftyped; + ftyped = [](Int2Func f1, int value) -> Int1Func { + auto binded = [f1, value](int x) { + return f1(value, x); + }; + Int1Func x(binded); + return x; + }; + auto add = [](int x, int y) { return x + y; }; + CHECK_EQ(ftyped(Int2Func(add), 1)(2), 3); + PackedFunc f = ftyped(Int2Func(add), 1); + CHECK_EQ(f(3).operator int(), 4); + // call the type erased version. 
+ Int1Func f1 = ftyped.packed()(Int2Func(add), 1); + CHECK_EQ(f1(3), 4); +} + // new namespoace namespace test { // register int vector as extension type From 7e520f2983ad5e9b1480c1a6dc905d713e79170d Mon Sep 17 00:00:00 2001 From: eqy Date: Tue, 21 Aug 2018 18:35:03 -0700 Subject: [PATCH 035/529] check in (#1629) --- .../java/ml/dmlc/tvm/tvmrpc/MainActivity.java | 24 ++++--------------- .../app/src/main/res/layout/content_main.xml | 11 --------- .../app/src/main/res/values/strings.xml | 3 +-- 3 files changed, 5 insertions(+), 33 deletions(-) diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java index d80008bbe258..2ea4e4cb7528 100644 --- a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java +++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java @@ -39,11 +39,9 @@ public class MainActivity extends AppCompatActivity { - private boolean skipRelaunch = true; // wait time before automatic restart of RPC Activity public static final int HANDLER_RESTART_DELAY = 5000; - private void showDialog(String title, String msg) { AlertDialog.Builder builder = new AlertDialog.Builder(this); builder.setTitle(title); @@ -91,7 +89,7 @@ private void setupRelaunch() { final Runnable rPCStarter = new Runnable() { public void run() { if (switchPersistent.isChecked()) { - System.err.println("relaunching RPC activity in 5s..."); + System.err.println("relaunching RPC activity..."); Intent intent = ((MainActivity) context).updateRPCPrefs(); startActivity(intent); } @@ -116,6 +114,7 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) { if (isChecked) { System.err.println("automatic RPC restart enabled..."); updateRPCPrefs(); + setupRelaunch(); } else { System.err.println("automatic RPC restart disabled..."); updateRPCPrefs(); @@ -123,29 +122,14 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) { 
} }); - Button startRPC = findViewById(R.id.button_start_rpc); - startRPC.setOnClickListener(new View.OnClickListener() { - public void onClick(View v) { - Intent intent = ((MainActivity) context).updateRPCPrefs(); - startActivity(intent); - } - }); - enableInputView(true); } @Override protected void onResume() { System.err.println("MainActivity onResume..."); - System.err.println("skipRelaunch: " + skipRelaunch); - // if this is the first time onResume is called, do nothing, otherwise we - // may double launch - if (!skipRelaunch) { - enableInputView(true); - setupRelaunch(); - } else { - skipRelaunch = false; - } + enableInputView(true); + setupRelaunch(); super.onResume(); } diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml index 82be44d98451..69c1f76030df 100644 --- a/apps/android_rpc/app/src/main/res/layout/content_main.xml +++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml @@ -78,15 +78,4 @@ android:textOn="@string/switch_on" /> - -