From 1041a8c7381e4ea65c10b181fcf7d6c9de1c5f99 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 17 Nov 2022 15:25:05 -0800
Subject: [PATCH 1/7] [TOPI] Add handwritten matvec for dynamic cases

Add a handwritten matrix-vector multiplication implementation for dynamic
cases on cpu. This avoids crashing when a dynamic shape is present.
---
 python/tvm/relay/op/strategy/x86.py         | 16 ++++++++
 python/tvm/topi/x86/dense.py                | 41 +++++++++++++++++++++
 tests/python/topi/python/test_topi_dense.py |  3 ++
 3 files changed, 60 insertions(+)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 10d7fbb3a926..f5592ba20565 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -507,10 +507,26 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
     return strategy
 
 
+def is_any(shape):
+    return any([isinstance(x, tir.Any) or isinstance(x, tir.SizeVar) for x in shape])
+
+
 @dense_strategy.register("cpu")
 def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
+
     strategy = _op.OpStrategy()
+    # For dynamic shapes we use a hand written kernel. Right now it only
+    # supports matrix-vector multiplication.
+    if is_any(inputs[0].shape) or is_any(inputs[1].shape):
+        strategy.add_implementation(
+            wrap_compute_dense(topi.x86.dense_dynamic),
+            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
+            name="dense_dynamic.x86",
+            plevel=20,
+        )
+        return strategy
+
     same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
     dtype = inputs[0].dtype
     u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index 8ddb8d7a5c9a..ed992ffc7e20 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -480,3 +480,44 @@ def matmul_dnnl(
 def schedule_matmul_dnnl(_, outs):
     """Create schedule for matmul_dnnl."""
     return generic.schedule_extern(outs)
+
+
+def dense_dynamic(A, B, bias, dtype):
+    """Compute for dense with dynamic shape"""
+
+    # Right now we only support matrix-vector multiplication with lhs as the
+    # vector. We don't need to do much optimization here because the access
+    # pattern and parallelization are straight forward.
+    def gen_ir(a, b, bias, c):
+        ib = tvm.tir.ir_builder.create()
+        A = ib.buffer_ptr(a)
+        B = ib.buffer_ptr(b)
+        C = ib.buffer_ptr(c)
+        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
+            if bias is None:
+                C[0, j] = 0.0
+            else:
+                C[0, j] = bias[j]
+            with ib.for_range(0, b.shape[1], name="k") as k:
+                C[0, j] += A[0, k] * B[j, k]
+        return ib.get()
+
+    out_shape = (A.shape[0], B.shape[0])
+    out_buf = tvm.tir.decl_buffer(out_shape, dtype, "out_buf")
+    out = te.extern(
+        [out_shape],
+        [A, B, bias],
+        lambda ins, outs: gen_ir(*ins, *outs),
+        dtype=dtype,
+        out_buffers=[out_buf],
+        name="dense_dynamic_cpu",
+        tag="dense_dynamic_cpu",
+    )
+    return out
+
+
+def schedule_dense_dynamic(outs):
+    """Create schedule for dense_dynamic."""
+    s = te.create_schedule([o.op for o in outs])
+    return s
+    return generic.schedule_extern(outs)
diff --git a/tests/python/topi/python/test_topi_dense.py b/tests/python/topi/python/test_topi_dense.py
index 7e65e2449fd7..8f6523366878 100644
--- a/tests/python/topi/python/test_topi_dense.py
+++ b/tests/python/topi/python/test_topi_dense.py
@@ -45,6 +45,7 @@
     "cpu": [
         (topi.x86.dense_nopack, topi.x86.schedule_dense_nopack),
         (topi.x86.dense_pack, topi.x86.schedule_dense_pack),
+        (topi.x86.dense_dynamic, topi.x86.schedule_dense_dynamic),
     ],
     "gpu": [
         (topi.gpu.dense_small_batch, topi.gpu.schedule_dense_small_batch),
@@ -136,6 +137,8 @@ def test_dense(
     implementations = tvm.topi.testing.dispatch(target, _dense_implementations)
 
     for fcompute, fschedule in implementations:
+        if fcompute == topi.x86.dense_dynamic and (batch_size != 1 or in_dtype != "float32"):
+            continue
         with tvm.target.Target(target):
             D = fcompute(A, B, C if use_bias else None, out_dtype)
             D = topi.nn.relu(D)

From 6dae5395ad20adbd8edf562db67b77d690eacc81 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 17 Nov 2022 15:48:26 -0800
Subject: [PATCH 2/7] rename is_any to has_dynamic_shape, check for vector lhs

---
 python/tvm/relay/op/strategy/x86.py | 4 ++--
 python/tvm/topi/x86/dense.py        | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index f5592ba20565..53915f42c84d 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -507,7 +507,7 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
     return strategy
 
 
-def is_any(shape):
+def is_dynamic_shape(shape):
     return any([isinstance(x, tir.Any) or isinstance(x, tir.SizeVar) for x in shape])
 
 
@@ -518,7 +518,7 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
     # For dynamic shapes we use a hand written kernel. Right now it only
    # supports matrix-vector multiplication.
-    if is_any(inputs[0].shape) or is_any(inputs[1].shape):
+    if is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
             wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index ed992ffc7e20..8bc6d1e9bdb3 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -485,6 +485,8 @@ def schedule_matmul_dnnl(_, outs):
 
 def dense_dynamic(A, B, bias, dtype):
     """Compute for dense with dynamic shape"""
+    assert A[0].shape == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"
+
     # Right now we only support matrix-vector multiplication with lhs as the
     # vector. We don't need to do much optimization here because the access
     # pattern and parallelization are straight forward.

From f13312a78ae61002f37be594032f2948f9aac7a9 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 17 Nov 2022 15:57:46 -0800
Subject: [PATCH 3/7] whoops

---
 python/tvm/topi/x86/dense.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index 8bc6d1e9bdb3..e4b13c4ed941 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -520,6 +520,4 @@ def gen_ir(a, b, bias, c):
 
 def schedule_dense_dynamic(outs):
     """Create schedule for dense_dynamic."""
-    s = te.create_schedule([o.op for o in outs])
-    return s
     return generic.schedule_extern(outs)

From 52697e7cc990e02564fed611192b2a9ec8b78e39 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 17 Nov 2022 15:58:13 -0800
Subject: [PATCH 4/7] lint

---
 python/tvm/relay/op/strategy/x86.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 53915f42c84d..14902697d09a 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -508,7 +508,7 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
 
 
 def is_dynamic_shape(shape):
-    return any([isinstance(x, tir.Any) or isinstance(x, tir.SizeVar) for x in shape])
+    return any([isinstance(x, (tir.Any, tir.SizeVar)) for x in shape])
 
 
 @dense_strategy.register("cpu")

From 60429c34001f5bb0d57bdf21ed319521c85b00fb Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Fri, 18 Nov 2022 09:55:44 -0800
Subject: [PATCH 5/7] handle bias correctly, only dispatch to handwritten kernel for matvec

---
 python/tvm/relay/op/strategy/x86.py |  5 ++-
 python/tvm/topi/x86/dense.py        | 49 ++++++++++++++++++++---------
 2 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 14902697d09a..976ebd111221 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -516,9 +516,8 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
 
     strategy = _op.OpStrategy()
-    # For dynamic shapes we use a hand written kernel. Right now it only
-    # supports matrix-vector multiplication.
-    if is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape):
+    # For dynamic matrix-vector multiply we use a hand written kernel.
+    if inputs[0].shape[0] == 1 and is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
             wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index e4b13c4ed941..65a803781a57 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -485,36 +485,55 @@ def schedule_matmul_dnnl(_, outs):
 
 def dense_dynamic(A, B, bias, dtype):
     """Compute for dense with dynamic shape"""
-    assert A[0].shape == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"
+    assert A.shape[0] == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"
 
     # Right now we only support matrix-vector multiplication with lhs as the
     # vector. We don't need to do much optimization here because the access
     # pattern and parallelization are straight forward.
-    def gen_ir(a, b, bias, c):
+    def gen_ir(a, b, c):
         ib = tvm.tir.ir_builder.create()
         A = ib.buffer_ptr(a)
         B = ib.buffer_ptr(b)
         C = ib.buffer_ptr(c)
         with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
-            if bias is None:
-                C[0, j] = 0.0
-            else:
-                C[0, j] = bias[j]
+            C[0, j] = 0.0
+            with ib.for_range(0, b.shape[1], name="k") as k:
+                C[0, j] += A[0, k] * B[j, k]
+        return ib.get()
+
+    def gen_ir_bias(a, b, bias, c):
+        ib = tvm.tir.ir_builder.create()
+        A = ib.buffer_ptr(a)
+        B = ib.buffer_ptr(b)
+        C = ib.buffer_ptr(c)
+        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
+            C[0, j] = bias[j]
             with ib.for_range(0, b.shape[1], name="k") as k:
                 C[0, j] += A[0, k] * B[j, k]
         return ib.get()
 
     out_shape = (A.shape[0], B.shape[0])
     out_buf = tvm.tir.decl_buffer(out_shape, dtype, "out_buf")
-    out = te.extern(
-        [out_shape],
-        [A, B, bias],
-        lambda ins, outs: gen_ir(*ins, *outs),
-        dtype=dtype,
-        out_buffers=[out_buf],
-        name="dense_dynamic_cpu",
-        tag="dense_dynamic_cpu",
-    )
+    if bias is None:
+        out = te.extern(
+            [out_shape],
+            [A, B],
+            lambda ins, outs: gen_ir(*ins, *outs),
+            dtype=dtype,
+            out_buffers=[out_buf],
+            name="dense_dynamic_cpu",
+            tag="dense_dynamic_cpu",
+        )
+    else:
+        out = te.extern(
+            [out_shape],
+            [A, B, bias],
+            lambda ins, outs: gen_ir_bias(*ins, *outs),
+            dtype=dtype,
+            out_buffers=[out_buf],
+            name="dense_dynamic_cpu",
+            tag="dense_dynamic_cpu",
+        )
     return out

From b02713e77be272efb138abe9a2dd9f793e6ea410 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Fri, 18 Nov 2022 10:08:14 -0800
Subject: [PATCH 6/7] formatting

---
 python/tvm/relay/op/strategy/x86.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 976ebd111221..d6fe2a9ce0b4 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -517,7 +517,11 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
 
     strategy = _op.OpStrategy()
     # For dynamic matrix-vector multiply we use a hand written kernel.
-    if inputs[0].shape[0] == 1 and is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape):
+    if (
+        inputs[0].shape[0] == 1
+        and is_dynamic_shape(inputs[0].shape)
+        or is_dynamic_shape(inputs[1].shape)
+    ):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
             wrap_topi_schedule(topi.x86.schedule_dense_dynamic),

From e69073757daa0198272c0d8e5524e76a69f69284 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Mon, 21 Nov 2022 10:10:37 -0800
Subject: [PATCH 7/7] fix conditional for dispatch to implementation

---
 python/tvm/relay/op/strategy/x86.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index d6fe2a9ce0b4..897f7c4e588f 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -518,9 +518,9 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
     strategy = _op.OpStrategy()
     # For dynamic matrix-vector multiply we use a hand written kernel.
     if (
-        inputs[0].shape[0] == 1
-        and is_dynamic_shape(inputs[0].shape)
-        or is_dynamic_shape(inputs[1].shape)
+        isinstance(inputs[0].shape[0], (int, tir.IntImm))
+        and inputs[0].shape[0] == 1
+        and (is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape))
     ):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
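As a quick sanity check on what the handwritten kernel computes, the sketch below restates the same semantics in NumPy. It is not part of the patches and the names are purely illustrative; it only assumes that dense takes the weight in (N, K) layout, which is what the loop bounds b.shape[0] and b.shape[1] in gen_ir imply, and that the optional bias seeds the accumulator the way gen_ir_bias does.

    import numpy as np

    def dense_dynamic_ref(a, b, bias=None):
        # a: (1, K) row vector, b: (N, K) weight matrix, bias: optional (N,) vector.
        # Mirrors the TIR loops: C[0, j] = bias[j] + sum_k A[0, k] * B[j, k]
        assert a.shape[0] == 1, "only the matrix-vector case is handled"
        out = a @ b.T
        if bias is not None:
            out = out + bias
        return out

    a = np.random.rand(1, 8).astype("float32")
    b = np.random.rand(5, 8).astype("float32")
    bias = np.random.rand(5).astype("float32")
    print(dense_dynamic_ref(a, b, bias).shape)  # (1, 5)

Splitting the biased and unbiased paths into gen_ir and gen_ir_bias in patch 5 also keeps the te.extern input list matched to what each IR builder actually reads, instead of passing a possibly-None bias tensor as an extern input the way the first patch did.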
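The last two commits only touch the dispatch condition, and the grouping is the subtle part: in Python, a and b or c parses as (a and b) or c, so the patch 5/6 form could still pick the matrix-vector kernel when only the weight shape was dynamic and the LHS batch was not 1. The toy functions below (plain Python, no TVM; the names are made up for illustration) show the difference the explicit parenthesization in patch 7 makes; the added isinstance(inputs[0].shape[0], (int, tir.IntImm)) guard additionally ensures the == 1 comparison is only evaluated on a concrete dimension, not on a symbolic one.

    def dispatch_before_fix(batch_is_one, lhs_dynamic, rhs_dynamic):
        # Mirrors: inputs[0].shape[0] == 1 and is_dynamic_shape(lhs) or is_dynamic_shape(rhs)
        return batch_is_one and lhs_dynamic or rhs_dynamic

    def dispatch_after_fix(batch_is_one, lhs_dynamic, rhs_dynamic):
        # Mirrors: ... and inputs[0].shape[0] == 1 and (is_dynamic_shape(lhs) or is_dynamic_shape(rhs))
        return batch_is_one and (lhs_dynamic or rhs_dynamic)

    # Static LHS with batch != 1, dynamic weight matrix:
    print(dispatch_before_fix(False, False, True))  # True: the matvec kernel would be selected
    print(dispatch_after_fix(False, False, True))   # False: falls through to the static dense strategies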