diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 10d7fbb3a926..897f7c4e588f 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -507,10 +507,29 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
     return strategy
 
 
+def is_dynamic_shape(shape):
+    return any([isinstance(x, (tir.Any, tir.SizeVar)) for x in shape])
+
+
 @dense_strategy.register("cpu")
 def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
+
     strategy = _op.OpStrategy()
+    # For dynamic matrix-vector multiply we use a hand written kernel.
+    if (
+        isinstance(inputs[0].shape[0], (int, tir.IntImm))
+        and inputs[0].shape[0] == 1
+        and (is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape))
+    ):
+        strategy.add_implementation(
+            wrap_compute_dense(topi.x86.dense_dynamic),
+            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
+            name="dense_dynamic.x86",
+            plevel=20,
+        )
+        return strategy
+
     same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
     dtype = inputs[0].dtype
     u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index 8ddb8d7a5c9a..65a803781a57 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -480,3 +480,63 @@ def matmul_dnnl(
 def schedule_matmul_dnnl(_, outs):
     """Create schedule for matmul_dnnl."""
     return generic.schedule_extern(outs)
+
+
+def dense_dynamic(A, B, bias, dtype):
+    """Compute for dense with dynamic shape"""
+
+    assert A.shape[0] == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"
+
+    # Right now we only support matrix-vector multiplication with lhs as the
+    # vector. We don't need to do much optimization here because the access
+    # pattern and parallelization are straightforward.
+    def gen_ir(a, b, c):
+        ib = tvm.tir.ir_builder.create()
+        A = ib.buffer_ptr(a)
+        B = ib.buffer_ptr(b)
+        C = ib.buffer_ptr(c)
+        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
+            C[0, j] = 0.0
+            with ib.for_range(0, b.shape[1], name="k") as k:
+                C[0, j] += A[0, k] * B[j, k]
+        return ib.get()
+
+    def gen_ir_bias(a, b, bias, c):
+        ib = tvm.tir.ir_builder.create()
+        A = ib.buffer_ptr(a)
+        B = ib.buffer_ptr(b)
+        C = ib.buffer_ptr(c)
+        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
+            C[0, j] = bias[j]
+            with ib.for_range(0, b.shape[1], name="k") as k:
+                C[0, j] += A[0, k] * B[j, k]
+        return ib.get()
+
+    out_shape = (A.shape[0], B.shape[0])
+    out_buf = tvm.tir.decl_buffer(out_shape, dtype, "out_buf")
+    if bias is None:
+        out = te.extern(
+            [out_shape],
+            [A, B],
+            lambda ins, outs: gen_ir(*ins, *outs),
+            dtype=dtype,
+            out_buffers=[out_buf],
+            name="dense_dynamic_cpu",
+            tag="dense_dynamic_cpu",
+        )
+    else:
+        out = te.extern(
+            [out_shape],
+            [A, B, bias],
+            lambda ins, outs: gen_ir_bias(*ins, *outs),
+            dtype=dtype,
+            out_buffers=[out_buf],
+            name="dense_dynamic_cpu",
+            tag="dense_dynamic_cpu",
+        )
+    return out
+
+
+def schedule_dense_dynamic(outs):
+    """Create schedule for dense_dynamic."""
+    return generic.schedule_extern(outs)
diff --git a/tests/python/topi/python/test_topi_dense.py b/tests/python/topi/python/test_topi_dense.py
index 7e65e2449fd7..8f6523366878 100644
--- a/tests/python/topi/python/test_topi_dense.py
+++ b/tests/python/topi/python/test_topi_dense.py
@@ -45,6 +45,7 @@
     "cpu": [
         (topi.x86.dense_nopack, topi.x86.schedule_dense_nopack),
         (topi.x86.dense_pack, topi.x86.schedule_dense_pack),
+        (topi.x86.dense_dynamic, topi.x86.schedule_dense_dynamic),
     ],
     "gpu": [
         (topi.gpu.dense_small_batch, topi.gpu.schedule_dense_small_batch),
@@ -136,6 +137,8 @@ def test_dense(
 
     implementations = tvm.topi.testing.dispatch(target, _dense_implementations)
     for fcompute, fschedule in implementations:
+        if fcompute == topi.x86.dense_dynamic and (batch_size != 1 or in_dtype != "float32"):
+            continue
         with tvm.target.Target(target):
             D = fcompute(A, B, C if use_bias else None, out_dtype)
             D = topi.nn.relu(D)
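
Usage sketch (reviewer note, not part of the patch). This is a minimal example of driving the new kernel directly through TOPI, mirroring the compute/schedule/build flow used in test_topi_dense.py. The "llvm" target, the 128/64 sizes, and the variable names are illustrative assumptions; only the dense_dynamic/schedule_dense_dynamic calls come from the patch itself.

# Minimal sketch: matrix-vector multiply with a dynamic reduction dimension.
import numpy as np
import tvm
from tvm import te, topi

n = te.var("n")  # dynamic inner dimension shared by the vector LHS and matrix RHS
A = te.placeholder((1, n), name="A", dtype="float32")    # 1 x n vector LHS
B = te.placeholder((128, n), name="B", dtype="float32")  # out_dim x n matrix RHS

with tvm.target.Target("llvm"):
    D = topi.x86.dense_dynamic(A, B, None, "float32")   # no bias
    s = topi.x86.schedule_dense_dynamic([D])

func = tvm.build(s, [A, B, D], "llvm")
dev = tvm.cpu(0)
a = tvm.nd.array(np.random.rand(1, 64).astype("float32"), dev)    # n bound to 64 at runtime
b = tvm.nd.array(np.random.rand(128, 64).astype("float32"), dev)
d = tvm.nd.array(np.zeros((1, 128), dtype="float32"), dev)
func(a, b, d)
# The kernel computes C[0, j] = sum_k A[0, k] * B[j, k], i.e. a @ b.T.
np.testing.assert_allclose(d.numpy(), a.numpy() @ b.numpy().T, rtol=1e-5)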