19 changes: 19 additions & 0 deletions python/tvm/relay/op/strategy/x86.py
@@ -507,10 +507,29 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
    return strategy


def is_dynamic_shape(shape):
    return any([isinstance(x, (tir.Any, tir.SizeVar)) for x in shape])


@dense_strategy.register("cpu")
def dense_strategy_cpu(attrs, inputs, out_type, target):
    """dense x86 strategy"""

    strategy = _op.OpStrategy()
    # For dynamic matrix-vector multiply we use a hand written kernel.
    if (
        isinstance(inputs[0].shape[0], (int, tir.IntImm))
        and inputs[0].shape[0] == 1
        and (is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape))
    ):
        strategy.add_implementation(
            wrap_compute_dense(topi.x86.dense_dynamic),
            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
            name="dense_dynamic.x86",
            plevel=20,
        )
        return strategy

    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
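Note (not part of the diff): the new branch fires only when the LHS has a static single row (inputs[0].shape[0] is an int or tir.IntImm equal to 1) and at least one operand shape contains a dynamic dimension (tir.Any or tir.SizeVar); in that case only the dynamic implementation is registered and the strategy returns early. Below is a minimal sketch of a Relay graph that should take this path on a CPU target, assuming a TVM build with the Relay VM; the shapes and the 64-row weight are illustrative, not taken from the PR.

import numpy as np
import tvm
from tvm import relay

# Single-row LHS and a weight whose output dimension is dynamic (relay.Any()),
# so dense_strategy_cpu should select dense_dynamic.x86.
x = relay.var("x", shape=(1, 128), dtype="float32")
w = relay.var("w", shape=(relay.Any(), 128), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x, w], relay.nn.dense(x, w)))

exe = relay.vm.compile(mod, target="llvm")           # the VM handles the dynamic shape
vm = tvm.runtime.vm.VirtualMachine(exe, tvm.cpu())
out = vm.invoke(
    "main",
    np.random.rand(1, 128).astype("float32"),
    np.random.rand(64, 128).astype("float32"),       # concrete shape supplied at run time
)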
60 changes: 60 additions & 0 deletions python/tvm/topi/x86/dense.py
@@ -480,3 +480,63 @@ def matmul_dnnl(
def schedule_matmul_dnnl(_, outs):
    """Create schedule for matmul_dnnl."""
    return generic.schedule_extern(outs)


def dense_dynamic(A, B, bias, dtype):
    """Compute for dense with dynamic shape"""

    assert A.shape[0] == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"

    # Right now we only support matrix-vector multiplication with lhs as the
    # vector. We don't need to do much optimization here because the access
    # pattern and parallelization are straightforward.
    def gen_ir(a, b, c):
        ib = tvm.tir.ir_builder.create()
        A = ib.buffer_ptr(a)
        B = ib.buffer_ptr(b)
        C = ib.buffer_ptr(c)
        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
            C[0, j] = 0.0
            with ib.for_range(0, b.shape[1], name="k") as k:
                C[0, j] += A[0, k] * B[j, k]
        return ib.get()

    def gen_ir_bias(a, b, bias, c):
        ib = tvm.tir.ir_builder.create()
        A = ib.buffer_ptr(a)
        B = ib.buffer_ptr(b)
        C = ib.buffer_ptr(c)
        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
            C[0, j] = bias[j]
            with ib.for_range(0, b.shape[1], name="k") as k:
                C[0, j] += A[0, k] * B[j, k]
        return ib.get()

    out_shape = (A.shape[0], B.shape[0])
    out_buf = tvm.tir.decl_buffer(out_shape, dtype, "out_buf")
    if bias is None:
        out = te.extern(
            [out_shape],
            [A, B],
            lambda ins, outs: gen_ir(*ins, *outs),
            dtype=dtype,
            out_buffers=[out_buf],
            name="dense_dynamic_cpu",
            tag="dense_dynamic_cpu",
        )
    else:
        out = te.extern(
            [out_shape],
            [A, B, bias],
            lambda ins, outs: gen_ir_bias(*ins, *outs),
            dtype=dtype,
            out_buffers=[out_buf],
            name="dense_dynamic_cpu",
            tag="dense_dynamic_cpu",
        )
    return out


def schedule_dense_dynamic(outs):
    """Create schedule for dense_dynamic."""
    return generic.schedule_extern(outs)
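Note (not part of the diff): the ir_builder kernel parallelizes over the output features (rows of B), seeding C[0, j] with zero or bias[j] and accumulating the dot product over k, i.e. out = A @ B.T (+ bias). A hedged standalone sketch of driving the new compute/schedule pair directly, mirroring what the updated topi test does; the static shapes below are illustrative only.

import numpy as np
import tvm
from tvm import te, topi

A = te.placeholder((1, 128), name="A", dtype="float32")
B = te.placeholder((64, 128), name="B", dtype="float32")

with tvm.target.Target("llvm"):
    D = topi.x86.dense_dynamic(A, B, None, "float32")   # no bias
    s = topi.x86.schedule_dense_dynamic([D])
func = tvm.build(s, [A, B, D], "llvm")

dev = tvm.cpu()
a = tvm.nd.array(np.random.rand(1, 128).astype("float32"), dev)
b = tvm.nd.array(np.random.rand(64, 128).astype("float32"), dev)
d = tvm.nd.array(np.zeros((1, 64), dtype="float32"), dev)
func(a, b, d)
np.testing.assert_allclose(d.numpy(), a.numpy() @ b.numpy().T, rtol=1e-5)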
3 changes: 3 additions & 0 deletions tests/python/topi/python/test_topi_dense.py
@@ -45,6 +45,7 @@
"cpu": [
(topi.x86.dense_nopack, topi.x86.schedule_dense_nopack),
(topi.x86.dense_pack, topi.x86.schedule_dense_pack),
(topi.x86.dense_dynamic, topi.x86.schedule_dense_dynamic),
],
"gpu": [
(topi.gpu.dense_small_batch, topi.gpu.schedule_dense_small_batch),
@@ -136,6 +137,8 @@ def test_dense(
    implementations = tvm.topi.testing.dispatch(target, _dense_implementations)

    for fcompute, fschedule in implementations:
        if fcompute == topi.x86.dense_dynamic and (batch_size != 1 or in_dtype != "float32"):
            continue
        with tvm.target.Target(target):
            D = fcompute(A, B, C if use_bias else None, out_dtype)
            D = topi.nn.relu(D)
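Note (not part of the diff): the test reuses the existing dense harness, so dense_dynamic is only exercised for the configurations it supports (batch_size == 1 and float32); other parameter combinations skip it and fall through to the remaining implementations. The numeric behaviour the harness checks against is plain dense semantics; a small numpy reference sketch follows (names are illustrative, not from the test file).

import numpy as np

def dense_ref(a, b, bias=None):
    # a: (1, in_dim) row vector, b: (out_dim, in_dim) weight matrix
    out = a @ b.T
    if bias is not None:
        out = out + bias          # bias broadcast over the single output row
    return out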