From 1041a8c7381e4ea65c10b181fcf7d6c9de1c5f99 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 17 Nov 2022 15:25:05 -0800
Subject: [PATCH 1/7] [TOPI] Add handwritten matvec for dynamic cases

Add a handwritten matrix-vector multiplication implementation for dynamic
cases on cpu. This avoids crashing when a dynamic shape is present.
---
 python/tvm/relay/op/strategy/x86.py         | 16 ++++++++
 python/tvm/topi/x86/dense.py                | 41 +++++++++++++++++++++
 tests/python/topi/python/test_topi_dense.py |  3 ++
 3 files changed, 60 insertions(+)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 10d7fbb3a926..f5592ba20565 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -507,10 +507,26 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
     return strategy
 
 
+def is_any(shape):
+    return any([isinstance(x, tir.Any) or isinstance(x, tir.SizeVar) for x in shape])
+
+
 @dense_strategy.register("cpu")
 def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
+
     strategy = _op.OpStrategy()
+    # For dynamic shapes we use a hand written kernel. Right now it only
+    # supports matrix-vector multiplication.
+    if is_any(inputs[0].shape) or is_any(inputs[1].shape):
+        strategy.add_implementation(
+            wrap_compute_dense(topi.x86.dense_dynamic),
+            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
+            name="dense_dynamic.x86",
+            plevel=20,
+        )
+        return strategy
+
     same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
     dtype = inputs[0].dtype
     u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index 8ddb8d7a5c9a..ed992ffc7e20 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -480,3 +480,44 @@ def matmul_dnnl(
 def schedule_matmul_dnnl(_, outs):
     """Create schedule for matmul_dnnl."""
     return generic.schedule_extern(outs)
+
+
+def dense_dynamic(A, B, bias, dtype):
+    """Compute for dense with dynamic shape"""
+
+    # Right now we only support matrix-vector multiplication with lhs as the
+    # vector. We don't need to do much optimization here because the access
+    # pattern and parallelization are straight forward.
+    def gen_ir(a, b, bias, c):
+        ib = tvm.tir.ir_builder.create()
+        A = ib.buffer_ptr(a)
+        B = ib.buffer_ptr(b)
+        C = ib.buffer_ptr(c)
+        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
+            if bias is None:
+                C[0, j] = 0.0
+            else:
+                C[0, j] = bias[j]
+            with ib.for_range(0, b.shape[1], name="k") as k:
+                C[0, j] += A[0, k] * B[j, k]
+        return ib.get()
+
+    out_shape = (A.shape[0], B.shape[0])
+    out_buf = tvm.tir.decl_buffer(out_shape, dtype, "out_buf")
+    out = te.extern(
+        [out_shape],
+        [A, B, bias],
+        lambda ins, outs: gen_ir(*ins, *outs),
+        dtype=dtype,
+        out_buffers=[out_buf],
+        name="dense_dynamic_cpu",
+        tag="dense_dynamic_cpu",
+    )
+    return out
+
+
+def schedule_dense_dynamic(outs):
+    """Create schedule for dense_dynamic."""
+    s = te.create_schedule([o.op for o in outs])
+    return s
+    return generic.schedule_extern(outs)
diff --git a/tests/python/topi/python/test_topi_dense.py b/tests/python/topi/python/test_topi_dense.py
index 7e65e2449fd7..8f6523366878 100644
--- a/tests/python/topi/python/test_topi_dense.py
+++ b/tests/python/topi/python/test_topi_dense.py
@@ -45,6 +45,7 @@
     "cpu": [
         (topi.x86.dense_nopack, topi.x86.schedule_dense_nopack),
         (topi.x86.dense_pack, topi.x86.schedule_dense_pack),
+        (topi.x86.dense_dynamic, topi.x86.schedule_dense_dynamic),
     ],
     "gpu": [
         (topi.gpu.dense_small_batch, topi.gpu.schedule_dense_small_batch),
@@ -136,6 +137,8 @@ def test_dense(
     implementations = tvm.topi.testing.dispatch(target, _dense_implementations)
 
     for fcompute, fschedule in implementations:
+        if fcompute == topi.x86.dense_dynamic and (batch_size != 1 or in_dtype != "float32"):
+            continue
         with tvm.target.Target(target):
             D = fcompute(A, B, C if use_bias else None, out_dtype)
             D = topi.nn.relu(D)

From 6dae5395ad20adbd8edf562db67b77d690eacc81 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 17 Nov 2022 15:48:26 -0800
Subject: [PATCH 2/7] rename is_any to has_dynamic_shape, check for vector lhs

---
 python/tvm/relay/op/strategy/x86.py | 4 ++--
 python/tvm/topi/x86/dense.py        | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index f5592ba20565..53915f42c84d 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -507,7 +507,7 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
     return strategy
 
 
-def is_any(shape):
+def is_dynamic_shape(shape):
     return any([isinstance(x, tir.Any) or isinstance(x, tir.SizeVar) for x in shape])
 
 
@@ -518,7 +518,7 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
     # For dynamic shapes we use a hand written kernel. Right now it only
    # supports matrix-vector multiplication.
-    if is_any(inputs[0].shape) or is_any(inputs[1].shape):
+    if is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
             wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index ed992ffc7e20..8bc6d1e9bdb3 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -485,6 +485,8 @@ def schedule_matmul_dnnl(_, outs):
 
 def dense_dynamic(A, B, bias, dtype):
     """Compute for dense with dynamic shape"""
+    assert A[0].shape == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"
+
     # Right now we only support matrix-vector multiplication with lhs as the
     # vector. We don't need to do much optimization here because the access
     # pattern and parallelization are straight forward.

From f13312a78ae61002f37be594032f2948f9aac7a9 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 17 Nov 2022 15:57:46 -0800
Subject: [PATCH 3/7] whoops

---
 python/tvm/topi/x86/dense.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index 8bc6d1e9bdb3..e4b13c4ed941 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -520,6 +520,4 @@ def gen_ir(a, b, bias, c):
 
 def schedule_dense_dynamic(outs):
     """Create schedule for dense_dynamic."""
-    s = te.create_schedule([o.op for o in outs])
-    return s
     return generic.schedule_extern(outs)

From 52697e7cc990e02564fed611192b2a9ec8b78e39 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 17 Nov 2022 15:58:13 -0800
Subject: [PATCH 4/7] lint

---
 python/tvm/relay/op/strategy/x86.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 53915f42c84d..14902697d09a 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -508,7 +508,7 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
 
 
 def is_dynamic_shape(shape):
-    return any([isinstance(x, tir.Any) or isinstance(x, tir.SizeVar) for x in shape])
+    return any([isinstance(x, (tir.Any, tir.SizeVar)) for x in shape])
 
 
 @dense_strategy.register("cpu")

From 60429c34001f5bb0d57bdf21ed319521c85b00fb Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Fri, 18 Nov 2022 09:55:44 -0800
Subject: [PATCH 5/7] handle bias correctly, only dispatch to handwritten kernel for matvec

---
 python/tvm/relay/op/strategy/x86.py |  5 ++-
 python/tvm/topi/x86/dense.py        | 49 ++++++++++++++++++++---------
 2 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 14902697d09a..976ebd111221 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -516,9 +516,8 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
 
     strategy = _op.OpStrategy()
-    # For dynamic shapes we use a hand written kernel. Right now it only
-    # supports matrix-vector multiplication.
-    if is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape):
+    # For dynamic matrix-vector multiply we use a hand written kernel.
+    if inputs[0].shape[0] == 1 and is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
             wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index e4b13c4ed941..65a803781a57 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -485,36 +485,55 @@ def schedule_matmul_dnnl(_, outs):
 
 def dense_dynamic(A, B, bias, dtype):
     """Compute for dense with dynamic shape"""
-    assert A[0].shape == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"
+    assert A.shape[0] == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"
 
     # Right now we only support matrix-vector multiplication with lhs as the
     # vector. We don't need to do much optimization here because the access
     # pattern and parallelization are straight forward.
-    def gen_ir(a, b, bias, c):
+    def gen_ir(a, b, c):
         ib = tvm.tir.ir_builder.create()
         A = ib.buffer_ptr(a)
         B = ib.buffer_ptr(b)
         C = ib.buffer_ptr(c)
         with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
-            if bias is None:
-                C[0, j] = 0.0
-            else:
-                C[0, j] = bias[j]
+            C[0, j] = 0.0
+            with ib.for_range(0, b.shape[1], name="k") as k:
+                C[0, j] += A[0, k] * B[j, k]
+        return ib.get()
+
+    def gen_ir_bias(a, b, bias, c):
+        ib = tvm.tir.ir_builder.create()
+        A = ib.buffer_ptr(a)
+        B = ib.buffer_ptr(b)
+        C = ib.buffer_ptr(c)
+        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
+            C[0, j] = bias[j]
             with ib.for_range(0, b.shape[1], name="k") as k:
                 C[0, j] += A[0, k] * B[j, k]
         return ib.get()
 
     out_shape = (A.shape[0], B.shape[0])
     out_buf = tvm.tir.decl_buffer(out_shape, dtype, "out_buf")
-    out = te.extern(
-        [out_shape],
-        [A, B, bias],
-        lambda ins, outs: gen_ir(*ins, *outs),
-        dtype=dtype,
-        out_buffers=[out_buf],
-        name="dense_dynamic_cpu",
-        tag="dense_dynamic_cpu",
-    )
+    if bias is None:
+        out = te.extern(
+            [out_shape],
+            [A, B],
+            lambda ins, outs: gen_ir(*ins, *outs),
+            dtype=dtype,
+            out_buffers=[out_buf],
+            name="dense_dynamic_cpu",
+            tag="dense_dynamic_cpu",
+        )
+    else:
+        out = te.extern(
+            [out_shape],
+            [A, B, bias],
+            lambda ins, outs: gen_ir_bias(*ins, *outs),
+            dtype=dtype,
+            out_buffers=[out_buf],
+            name="dense_dynamic_cpu",
+            tag="dense_dynamic_cpu",
+        )
     return out

From b02713e77be272efb138abe9a2dd9f793e6ea410 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Fri, 18 Nov 2022 10:08:14 -0800
Subject: [PATCH 6/7] formatting

---
 python/tvm/relay/op/strategy/x86.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 976ebd111221..d6fe2a9ce0b4 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -517,7 +517,11 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
 
     strategy = _op.OpStrategy()
     # For dynamic matrix-vector multiply we use a hand written kernel.
-    if inputs[0].shape[0] == 1 and is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape):
+    if (
+        inputs[0].shape[0] == 1
+        and is_dynamic_shape(inputs[0].shape)
+        or is_dynamic_shape(inputs[1].shape)
+    ):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
             wrap_topi_schedule(topi.x86.schedule_dense_dynamic),

From e69073757daa0198272c0d8e5524e76a69f69284 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Mon, 21 Nov 2022 10:10:37 -0800
Subject: [PATCH 7/7] fix conditional for dispatch to implementation

---
 python/tvm/relay/op/strategy/x86.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index d6fe2a9ce0b4..897f7c4e588f 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -518,9 +518,9 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
     strategy = _op.OpStrategy()
     # For dynamic matrix-vector multiply we use a hand written kernel.
     if (
-        inputs[0].shape[0] == 1
-        and is_dynamic_shape(inputs[0].shape)
-        or is_dynamic_shape(inputs[1].shape)
+        isinstance(inputs[0].shape[0], (int, tir.IntImm))
+        and inputs[0].shape[0] == 1
+        and (is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape))
     ):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
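As a quick sanity check on what the handwritten kernel computes, the sketch below restates the same semantics in NumPy. It is not part of the patches and the names are purely illustrative; it only assumes that dense takes the weight in (N, K) layout, which is what the loop bounds b.shape[0] and b.shape[1] in gen_ir imply, and that the optional bias seeds the accumulator the way gen_ir_bias does.

    import numpy as np

    def dense_dynamic_ref(a, b, bias=None):
        # a: (1, K) row vector, b: (N, K) weight matrix, bias: optional (N,) vector.
        # Mirrors the TIR loops: C[0, j] = bias[j] + sum_k A[0, k] * B[j, k]
        assert a.shape[0] == 1, "only the matrix-vector case is handled"
        out = a @ b.T
        if bias is not None:
            out = out + bias
        return out

    a = np.random.rand(1, 8).astype("float32")
    b = np.random.rand(5, 8).astype("float32")
    bias = np.random.rand(5).astype("float32")
    print(dense_dynamic_ref(a, b, bias).shape)  # (1, 5)

Splitting the biased and unbiased paths into gen_ir and gen_ir_bias in patch 5 also keeps the te.extern input list matched to what each IR builder actually reads, instead of passing a possibly-None bias tensor as an extern input the way the first patch did.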
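The last two commits only touch the dispatch condition, and the grouping is the subtle part: in Python, a and b or c parses as (a and b) or c, so the patch 5/6 form could still pick the matrix-vector kernel when only the weight shape was dynamic and the LHS batch was not 1. The toy functions below (plain Python, no TVM; the names are made up for illustration) show the difference the explicit parenthesization in patch 7 makes; the added isinstance(inputs[0].shape[0], (int, tir.IntImm)) guard additionally ensures the == 1 comparison is only evaluated on a concrete dimension, not on a symbolic one.

    def dispatch_before_fix(batch_is_one, lhs_dynamic, rhs_dynamic):
        # Mirrors: inputs[0].shape[0] == 1 and is_dynamic_shape(lhs) or is_dynamic_shape(rhs)
        return batch_is_one and lhs_dynamic or rhs_dynamic

    def dispatch_after_fix(batch_is_one, lhs_dynamic, rhs_dynamic):
        # Mirrors: ... and inputs[0].shape[0] == 1 and (is_dynamic_shape(lhs) or is_dynamic_shape(rhs))
        return batch_is_one and (lhs_dynamic or rhs_dynamic)

    # Static LHS with batch != 1, dynamic weight matrix:
    print(dispatch_before_fix(False, False, True))  # True: the matvec kernel would be selected
    print(dispatch_after_fix(False, False, True))   # False: falls through to the static dense strategies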