19 changes: 19 additions & 0 deletions python/tvm/relay/op/strategy/x86.py
@@ -507,10 +507,29 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target):
    return strategy


def is_dynamic_shape(shape):
    return any([isinstance(x, (tir.Any, tir.SizeVar)) for x in shape])


@dense_strategy.register("cpu")
def dense_strategy_cpu(attrs, inputs, out_type, target):
    """dense x86 strategy"""

    strategy = _op.OpStrategy()
    # For dynamic matrix-vector multiply we use a hand written kernel.
    if (
        isinstance(inputs[0].shape[0], (int, tir.IntImm))
        and inputs[0].shape[0] == 1
        and (is_dynamic_shape(inputs[0].shape) or is_dynamic_shape(inputs[1].shape))
    ):
        strategy.add_implementation(
            wrap_compute_dense(topi.x86.dense_dynamic),
            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
            name="dense_dynamic.x86",
            plevel=20,
        )
        return strategy

    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
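Note (not part of the diff): the new branch fires only when the LHS has a static single row (inputs[0].shape[0] is an int or tir.IntImm equal to 1) and at least one operand shape contains a dynamic dimension (tir.Any or tir.SizeVar); in that case only the dynamic implementation is registered and the strategy returns early. Below is a minimal sketch of a Relay graph that should take this path on a CPU target, assuming a TVM build with the Relay VM; the shapes and the 64-row weight are illustrative, not taken from the PR.

import numpy as np
import tvm
from tvm import relay

# Single-row LHS and a weight whose output dimension is dynamic (relay.Any()),
# so dense_strategy_cpu should select dense_dynamic.x86.
x = relay.var("x", shape=(1, 128), dtype="float32")
w = relay.var("w", shape=(relay.Any(), 128), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x, w], relay.nn.dense(x, w)))

exe = relay.vm.compile(mod, target="llvm")           # the VM handles the dynamic shape
vm = tvm.runtime.vm.VirtualMachine(exe, tvm.cpu())
out = vm.invoke(
    "main",
    np.random.rand(1, 128).astype("float32"),
    np.random.rand(64, 128).astype("float32"),       # concrete shape supplied at run time
)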
60 changes: 60 additions & 0 deletions python/tvm/topi/x86/dense.py
@@ -480,3 +480,63 @@ def matmul_dnnl(
def schedule_matmul_dnnl(_, outs):
    """Create schedule for matmul_dnnl."""
    return generic.schedule_extern(outs)


def dense_dynamic(A, B, bias, dtype):
    """Compute for dense with dynamic shape"""

    assert A.shape[0] == 1, "Only dynamic matrix vector multiplication with vector LHS is supported"

    # Right now we only support matrix-vector multiplication with lhs as the
    # vector. We don't need to do much optimization here because the access
    # pattern and parallelization are straightforward.
    def gen_ir(a, b, c):
        ib = tvm.tir.ir_builder.create()
        A = ib.buffer_ptr(a)
        B = ib.buffer_ptr(b)
        C = ib.buffer_ptr(c)
        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
            C[0, j] = 0.0
            with ib.for_range(0, b.shape[1], name="k") as k:
                C[0, j] += A[0, k] * B[j, k]
        return ib.get()

    def gen_ir_bias(a, b, bias, c):
        ib = tvm.tir.ir_builder.create()
        A = ib.buffer_ptr(a)
        B = ib.buffer_ptr(b)
        C = ib.buffer_ptr(c)
        with ib.for_range(0, b.shape[0], name="j", kind="parallel") as j:
            C[0, j] = bias[j]
            with ib.for_range(0, b.shape[1], name="k") as k:
                C[0, j] += A[0, k] * B[j, k]
        return ib.get()

    out_shape = (A.shape[0], B.shape[0])
    out_buf = tvm.tir.decl_buffer(out_shape, dtype, "out_buf")
    if bias is None:
        out = te.extern(
            [out_shape],
            [A, B],
            lambda ins, outs: gen_ir(*ins, *outs),
            dtype=dtype,
            out_buffers=[out_buf],
            name="dense_dynamic_cpu",
            tag="dense_dynamic_cpu",
        )
    else:
        out = te.extern(
            [out_shape],
            [A, B, bias],
            lambda ins, outs: gen_ir_bias(*ins, *outs),
            dtype=dtype,
            out_buffers=[out_buf],
            name="dense_dynamic_cpu",
            tag="dense_dynamic_cpu",
        )
    return out


def schedule_dense_dynamic(outs):
    """Create schedule for dense_dynamic."""
    return generic.schedule_extern(outs)
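Note (not part of the diff): the ir_builder kernel parallelizes over the output features (rows of B), seeding C[0, j] with zero or bias[j] and accumulating the dot product over k, i.e. out = A @ B.T (+ bias). A hedged standalone sketch of driving the new compute/schedule pair directly, mirroring what the updated topi test does; the static shapes below are illustrative only.

import numpy as np
import tvm
from tvm import te, topi

A = te.placeholder((1, 128), name="A", dtype="float32")
B = te.placeholder((64, 128), name="B", dtype="float32")

with tvm.target.Target("llvm"):
    D = topi.x86.dense_dynamic(A, B, None, "float32")   # no bias
    s = topi.x86.schedule_dense_dynamic([D])
func = tvm.build(s, [A, B, D], "llvm")

dev = tvm.cpu()
a = tvm.nd.array(np.random.rand(1, 128).astype("float32"), dev)
b = tvm.nd.array(np.random.rand(64, 128).astype("float32"), dev)
d = tvm.nd.array(np.zeros((1, 64), dtype="float32"), dev)
func(a, b, d)
np.testing.assert_allclose(d.numpy(), a.numpy() @ b.numpy().T, rtol=1e-5)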
3 changes: 3 additions & 0 deletions tests/python/topi/python/test_topi_dense.py
@@ -45,6 +45,7 @@
"cpu": [
(topi.x86.dense_nopack, topi.x86.schedule_dense_nopack),
(topi.x86.dense_pack, topi.x86.schedule_dense_pack),
(topi.x86.dense_dynamic, topi.x86.schedule_dense_dynamic),
],
"gpu": [
(topi.gpu.dense_small_batch, topi.gpu.schedule_dense_small_batch),
@@ -136,6 +137,8 @@ def test_dense(
    implementations = tvm.topi.testing.dispatch(target, _dense_implementations)

    for fcompute, fschedule in implementations:
        if fcompute == topi.x86.dense_dynamic and (batch_size != 1 or in_dtype != "float32"):
            continue
        with tvm.target.Target(target):
            D = fcompute(A, B, C if use_bias else None, out_dtype)
            D = topi.nn.relu(D)
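Note (not part of the diff): the test reuses the existing dense harness, so dense_dynamic is only exercised for the configurations it supports (batch_size == 1 and float32); other parameter combinations skip it and fall through to the remaining implementations. The numeric behaviour the harness checks against is plain dense semantics; a small numpy reference sketch follows (names are illustrative, not from the test file).

import numpy as np

def dense_ref(a, b, bias=None):
    # a: (1, in_dim) row vector, b: (out_dim, in_dim) weight matrix
    out = a @ b.T
    if bias is not None:
        out = out + bias          # bias broadcast over the single output row
    return out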