31 changes: 31 additions & 0 deletions python/tvm/relax/backend/contrib/cutlass.py
@@ -232,6 +232,33 @@ def residual_block_patterns():
     return patterns
 
 
+def _check_stacked_attention(context: PatternCheckContext) -> bool:
+    """Check if the given stacked attention workload can be offloaded to CUTLASS."""
+    if _has_leaking_intermediate_variables(context):
+        return False
+    if not context.annotated_expr["stacked_qkv"].struct_info.ndim == 3:
+        return False
+    if "split" in context.annotated_expr:
+        split_op = context.annotated_expr["split"]
+        if not split_op.attrs.axis == 2:
+            return False
+    else:
+        last_end = 0
+        for name in ["query", "key", "value"]:
+            assert f"strided_slice_{name}" in context.annotated_expr
+            strided_slice_op = context.annotated_expr[f"strided_slice_{name}"]
+            if list(strided_slice_op.attrs.axes) != [2]:
+                return False
+            if list(strided_slice_op.attrs.begin) != [last_end]:
+                return False
+            if not len(strided_slice_op.attrs.end) == 1:
+                return False
+            last_end = strided_slice_op.attrs.end[0]
+            if list(strided_slice_op.attrs.strides) != [1]:
+                return False
+    return True
+
+
 def attention_patterns():
     """
     Returns a list of all attention patterns in cutlass BYOC backend.
@@ -248,18 +275,22 @@ def attention_patterns():
         (
             "cutlass.stacked_attention",
             *make_stacked_attention_pattern(start_op="split"),
+            _check_stacked_attention,
         ),
         (
             "cutlass.stacked_attention",
             *make_stacked_attention_pattern(start_op="split", with_bias=True),
+            _check_stacked_attention,
         ),
         (
             "cutlass.stacked_attention",
             *make_stacked_attention_pattern(start_op="strided_slice"),
+            _check_stacked_attention,
         ),
         (
             "cutlass.stacked_attention",
             *make_stacked_attention_pattern(start_op="strided_slice", with_bias=True),
+            _check_stacked_attention,
         ),
     ]

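For illustration, a minimal sketch (not part of this diff) of the two stacked-QKV decompositions that _check_stacked_attention accepts. Shapes and names are hypothetical, assuming Q, K, and V packed along axis 2 of a rank-3 [batch, seq_len, 3 * dim] tensor:

# Hypothetical workload: batch=4, seq_len=16, dim=64, so axis 2 holds 192
# channels. Either form below satisfies the checks above.
from tvm import relax
from tvm.relax.op import split, strided_slice

qkv = relax.Var("stacked_qkv", relax.TensorStructInfo((4, 16, 192), "float32"))

# Form 1: a single relax.split on axis 2 (attrs.axis must be 2).
qkv_tuple = split(qkv, indices_or_sections=3, axis=2)

# Form 2: three relax.strided_slice ops tiling axis 2 contiguously with
# stride 1 -- each slice's begin must equal the previous slice's end.
q = strided_slice(qkv, axes=[2], begin=[0], end=[64], strides=[1])
k = strided_slice(qkv, axes=[2], begin=[64], end=[128], strides=[1])
v = strided_slice(qkv, axes=[2], begin=[128], end=[192], strides=[1])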
10 changes: 6 additions & 4 deletions python/tvm/relax/backend/patterns.py
@@ -220,15 +220,16 @@ def make_stacked_attention_pattern(start_op: str, with_bias: bool = False):
     check function and codegen.
     """
     stacked_qkv = wildcard()
+    ops = {}
     if start_op == "split":
-        qkv_tuple = is_op("relax.split")(stacked_qkv)
+        ops["split"] = qkv_tuple = is_op("relax.split")(stacked_qkv)
         query_raw = is_tuple_get_item(qkv_tuple, 0)
         key_raw = is_tuple_get_item(qkv_tuple, 1)
         value_raw = is_tuple_get_item(qkv_tuple, 2)
     elif start_op == "strided_slice":
-        query_raw = is_op("relax.strided_slice")(stacked_qkv)
-        key_raw = is_op("relax.strided_slice")(stacked_qkv)
-        value_raw = is_op("relax.strided_slice")(stacked_qkv)
+        ops["strided_slice_query"] = query_raw = is_op("relax.strided_slice")(stacked_qkv)
+        ops["strided_slice_key"] = key_raw = is_op("relax.strided_slice")(stacked_qkv)
+        ops["strided_slice_value"] = value_raw = is_op("relax.strided_slice")(stacked_qkv)
     else:
         raise NotImplementedError()
     query_reshape_list = wildcard()
@@ -242,6 +243,7 @@ def make_stacked_attention_pattern(start_op: str, with_bias: bool = False):
"query_reshape_list": query_reshape_list,
"key_reshape_list": key_reshape_list,
"value_reshape_list": value_reshape_list,
**ops,
}
if with_bias:
bias = wildcard()
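As a hedged usage sketch (illustration only, assuming the (pattern, annotations) return pair unpacked by the *make_stacked_attention_pattern(...) calls above), the new ops entries are what _check_stacked_attention later reads back:

# Illustration: the annotation keys now include the sliced Q/K/V ops.
from tvm.relax.backend.patterns import make_stacked_attention_pattern

pattern, annotations = make_stacked_attention_pattern(start_op="strided_slice")
# annotations now also maps "strided_slice_query", "strided_slice_key" and
# "strided_slice_value" to their patterns; after a match, the same keys
# appear in PatternCheckContext.annotated_expr for the check to inspect.
print(sorted(annotations))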
2 changes: 1 addition & 1 deletion src/relax/transform/fuse_ops.cc
@@ -1045,7 +1045,7 @@ class PatternBasedPartitioner : ExprVisitor {

     Map<Var, Expr> matched_bindings;
     for (const auto& [pat, match] : matched_result) {
-      if (pat->IsInstance<CallPatternNode>()) {
+      if (pat->IsInstance<CallPatternNode>() || pat->IsInstance<TupleGetItemPatternNode>()) {
         matched_bindings.Set(value_to_bound_var_[match], match);
       }
     }
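This one-line change matters for the split form: each of Q/K/V is bound through a relax.TupleGetItem, and without recording TupleGetItemPatternNode matches those bindings would be dropped from the partitioned group. A sketch in the Relax pattern language (names are illustrative):

# The split-based pattern reaches Q/K/V via TupleGetItem, so the
# partitioner must record TupleGetItem matches alongside call matches.
from tvm.relax.dpl.pattern import is_op, is_tuple_get_item, wildcard

stacked_qkv = wildcard()
qkv_tuple = is_op("relax.split")(stacked_qkv)
query = is_tuple_get_item(qkv_tuple, 0)  # matched as a TupleGetItemPatternNode
key = is_tuple_get_item(qkv_tuple, 1)
value = is_tuple_get_item(qkv_tuple, 2)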
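Finally, a hedged end-to-end sketch of how the registered patterns are consumed, assuming the usual Relax BYOC flow; mod is a hypothetical IRModule containing a stacked-QKV attention workload:

# Partition matching subgraphs for the CUTLASS codegen. FuseOpsByPattern
# runs each pattern's check (here _check_stacked_attention) before
# committing a partition, then RunCodegen lowers the fused functions.
from tvm import relax
from tvm.relax.backend.pattern_registry import get_patterns_with_prefix

patterns = get_patterns_with_prefix("cutlass.stacked_attention")
mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
mod = relax.transform.RunCodegen()(mod)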