From f19795d2f723167bb6e7c6f0abc9c126ee6b9868 Mon Sep 17 00:00:00 2001 From: YJ Shi Date: Fri, 18 Nov 2022 17:45:35 -0800 Subject: [PATCH 1/2] patch to avoid seg fault with test --- .../space_generator/space_generator.cc | 3 +- .../test_meta_schedule_space_cpu_winograd.py | 103 ++++++++++++++++++ 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index bd124511b83c..32f90149683e 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -25,7 +25,8 @@ String GetRuleKindFromTarget(const Target& target) { if (target->kind->name == "llvm") { static const PackedFunc* f_check_vnni = runtime::Registry::Get("tvm.topi.x86.utils.target_has_vnni"); - ICHECK(*f_check_vnni != nullptr) << "The `target_has_vnni` func is not in tvm registry."; + ICHECK(f_check_vnni != nullptr && *f_check_vnni != nullptr) + << "The `target_has_vnni` func is not in tvm registry."; if (target->GetAttr("mcpu") && (*f_check_vnni)(target->GetAttr("mcpu").value())) { return "vnni"; diff --git a/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py b/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py index 78b75d592ed4..906240f1986a 100644 --- a/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py +++ b/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. """Tests for MetaSchedule search space on CPU""" +import tvm from tvm import meta_schedule as ms +from tvm.meta_schedule import TuneContext from tvm.meta_schedule.testing.space_generation import ( check_sketches, generate_design_space, @@ -164,5 +166,106 @@ def cpu_nhwc_0(X: T.Buffer[(1, 14, 14, 128), "float32"], W: T.Buffer[(6, 6, 128, ) +def test_cpu_target_has_vnni(): + # fmt: off + @tvm.script.ir_module + class Module: + @T.prim_func + def main(p0: T.Buffer[(32, 64, 56, 56), "float16"], p1: T.Buffer[(6, 6, 64, 64), "float16"], p2: T.Buffer[(1, 64, 1, 1), "float16"], T_relu: T.Buffer[(32, 64, 56, 56), "float16"]): + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]}) + # body + # with T.block("root") + data_pad = T.alloc_buffer([32, 64, 58, 58], dtype="float16") + input_tile = T.alloc_buffer([64, 6272, 6, 6], dtype="float16") + B = T.alloc_buffer([6, 6], dtype="float16") + data_pack = T.alloc_buffer([6, 6, 64, 6272], dtype="float16") + bgemm = T.alloc_buffer([6, 6, 64, 6272], dtype="float16") + A = T.alloc_buffer([6, 4], dtype="float16") + inverse = T.alloc_buffer([64, 6272, 4, 4], dtype="float16") + conv2d_winograd = T.alloc_buffer([32, 64, 56, 56], dtype="float16") + T_add = T.alloc_buffer([32, 64, 56, 56], dtype="float16") + for i0, i1, i2, i3 in T.grid(32, 64, 58, 58): + with T.block("data_pad"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1]) + T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) + data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i2_1 and i2_1 < 57 and 1 <= i3_1 and i3_1 < 57, p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1], T.float16(0), dtype="float16") + for i0, i1, i2, i3 in T.grid(64, 6272, 6, 6): + with T.block("input_tile"): + ci, p, eps, nu = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(data_pad[p // 196, ci, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu]) + T.writes(input_tile[ci, p, eps, nu]) + T.block_attr({"schedule_rule":"None"}) + input_tile[ci, p, eps, nu] = data_pad[p // 196, ci, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu] + for i0, i1 in T.grid(6, 6): + with T.block("B"): + i, j = T.axis.remap("SS", [i0, i1]) + T.reads() + T.writes(B[i, j]) + T.block_attr({"schedule_rule":"None"}) + B[i, j] = T.Select(i % 6 == 5 and j % 6 == 5, T.float16(1), T.Select(i % 6 == 5 and j % 6 == 4, T.float16(0), T.Select(i % 6 == 5 and j % 6 == 3, T.float16(0), T.Select(i % 6 == 5 and j % 6 == 2, T.float16(0), T.Select(i % 6 == 5 and j % 6 == 1, T.float16(0), T.Select(i % 6 == 5 and j % 6 == 0, T.float16(0), T.Select(i % 6 == 4 and j % 6 == 5, T.float16(1.5), T.Select(i % 6 == 4 and j % 6 == 4, T.float16(1), T.Select(i % 6 == 4 and j % 6 == 3, T.float16(1), T.Select(i % 6 == 4 and j % 6 == 2, T.float16(1), T.Select(i % 6 == 4 and j % 6 == 1, T.float16(1), T.Select(i % 6 == 4 and j % 6 == 0, T.float16(1), T.Select(i % 6 == 3 and j % 6 == 5, T.float16(-2), T.Select(i % 6 == 3 and j % 6 == 4, T.float16(-0.5), T.Select(i % 6 == 3 and j % 6 == 3, T.float16(2), T.Select(i % 6 == 3 and j % 6 == 2, T.float16(2.5), T.Select(i % 6 == 3 and j % 6 == 1, T.float16(0.5), T.Select(i % 6 == 3 and j % 6 == 0, T.float16(1.5), T.Select(i % 6 == 2 and j % 6 == 5, T.float16(-1.5), T.Select(i % 6 == 2 and j % 6 == 4, T.float16(-1), T.Select(i % 6 == 2 and j % 6 == 3, T.float16(-1), T.Select(i % 6 == 2 and j % 6 == 2, T.float16(0.5), T.Select(i % 6 == 2 and j % 6 == 1, T.float16(-2.5), T.Select(i % 6 == 2 and j % 6 == 0, T.float16(-2), T.Select(i % 6 == 1 and j % 6 == 5, T.float16(1), T.Select(i % 6 == 1 and j % 6 == 4, T.float16(0.5), T.Select(i % 6 == 1 and j % 6 == 3, T.float16(-2), T.Select(i % 6 == 1 and j % 6 == 2, T.float16(-1), T.Select(i % 6 == 1 and j % 6 == 1, T.float16(1), T.Select(i % 6 == 1 and j % 6 == 0, T.float16(-1.5), T.Select(i % 6 == 0 and j % 6 == 5, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 4, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 3, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 2, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 1, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 0, T.float16(1), T.float16(0))))))))))))))))))))))))))))))))))))) + for i0, i1, i2, i3, i4, i5 in T.grid(6, 6, 64, 6272, 6, 6): + with T.block("data_pack"): + eps, nu, ci, p, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) + T.reads(input_tile[ci, p, r_a, r_b], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) + T.writes(data_pack[eps, nu, ci, p]) + T.block_attr({"schedule_rule":"conv2d_nchw_winograd_data_pack"}) + with T.init(): + data_pack[eps, nu, ci, p] = T.float16(0) + data_pack[eps, nu, ci, p] = data_pack[eps, nu, ci, p] + input_tile[ci, p, r_a, r_b] * B[r_a, eps] * B[r_b, nu] + for i0, i1, i2, i3, i4 in T.grid(6, 6, 64, 6272, 64): + with T.block("bgemm"): + eps, nu, co, p, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) + T.reads(data_pack[eps, nu, ci, p], p1[eps, nu, ci, co]) + T.writes(bgemm[eps, nu, co, p]) + with T.init(): + bgemm[eps, nu, co, p] = T.float16(0) + bgemm[eps, nu, co, p] = bgemm[eps, nu, co, p] + data_pack[eps, nu, ci, p] * p1[eps, nu, ci, co] + for i0, i1 in T.grid(6, 4): + with T.block("A"): + i, j = T.axis.remap("SS", [i0, i1]) + T.reads() + T.writes(A[i, j]) + T.block_attr({"schedule_rule":"None"}) + A[i, j] = T.Select(i % 6 == 5 and j % 4 == 3, T.float16(1), T.Select(i % 6 == 5 and j % 4 == 2, T.float16(0), T.Select(i % 6 == 5 and j % 4 == 1, T.float16(0), T.Select(i % 6 == 5 and j % 4 == 0, T.float16(0), T.Select(i % 6 == 4 and j % 4 == 3, T.float16(-8), T.Select(i % 6 == 4 and j % 4 == 2, T.float16(4), T.Select(i % 6 == 4 and j % 4 == 1, T.float16(-2), T.Select(i % 6 == 4 and j % 4 == 0, T.float16(1), T.Select(i % 6 == 3 and j % 4 == 3, T.float16(0.125), T.Select(i % 6 == 3 and j % 4 == 2, T.float16(0.25), T.Select(i % 6 == 3 and j % 4 == 1, T.float16(0.5), T.Select(i % 6 == 3 and j % 4 == 0, T.float16(1), T.Select(i % 6 == 2 and j % 4 == 3, T.float16(1), T.Select(i % 6 == 2 and j % 4 == 2, T.float16(1), T.Select(i % 6 == 2 and j % 4 == 1, T.float16(1), T.Select(i % 6 == 2 and j % 4 == 0, T.float16(1), T.Select(i % 6 == 1 and j % 4 == 3, T.float16(-1), T.Select(i % 6 == 1 and j % 4 == 2, T.float16(1), T.Select(i % 6 == 1 and j % 4 == 1, T.float16(-1), T.Select(i % 6 == 1 and j % 4 == 0, T.float16(1), T.Select(i % 6 == 0 and j % 4 == 3, T.float16(0), T.Select(i % 6 == 0 and j % 4 == 2, T.float16(0), T.Select(i % 6 == 0 and j % 4 == 1, T.float16(0), T.Select(i % 6 == 0 and j % 4 == 0, T.float16(1), T.float16(0))))))))))))))))))))))))) + for i0, i1, i2, i3, i4, i5 in T.grid(64, 6272, 4, 4, 6, 6): + with T.block("inverse"): + co, p, vh, vw, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) + T.reads(bgemm[r_a, r_b, co, p], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) + T.writes(inverse[co, p, vh, vw]) + T.block_attr({"schedule_rule":"conv2d_nchw_winograd_inverse"}) + with T.init(): + inverse[co, p, vh, vw] = T.float16(0) + inverse[co, p, vh, vw] = inverse[co, p, vh, vw] + bgemm[r_a, r_b, co, p] * A[r_a, vh] * A[r_b, vw] + for i0, i1, i2, i3 in T.grid(32, 64, 56, 56): + with T.block("conv2d_winograd"): + n, co, h, w = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(inverse[co, n * 196 + h // 4 * 14 + w // 4, h % 4, w % 4]) + T.writes(conv2d_winograd[n, co, h, w]) + conv2d_winograd[n, co, h, w] = inverse[co, n * 196 + h // 4 * 14 + w // 4, h % 4, w % 4] + for i0, i1, i2, i3 in T.grid(32, 64, 56, 56): + with T.block("T_add"): + ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], p2[0, ax1, 0, 0]) + T.writes(T_add[ax0, ax1, ax2, ax3]) + T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + p2[0, ax1, 0, 0] + for i0, i1, i2, i3 in T.grid(32, 64, 56, 56): + with T.block("T_relu"): + ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(T_add[ax0, ax1, ax2, ax3]) + T.writes(T_relu[ax0, ax1, ax2, ax3]) + T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float16(0)) + + # fmt: on + target = Target("llvm --num-cores 8") + ctx = TuneContext( + mod=Module, + target=target, + space_generator="post-order-apply", + ).clone() + + if __name__ == "__main__": test_cpu_nhwc() + test_cpu_target_has_vnni() From 81ce3e36026d39d29c30c2f3ffc584caa0a9ec4c Mon Sep 17 00:00:00 2001 From: YJ Shi Date: Mon, 21 Nov 2022 13:54:01 -0800 Subject: [PATCH 2/2] address comments --- .../space_generator/space_generator.cc | 3 +- .../test_meta_schedule_space_cpu_winograd.py | 104 ------------------ 2 files changed, 1 insertion(+), 106 deletions(-) diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index 32f90149683e..cb89b3b817af 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -25,8 +25,7 @@ String GetRuleKindFromTarget(const Target& target) { if (target->kind->name == "llvm") { static const PackedFunc* f_check_vnni = runtime::Registry::Get("tvm.topi.x86.utils.target_has_vnni"); - ICHECK(f_check_vnni != nullptr && *f_check_vnni != nullptr) - << "The `target_has_vnni` func is not in tvm registry."; + ICHECK(f_check_vnni != nullptr) << "The `target_has_vnni` func is not in tvm registry."; if (target->GetAttr("mcpu") && (*f_check_vnni)(target->GetAttr("mcpu").value())) { return "vnni"; diff --git a/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py b/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py index 906240f1986a..135304286b4b 100644 --- a/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py +++ b/tests/python/unittest/test_meta_schedule_space_cpu_winograd.py @@ -15,13 +15,10 @@ # specific language governing permissions and limitations # under the License. """Tests for MetaSchedule search space on CPU""" -import tvm from tvm import meta_schedule as ms -from tvm.meta_schedule import TuneContext from tvm.meta_schedule.testing.space_generation import ( check_sketches, generate_design_space, - print_sketches, ) from tvm.meta_schedule.testing.te_workload import create_te_workload from tvm.script import tir as T @@ -166,106 +163,5 @@ def cpu_nhwc_0(X: T.Buffer[(1, 14, 14, 128), "float32"], W: T.Buffer[(6, 6, 128, ) -def test_cpu_target_has_vnni(): - # fmt: off - @tvm.script.ir_module - class Module: - @T.prim_func - def main(p0: T.Buffer[(32, 64, 56, 56), "float16"], p1: T.Buffer[(6, 6, 64, 64), "float16"], p2: T.Buffer[(1, 64, 1, 1), "float16"], T_relu: T.Buffer[(32, 64, 56, 56), "float16"]): - # function attr dict - T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_buffers": [1]}) - # body - # with T.block("root") - data_pad = T.alloc_buffer([32, 64, 58, 58], dtype="float16") - input_tile = T.alloc_buffer([64, 6272, 6, 6], dtype="float16") - B = T.alloc_buffer([6, 6], dtype="float16") - data_pack = T.alloc_buffer([6, 6, 64, 6272], dtype="float16") - bgemm = T.alloc_buffer([6, 6, 64, 6272], dtype="float16") - A = T.alloc_buffer([6, 4], dtype="float16") - inverse = T.alloc_buffer([64, 6272, 4, 4], dtype="float16") - conv2d_winograd = T.alloc_buffer([32, 64, 56, 56], dtype="float16") - T_add = T.alloc_buffer([32, 64, 56, 56], dtype="float16") - for i0, i1, i2, i3 in T.grid(32, 64, 58, 58): - with T.block("data_pad"): - i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) - T.reads(p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1]) - T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) - data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i2_1 and i2_1 < 57 and 1 <= i3_1 and i3_1 < 57, p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1], T.float16(0), dtype="float16") - for i0, i1, i2, i3 in T.grid(64, 6272, 6, 6): - with T.block("input_tile"): - ci, p, eps, nu = T.axis.remap("SSSS", [i0, i1, i2, i3]) - T.reads(data_pad[p // 196, ci, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu]) - T.writes(input_tile[ci, p, eps, nu]) - T.block_attr({"schedule_rule":"None"}) - input_tile[ci, p, eps, nu] = data_pad[p // 196, ci, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu] - for i0, i1 in T.grid(6, 6): - with T.block("B"): - i, j = T.axis.remap("SS", [i0, i1]) - T.reads() - T.writes(B[i, j]) - T.block_attr({"schedule_rule":"None"}) - B[i, j] = T.Select(i % 6 == 5 and j % 6 == 5, T.float16(1), T.Select(i % 6 == 5 and j % 6 == 4, T.float16(0), T.Select(i % 6 == 5 and j % 6 == 3, T.float16(0), T.Select(i % 6 == 5 and j % 6 == 2, T.float16(0), T.Select(i % 6 == 5 and j % 6 == 1, T.float16(0), T.Select(i % 6 == 5 and j % 6 == 0, T.float16(0), T.Select(i % 6 == 4 and j % 6 == 5, T.float16(1.5), T.Select(i % 6 == 4 and j % 6 == 4, T.float16(1), T.Select(i % 6 == 4 and j % 6 == 3, T.float16(1), T.Select(i % 6 == 4 and j % 6 == 2, T.float16(1), T.Select(i % 6 == 4 and j % 6 == 1, T.float16(1), T.Select(i % 6 == 4 and j % 6 == 0, T.float16(1), T.Select(i % 6 == 3 and j % 6 == 5, T.float16(-2), T.Select(i % 6 == 3 and j % 6 == 4, T.float16(-0.5), T.Select(i % 6 == 3 and j % 6 == 3, T.float16(2), T.Select(i % 6 == 3 and j % 6 == 2, T.float16(2.5), T.Select(i % 6 == 3 and j % 6 == 1, T.float16(0.5), T.Select(i % 6 == 3 and j % 6 == 0, T.float16(1.5), T.Select(i % 6 == 2 and j % 6 == 5, T.float16(-1.5), T.Select(i % 6 == 2 and j % 6 == 4, T.float16(-1), T.Select(i % 6 == 2 and j % 6 == 3, T.float16(-1), T.Select(i % 6 == 2 and j % 6 == 2, T.float16(0.5), T.Select(i % 6 == 2 and j % 6 == 1, T.float16(-2.5), T.Select(i % 6 == 2 and j % 6 == 0, T.float16(-2), T.Select(i % 6 == 1 and j % 6 == 5, T.float16(1), T.Select(i % 6 == 1 and j % 6 == 4, T.float16(0.5), T.Select(i % 6 == 1 and j % 6 == 3, T.float16(-2), T.Select(i % 6 == 1 and j % 6 == 2, T.float16(-1), T.Select(i % 6 == 1 and j % 6 == 1, T.float16(1), T.Select(i % 6 == 1 and j % 6 == 0, T.float16(-1.5), T.Select(i % 6 == 0 and j % 6 == 5, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 4, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 3, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 2, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 1, T.float16(0), T.Select(i % 6 == 0 and j % 6 == 0, T.float16(1), T.float16(0))))))))))))))))))))))))))))))))))))) - for i0, i1, i2, i3, i4, i5 in T.grid(6, 6, 64, 6272, 6, 6): - with T.block("data_pack"): - eps, nu, ci, p, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) - T.reads(input_tile[ci, p, r_a, r_b], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) - T.writes(data_pack[eps, nu, ci, p]) - T.block_attr({"schedule_rule":"conv2d_nchw_winograd_data_pack"}) - with T.init(): - data_pack[eps, nu, ci, p] = T.float16(0) - data_pack[eps, nu, ci, p] = data_pack[eps, nu, ci, p] + input_tile[ci, p, r_a, r_b] * B[r_a, eps] * B[r_b, nu] - for i0, i1, i2, i3, i4 in T.grid(6, 6, 64, 6272, 64): - with T.block("bgemm"): - eps, nu, co, p, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) - T.reads(data_pack[eps, nu, ci, p], p1[eps, nu, ci, co]) - T.writes(bgemm[eps, nu, co, p]) - with T.init(): - bgemm[eps, nu, co, p] = T.float16(0) - bgemm[eps, nu, co, p] = bgemm[eps, nu, co, p] + data_pack[eps, nu, ci, p] * p1[eps, nu, ci, co] - for i0, i1 in T.grid(6, 4): - with T.block("A"): - i, j = T.axis.remap("SS", [i0, i1]) - T.reads() - T.writes(A[i, j]) - T.block_attr({"schedule_rule":"None"}) - A[i, j] = T.Select(i % 6 == 5 and j % 4 == 3, T.float16(1), T.Select(i % 6 == 5 and j % 4 == 2, T.float16(0), T.Select(i % 6 == 5 and j % 4 == 1, T.float16(0), T.Select(i % 6 == 5 and j % 4 == 0, T.float16(0), T.Select(i % 6 == 4 and j % 4 == 3, T.float16(-8), T.Select(i % 6 == 4 and j % 4 == 2, T.float16(4), T.Select(i % 6 == 4 and j % 4 == 1, T.float16(-2), T.Select(i % 6 == 4 and j % 4 == 0, T.float16(1), T.Select(i % 6 == 3 and j % 4 == 3, T.float16(0.125), T.Select(i % 6 == 3 and j % 4 == 2, T.float16(0.25), T.Select(i % 6 == 3 and j % 4 == 1, T.float16(0.5), T.Select(i % 6 == 3 and j % 4 == 0, T.float16(1), T.Select(i % 6 == 2 and j % 4 == 3, T.float16(1), T.Select(i % 6 == 2 and j % 4 == 2, T.float16(1), T.Select(i % 6 == 2 and j % 4 == 1, T.float16(1), T.Select(i % 6 == 2 and j % 4 == 0, T.float16(1), T.Select(i % 6 == 1 and j % 4 == 3, T.float16(-1), T.Select(i % 6 == 1 and j % 4 == 2, T.float16(1), T.Select(i % 6 == 1 and j % 4 == 1, T.float16(-1), T.Select(i % 6 == 1 and j % 4 == 0, T.float16(1), T.Select(i % 6 == 0 and j % 4 == 3, T.float16(0), T.Select(i % 6 == 0 and j % 4 == 2, T.float16(0), T.Select(i % 6 == 0 and j % 4 == 1, T.float16(0), T.Select(i % 6 == 0 and j % 4 == 0, T.float16(1), T.float16(0))))))))))))))))))))))))) - for i0, i1, i2, i3, i4, i5 in T.grid(64, 6272, 4, 4, 6, 6): - with T.block("inverse"): - co, p, vh, vw, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) - T.reads(bgemm[r_a, r_b, co, p], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) - T.writes(inverse[co, p, vh, vw]) - T.block_attr({"schedule_rule":"conv2d_nchw_winograd_inverse"}) - with T.init(): - inverse[co, p, vh, vw] = T.float16(0) - inverse[co, p, vh, vw] = inverse[co, p, vh, vw] + bgemm[r_a, r_b, co, p] * A[r_a, vh] * A[r_b, vw] - for i0, i1, i2, i3 in T.grid(32, 64, 56, 56): - with T.block("conv2d_winograd"): - n, co, h, w = T.axis.remap("SSSS", [i0, i1, i2, i3]) - T.reads(inverse[co, n * 196 + h // 4 * 14 + w // 4, h % 4, w % 4]) - T.writes(conv2d_winograd[n, co, h, w]) - conv2d_winograd[n, co, h, w] = inverse[co, n * 196 + h // 4 * 14 + w // 4, h % 4, w % 4] - for i0, i1, i2, i3 in T.grid(32, 64, 56, 56): - with T.block("T_add"): - ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) - T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], p2[0, ax1, 0, 0]) - T.writes(T_add[ax0, ax1, ax2, ax3]) - T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + p2[0, ax1, 0, 0] - for i0, i1, i2, i3 in T.grid(32, 64, 56, 56): - with T.block("T_relu"): - ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) - T.reads(T_add[ax0, ax1, ax2, ax3]) - T.writes(T_relu[ax0, ax1, ax2, ax3]) - T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float16(0)) - - # fmt: on - target = Target("llvm --num-cores 8") - ctx = TuneContext( - mod=Module, - target=target, - space_generator="post-order-apply", - ).clone() - - if __name__ == "__main__": test_cpu_nhwc() - test_cpu_target_has_vnni()