From 474840d885fc78298884081d4e47125ae871a1a7 Mon Sep 17 00:00:00 2001
From: Chris Sullivan <csullivan@octoml.ai>
Date: Tue, 7 Sep 2021 16:04:29 -0700
Subject: [PATCH 1/2] Add hexagon contrib tests for blocked conv2d and
 maxpool2d

---
 tests/python/contrib/test_hexagon/__init__.py |  18 +
 tests/python/contrib/test_hexagon/conftest.py |  46 ++
 .../contrib/test_hexagon/infrastructure.py    |  57 +++
 .../test_hexagon/test_conv2d_blocked.py       | 474 ++++++++++++++++++
 .../test_hexagon/test_maxpool2d_blocked.py    | 151 ++++++
 5 files changed, 746 insertions(+)
 create mode 100644 tests/python/contrib/test_hexagon/__init__.py
 create mode 100644 tests/python/contrib/test_hexagon/conftest.py
 create mode 100644 tests/python/contrib/test_hexagon/infrastructure.py
 create mode 100644 tests/python/contrib/test_hexagon/test_conv2d_blocked.py
 create mode 100644 tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py

diff --git a/tests/python/contrib/test_hexagon/__init__.py b/tests/python/contrib/test_hexagon/__init__.py
new file mode 100644
index 000000000000..58dc4cc1e03d
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/__init__.py
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Testing infrastructure for Hexagon """
diff --git a/tests/python/contrib/test_hexagon/conftest.py b/tests/python/contrib/test_hexagon/conftest.py
new file mode 100644
index 000000000000..8213c33123d8
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/conftest.py
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Hexagon testing fixtures used to deduce testing argument
+    values from testing parameters """
+
+import tvm
+from .infrastructure import ceildiv
+
+
+@tvm.testing.fixture
+def shape_nhwc(batch, in_channel, in_size):
+    return (batch, in_size, in_size, in_channel)
+
+
+@tvm.testing.fixture
+def shape_oihw(out_channel, in_channel, kernel):
+    return (out_channel, in_channel, kernel, kernel)
+
+
+@tvm.testing.fixture
+def shape_oihw8i32o4i(out_channel, in_channel, kernel):
+    out_factor, in_factor, in_second_factor = 32, 32, 4
+    return (
+        int(ceildiv(out_channel, out_factor)),
+        int(ceildiv(in_channel, in_factor)),
+        kernel,
+        kernel,
+        in_factor // in_second_factor,
+        out_factor,
+        in_second_factor,
+    )
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
new file mode 100644
index 000000000000..9c7ba5b1b6a3
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Hexagon testing infrastructure """
+
+import tvm
+import numpy
+
+
+def ceildiv(o, d):
+    return tvm.tir.floordiv(o + d - 1, d)
+
+
+def get_packed_layout(logical_shape_nhwc, block_shape, packed_C=True):
+    shape = [logical_shape_nhwc[0]]
+    off_h, off_w, off_c = block_shape
+    shape.append(ceildiv(logical_shape_nhwc[1], off_h))
+    shape.append(ceildiv(logical_shape_nhwc[2], off_w))
+    if packed_C:
+        shape.append(ceildiv(logical_shape_nhwc[3], off_c))
+        shape.extend(block_shape)
+    else:
+        shape.extend([off_h, off_w, logical_shape_nhwc[-1]])
+    return shape
+
+
+def build_and_run(inputs, func, target, target_host, *args, **kwargs):
+    s, placeholders, binds = func(*args, **kwargs)
+
+    func = tvm.build(s, placeholders, target=target, target_host=target_host, binds=binds)
+    dev = tvm.device(target)
+    tensors = []
+    for tensor in inputs:
+        tensors.append(tvm.nd.array(tensor, dev))
+    tensors.append(
+        tvm.nd.array(
+            numpy.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype),
+            dev,
+        )
+    )
+    func(*tensors)
+
+    return tensors[-1].asnumpy()
diff --git a/tests/python/contrib/test_hexagon/test_conv2d_blocked.py b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py
new file mode 100644
index 000000000000..a5a766b5b5df
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py
@@ -0,0 +1,474 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+import tvm
+from tvm import te
+from tvm import topi
+from tvm.topi import testing
+from .infrastructure import ceildiv
+from .infrastructure import get_packed_layout
+from .infrastructure import build_and_run
+
+import numpy as np
+import pytest
+
+
+def conv2d_logical(
+    shape_nhwc,
+    shape_kcrs,
+    kernel_size,
+    stride,
+    padding,
+    dtype,
+    storage_scope="global",
+):
+    """
+    Conv2d TE wherein both input activation and filter tensors
+    are defined with their logical NHWC/OIHW shapes, respectively.
+    The packed physical layout for the activation and filter are:
+      Activation: nhwc8h8w32c
+      Filter: oihw8i32o4i
+    """
+    assert kernel_size == tuple(shape_kcrs[2:])
+
+    block_shape = 8, 8, 32
+    block_H, block_W, block_C = block_shape
+    shape = get_packed_layout(shape_nhwc, block_shape)
+    logical_output_shape = (
+        shape_nhwc[0],
+        (shape_nhwc[1] - kernel_size[0] + padding[0] + padding[1]) // stride[0] + 1,
+        (shape_nhwc[2] - kernel_size[1] + padding[2] + padding[3]) // stride[0] + 1,
+        shape_nhwc[3],
+    )
+    output_shape = get_packed_layout(logical_output_shape, block_shape)
+
+    N, H, W, C = shape_nhwc
+    X = te.placeholder(shape_nhwc, dtype=dtype)
+    # Combination of padding required by conv2d operator and padding to evenly divisible
+    # number of blocks. Note that this padding should be inlined in the schedule so
+    # as to avoid input copying.
+    pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H
+    pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W
+    X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0)
+    # Calculate packed layout
+    X_packed = te.compute(
+        shape,
+        lambda n, ho, wo, co, hi, wi, ci: X_pad[
+            n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci
+        ],
+    )
+
+    # Filter shape using KCRS (OIHW) notation
+    K, C, R, S = shape_kcrs
+    filter_Ki, filter_Ci, filter_Cii = 32, 32, 4
+    shape_filter = [
+        ceildiv(K, filter_Ki),
+        ceildiv(C, filter_Ci),
+        R,
+        S,
+        filter_Ci // filter_Cii,
+        filter_Ki,
+        filter_Cii,
+    ]
+    filt = te.placeholder(shape_kcrs, dtype=dtype)
+    # Channel padding to multiples of 32
+    pad_c = (filter_Ci - (C % filter_Ci)) % filter_Ci
+    pad_k = (filter_Ki - (K % filter_Ki)) % filter_Ki
+    filt_pad = topi.nn.pad(
+        filt, [0, 0, 0, 0], [pad_k, pad_c, R, S], pad_value=0, name="padded_filter"
+    )
+    filt_packed = te.compute(
+        shape_filter,
+        lambda ko, co, r, s, cio, ki, cii: filt_pad[
+            ko * filter_Ki + ki, co * filter_Ci + cio * filter_Cii + cii, r, s
+        ],
+        name="packed_filter",
+    )
+
+    rh = te.reduce_axis((0, kernel_size[0]), name="rh")
+    rw = te.reduce_axis((0, kernel_size[1]), name="rw")
+    rc = te.reduce_axis((0, C), name="rc")
+
+    def compute(n, ho, wo, ko, hi, wi, ki):
+        # Construct blockized strided conv2d height index
+        h = ho * block_H + hi
+        h_contig = h * stride[0] + rh
+        h_block_id = h_contig // block_H
+        h_block_offset = h_contig % block_H
+
+        # Construct blockized strided conv2d width index
+        w = wo * block_W + wi
+        w_contig = w * stride[1] + rw
+        w_block_id = w_contig // block_W
+        w_block_offset = w_contig % block_W
+
+        # Construct blockized conv2d channel index
+        c_block_id = rc // block_C
+        c_block_offset = rc % block_C
+
+        # Construct flat filter input channel indices
+        rco = rc // filter_Ci
+        rcio = (rc % filter_Ci) // filter_Cii
+        rcii = rc % filter_Cii
+
+        return te.sum(
+            X_packed[
+                n,
+                h_block_id,
+                w_block_id,
+                c_block_id,
+                h_block_offset,
+                w_block_offset,
+                c_block_offset,
+            ]
+            * filt_packed[ko, rco, rh, rw, rcio, ki, rcii],
+            axis=[rh, rw, rc],
+        )
+
+    Y = te.compute(output_shape, compute)
+    s = te.create_schedule(Y.op)
+
+    # Ensure the padding and array packing is performed inline
+    s[X_pad].compute_inline()
+    s[X_packed].compute_inline()
+
+    s[filt_pad].compute_inline()
+    s[filt_packed].compute_inline()
+
+    binds = {}
+    if storage_scope and storage_scope != "global":
+        with tvm.transform.PassContext():
+            Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope)
+            Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope)
+            binds = {X: Xb, Y: Yb}
+
+    return (s, [X, filt, Y], binds)
+
+
+def conv2d_packed_filter(
+    shape_nhwc,
+    shape_filter,
+    kernel_size,
+    stride,
+    padding,
+    dtype,
+    storage_scope="global",
+):
+    """
+    Conv2d TE wherein the input activation is defined by its
+    logical NHWC shape, but the filter is provided in the
+    packed layout oihw8i32o4i. The physical packed layout used
+    for the activation is: nhwc8h8w32c
+    """
+    assert kernel_size == tuple(shape_filter[2:4])
+
+    block_shape = 8, 8, 32
+    block_H, block_W, block_C = block_shape
+    shape = get_packed_layout(shape_nhwc, block_shape)
+    logical_output_shape = (
+        shape_nhwc[0],
+        (shape_nhwc[1] - kernel_size[0] + padding[0] + padding[1]) // stride[0] + 1,
+        (shape_nhwc[2] - kernel_size[1] + padding[2] + padding[3]) // stride[0] + 1,
+        shape_nhwc[3],
+    )
+    output_shape = get_packed_layout(logical_output_shape, block_shape)
+
+    N, H, W, C = shape_nhwc
+    X = te.placeholder(shape_nhwc, dtype=dtype)
+    # Combination of padding required by conv2d operator and padding to evenly divisible
+    # number of blocks. Note that this padding should be inlined in the schedule so
+    # as to avoid input copying.
+    pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H
+    pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W
+
+    X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0)
+    # Calculate packed layout
+    packed_shape = get_packed_layout(X_pad.shape, block_shape)
+
+    X_packed = te.compute(
+        packed_shape,
+        lambda n, ho, wo, co, hi, wi, ci: X_pad[
+            n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci
+        ],
+    )
+
+    # Filter shape using KCRS (OIHW) notation
+    filter_Ki, filter_Ci, filter_Cii = 32, 32, 4
+    assert shape_filter[-1] == filter_Cii
+    assert shape_filter[-2] == filter_Ki
+    assert shape_filter[-3] == filter_Ci // filter_Cii
+
+    filt_packed = te.placeholder(shape_filter, dtype=dtype)
+
+    rh = te.reduce_axis((0, kernel_size[0]), name="rh")
+    rw = te.reduce_axis((0, kernel_size[1]), name="rw")
+    rc = te.reduce_axis((0, C), name="rc")
+
+    def compute(n, ho, wo, ko, hi, wi, ki):
+        # Construct blockized strided conv2d height index
+        h = ho * block_H + hi
+        h_contig = h * stride[0] + rh
+        h_block_id = h_contig // block_H
+        h_block_offset = h_contig % block_H
+
+        # Construct blockized strided conv2d width index
+        w = wo * block_W + wi
+        w_contig = w * stride[1] + rw
+        w_block_id = w_contig // block_W
+        w_block_offset = w_contig % block_W
+
+        # Construct blockized conv2d channel index
+        c_block_id = rc // block_C
+        c_block_offset = rc % block_C
+
+        # Construct flat filter input channel indices
+        rco = rc // filter_Ci
+        rcio = (rc % filter_Ci) // filter_Cii
+        rcii = rc % filter_Cii
+
+        return te.sum(
+            X_packed[
+                n,
+                h_block_id,
+                w_block_id,
+                c_block_id,
+                h_block_offset,
+                w_block_offset,
+                c_block_offset,
+            ]
+            * filt_packed[ko, rco, rh, rw, rcio, ki, rcii],
+            axis=[rh, rw, rc],
+        )
+
+    Y = te.compute(output_shape, compute)
+    s = te.create_schedule(Y.op)
+
+    # Ensure the padding and array packing is performed inline
+    s[X_pad].compute_inline()
+    s[X_packed].compute_inline()
+
+    # Perform scheduling
+    n, hid, wid, cid, hoff, woff, coff = s[Y].op.axis
+    slice = s[Y].fuse(wid, cid)
+    Xl = s.cache_read(X_packed, storage_scope, [Y])
+    Yl = s.cache_write(Y, storage_scope)
+
+    s[Yl].compute_at(s[Y], hid)
+    n, hid, slice, hoff, woff, coff = s[Yl].op.axis
+    s[Xl].compute_at(s[Yl], slice)
+
+    binds = {}
+    if storage_scope and storage_scope != "global":
+        with tvm.transform.PassContext():
+            Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope)
+            Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope)
+            binds = {X: Xb, Y: Yb}
+
+    return (s, [X, filt_packed, Y], binds)
+
+
+def conv2d_packed_filter_nhwhwc(
+    shape_nhwc,
+    shape_filter,
+    kernel_size,
+    stride,
+    padding,
+    dtype,
+    storage_scope="global",
+):
+    """
+    Conv2d TE wherein the input activation is defined by its
+    logical NHWC shape, but the filter is provided in the
+    packed layout oihw8i32o4i. The physical packed layout used
+    for the activation is: nhw8h8wc
+
+    """
+    assert kernel_size == tuple(shape_filter[2:4])
+
+    block_shape = 8, 8, 32
+    block_H, block_W, _ = block_shape
+    shape = get_packed_layout(shape_nhwc, block_shape, packed_C=False)
+    logical_output_shape = (
+        shape_nhwc[0],
+        (shape_nhwc[1] - kernel_size[0] + padding[0] + padding[1]) // stride[0] + 1,
+        (shape_nhwc[2] - kernel_size[1] + padding[2] + padding[3]) // stride[0] + 1,
+        shape_nhwc[3],
+    )
+    output_shape = get_packed_layout(logical_output_shape, block_shape, packed_C=False)
+
+    N, H, W, C = shape_nhwc
+    X = te.placeholder(shape_nhwc, dtype=dtype)
+    # Combination of padding required by conv2d operator and padding to evenly divisible
+    # number of blocks. Note that this padding should be inlined in the schedule so
+    # as to avoid input copying.
+    pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H
+    pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W
+    X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0)
+    # Calculate packed layout
+    packed_shape = get_packed_layout(X_pad.shape, block_shape, packed_C=False)
+    X_packed = te.compute(
+        packed_shape, lambda n, ho, wo, hi, wi, c: X_pad[n, ho * block_H + hi, wo * block_W + wi, c]
+    )
+
+    # Filter shape using KCRS (OIHW) notation
+    filter_Ki, filter_Ci, filter_Cii = 32, 32, 4
+    assert shape_filter[-1] == filter_Cii
+    assert shape_filter[-2] == filter_Ki
+    assert shape_filter[-3] == filter_Ci // filter_Cii
+
+    filt_packed = te.placeholder(shape_filter, dtype=dtype)
+
+    rh = te.reduce_axis((0, kernel_size[0]), name="rh")
+    rw = te.reduce_axis((0, kernel_size[1]), name="rw")
+    rc = te.reduce_axis((0, C), name="rc")
+
+    def compute(n, ho, wo, hi, wi, k):
+        # Construct blockized strided conv2d height index
+        h = ho * block_H + hi
+        h_contig = h * stride[0] + rh
+        h_block_id = h_contig // block_H
+        h_block_offset = h_contig % block_H
+
+        # Construct blockized strided conv2d width index
+        w = wo * block_W + wi
+        w_contig = w * stride[1] + rw
+        w_block_id = w_contig // block_W
+        w_block_offset = w_contig % block_W
+
+        # Construct flat filter input channel indices
+        rco = rc // filter_Ci
+        rcio = (rc % filter_Ci) // filter_Cii
+        rcii = rc % filter_Cii
+
+        # Construct split filter output channel index
+        ko = k // filter_Ki
+        ki = k % filter_Ki
+
+        return te.sum(
+            X_packed[n, h_block_id, w_block_id, h_block_offset, w_block_offset, rc]
+            * filt_packed[ko, rco, rh, rw, rcio, ki, rcii],
+            axis=[rh, rw, rc],
+        )
+
+    Y = te.compute(output_shape, compute)
+    s = te.create_schedule(Y.op)
+
+    # Ensure the padding and array packing is performed inline
+    s[X_pad].compute_inline()
+    s[X_packed].compute_inline()
+
+    n, ho, wo, hi, wi, k = s[Y].op.axis
+    rh, rw, rc = s[Y].op.reduce_axis
+
+    rco, rci = s[Y].split(rc, factor=32)
+    s[Y].reorder(n, rco, wo, ho, k, hi, wi)
+    Xl = s.cache_read(X_packed, storage_scope, [Y])
+    s[Xl].compute_at(s[Y], rco)
+
+    ko, ki = s[Y].split(k, factor=32)
+    s[Y].reorder(n, rco, wo, ho, ko, hi, wi, ki)
+    Fl = s.cache_read(filt_packed, storage_scope, [Y])
+    s[Fl].compute_at(s[Y], ko)
+
+    binds = {}
+    if storage_scope and storage_scope != "global":
+        with tvm.transform.PassContext():
+            Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope)
+            Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope)
+            binds = {X: Xb, Y: Yb}
+
+    return (s, [X, filt_packed, Y], binds)
+
+
+class BaseConv2d:
+    batch = tvm.testing.parameter(1)
+    in_size = tvm.testing.parameter(8, 56)
+    in_channel = tvm.testing.parameter(64)
+    out_channel = tvm.testing.parameter(64)
+    kernel = tvm.testing.parameter(3)
+    stride = tvm.testing.parameter(1)
+    pad = tvm.testing.parameter(1)
+    dtype = tvm.testing.parameter("float32")
+
+
+class TestConv2dLogical(BaseConv2d):
+    @tvm.testing.parametrize_targets("llvm")
+    def test_conv2d(self, shape_nhwc, shape_oihw, kernel, stride, pad, dtype, target):
+        inputs = [
+            np.random.uniform(0, 255, size=shape_nhwc).astype(dtype),
+            np.random.uniform(0, 255, size=shape_oihw).astype(dtype),
+        ]
+        np_filter = inputs[1].transpose(2, 3, 1, 0)
+        ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad)
+        output = build_and_run(
+            inputs,
+            conv2d_logical,
+            target,
+            target,
+            shape_nhwc=shape_nhwc,
+            shape_kcrs=shape_oihw,
+            kernel_size=(kernel, kernel),
+            stride=(stride, stride),
+            padding=(pad, pad, pad, pad),
+            dtype=dtype,
+        )
+        return output, ref_output
+
+
+class TestConv2dPackedFilter(BaseConv2d):
+    conv2d_impl = tvm.testing.parameter(conv2d_packed_filter, conv2d_packed_filter_nhwhwc)
+
+    @tvm.testing.parametrize_targets("llvm")
+    def test_conv2d(
+        self,
+        conv2d_impl,
+        shape_nhwc,
+        shape_oihw,
+        shape_oihw8i32o4i,
+        kernel,
+        stride,
+        pad,
+        dtype,
+        target,
+    ):
+        inputs = [
+            np.random.uniform(0, 255, size=shape_nhwc).astype(dtype),
+            np.random.uniform(0, 255, size=shape_oihw8i32o4i).astype(dtype),
+        ]
+        np_filter = (
+            inputs[1].transpose(0, 5, 1, 4, 6, 2, 3).reshape(shape_oihw).transpose(2, 3, 1, 0)
+        )
+        ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad)
+        output = build_and_run(
+            inputs,
+            conv2d_impl,
+            target,
+            target,
+            shape_nhwc=shape_nhwc,
+            shape_filter=shape_oihw8i32o4i,
+            kernel_size=(kernel, kernel),
+            stride=(stride, stride),
+            padding=(pad, pad, pad, pad),
+            dtype=dtype,
+        )
+        return output, ref_output
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(sys.argv))
diff --git a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
new file mode 100644
index 000000000000..75df6504611e
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
@@ -0,0 +1,151 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+import tvm
+from tvm import te
+from tvm import topi
+from tvm.topi import testing
+from .infrastructure import ceildiv
+from .infrastructure import get_packed_layout
+from .infrastructure import build_and_run
+
+import numpy as np
+import pytest
+
+# Blocked layout: NHWC8h8w32c :: [N, H//8, W//8, C//32, 8h, 8w, 32c]
+def maxpool2d_logical(
+    shape_nhwc,
+    window_shape,
+    stride,
+    padding,
+    dtype,
+    storage_scope="global",
+):
+    """
+    Maxpool2d TE wherein the input activation is defined by its
+    logical NHWC shape. The packed physical layout for the
+    activation is nhwc8h8w32c.
+    """
+
+    block_shape = 8, 8, 32
+    block_H, block_W, block_C = block_shape
+    shape = get_packed_layout(shape_nhwc, block_shape)
+    logical_output_shape = (
+        shape_nhwc[0],
+        (shape_nhwc[1] - window_shape[0] + padding[0] + padding[1]) // stride[0] + 1,
+        (shape_nhwc[2] - window_shape[1] + padding[2] + padding[3]) // stride[0] + 1,
+        shape_nhwc[3],
+    )
+    output_shape = get_packed_layout(logical_output_shape, block_shape)
+
+    N, H, W, C = shape_nhwc
+    X = te.placeholder(shape_nhwc, dtype=dtype)
+
+    # Combination of padding required by maxpool operator and padding to evenly divisible
+    # number of blocks. Note that this padding should be inlined in the schedule so
+    # as to avoid input copying.
+    pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H
+    pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W
+    X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0)
+
+    # Calculate packed layout
+    X_packed = te.compute(
+        shape,
+        lambda n, ho, wo, co, hi, wi, ci: X_pad[
+            n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci
+        ],
+    )
+
+    rh = te.reduce_axis((0, window_shape[0]), name="rh")
+    rw = te.reduce_axis((0, window_shape[1]), name="rw")
+
+    def compute(n, ho, wo, co, hi, wi, ci):
+        # Construct blockized strided maxpool height indices
+        h = ho * block_H + hi
+        h_contig = h * stride[0] + rh
+        h_block_id = h_contig // block_H
+        h_block_offset = h_contig % block_H
+
+        # Construct blockized strided maxpool width indices
+        w = wo * block_W + wi
+        w_contig = w * stride[1] + rw
+        w_block_id = w_contig // block_W
+        w_block_offset = w_contig % block_W
+
+        return te.max(
+            X_packed[n, h_block_id, w_block_id, co, h_block_offset, w_block_offset, ci],
+            axis=[rh, rw],
+        )
+
+    Y = te.compute(output_shape, compute)
+    s = te.create_schedule(Y.op)
+
+    # Ensure the padding and array packing is performed inline
+    s[X_pad].compute_inline()
+    s[X_packed].compute_inline()
+
+    binds = {}
+    if storage_scope and storage_scope != "global":
+        with tvm.transform.PassContext():
+            Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope)
+            Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope)
+            binds = {X: Xb, Y: Yb}
+
+    return (s, [X, Y], binds)
+
+
+class BaseMaxPooling:
+    batch = tvm.testing.parameter(1)
+    in_size = tvm.testing.parameter(8, 112)
+    in_channel = tvm.testing.parameter(64)
+    window_size = tvm.testing.parameter(3)
+    stride = tvm.testing.parameter(2)
+    pad = tvm.testing.parameter(1)
+    dtype = tvm.testing.parameter("float32")
+
+
+class TestMaxPooling(BaseMaxPooling):
+    @tvm.testing.parametrize_targets("llvm")
+    def test_maxpool(self, shape_nhwc, window_size, stride, pad, dtype, target):
+        inputs = [np.random.uniform(0, 255, size=shape_nhwc).astype(dtype)]
+        ref_output = testing.poolnd_python(
+            inputs[0],
+            (window_size, window_size),
+            strides=(stride, stride),
+            dilation=(1, 1),
+            padding_before=(pad, pad),
+            padding_after=(pad, pad),
+            pool_type="max",
+        )
+        output = build_and_run(
+            inputs,
+            maxpool2d_logical,
+            target,
+            target,
+            shape_nhwc,
+            window_shape=(window_size, window_size),
+            stride=(stride, stride),
+            padding=(pad, pad, pad, pad),
+            dtype=dtype,
+        )
+        return output, ref_output
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(sys.argv))

From ae5b095c4a887a8c3b7bfe2e3a2611f1e0c44453 Mon Sep 17 00:00:00 2001
From: Chris Sullivan <csullivan@octoml.ai>
Date: Thu, 9 Sep 2021 16:36:33 -0700
Subject: [PATCH 2/2] Restructure based on review comments

---
 tests/python/contrib/test_hexagon/conftest.py |  13 +-
 .../contrib/test_hexagon/infrastructure.py    |  47 ++++++--
 .../test_hexagon/test_conv2d_blocked.py       | 111 +++++++++---------
 .../test_hexagon/test_maxpool2d_blocked.py    |  16 ++-
 4 files changed, 106 insertions(+), 81 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/conftest.py b/tests/python/contrib/test_hexagon/conftest.py
index 8213c33123d8..0329328de3df 100644
--- a/tests/python/contrib/test_hexagon/conftest.py
+++ b/tests/python/contrib/test_hexagon/conftest.py
@@ -19,7 +19,7 @@
     values from testing parameters """
 
 import tvm
-from .infrastructure import ceildiv
+from .infrastructure import get_packed_filter_layout
 
 
 @tvm.testing.fixture
@@ -34,13 +34,4 @@ def shape_oihw(out_channel, in_channel, kernel):
 
 @tvm.testing.fixture
 def shape_oihw8i32o4i(out_channel, in_channel, kernel):
-    out_factor, in_factor, in_second_factor = 32, 32, 4
-    return (
-        int(ceildiv(out_channel, out_factor)),
-        int(ceildiv(in_channel, in_factor)),
-        kernel,
-        kernel,
-        in_factor // in_second_factor,
-        out_factor,
-        in_second_factor,
-    )
+    return get_packed_filter_layout(out_channel, in_channel, kernel, kernel)
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index 9c7ba5b1b6a3..193a8630c3d2 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -25,23 +25,37 @@ def ceildiv(o, d):
     return tvm.tir.floordiv(o + d - 1, d)
 
 
-def get_packed_layout(logical_shape_nhwc, block_shape, packed_C=True):
-    shape = [logical_shape_nhwc[0]]
+def get_packed_activation_layout(shape_nhwc, block_shape, packed_C=True):
+    assert len(shape_nhwc) == 4
+    shape = [shape_nhwc[0]]
     off_h, off_w, off_c = block_shape
-    shape.append(ceildiv(logical_shape_nhwc[1], off_h))
-    shape.append(ceildiv(logical_shape_nhwc[2], off_w))
+    shape.append(ceildiv(shape_nhwc[1], off_h))
+    shape.append(ceildiv(shape_nhwc[2], off_w))
     if packed_C:
-        shape.append(ceildiv(logical_shape_nhwc[3], off_c))
+        shape.append(ceildiv(shape_nhwc[3], off_c))
         shape.extend(block_shape)
     else:
-        shape.extend([off_h, off_w, logical_shape_nhwc[-1]])
+        shape.extend([off_h, off_w, shape_nhwc[3]])
     return shape
 
 
+def get_packed_filter_layout(out_channel, in_channel, kernel_h, kernel_w):
+    out_factor, in_first_factor, in_second_factor = 32, 32, 4
+    return (
+        int(ceildiv(out_channel, out_factor)),
+        int(ceildiv(in_channel, in_first_factor)),
+        kernel_h,
+        kernel_w,
+        in_first_factor // in_second_factor,
+        out_factor,
+        in_second_factor,
+    )
+
+
 def build_and_run(inputs, func, target, target_host, *args, **kwargs):
-    s, placeholders, binds = func(*args, **kwargs)
+    schedule, placeholders, binds = func(*args, **kwargs)
 
-    func = tvm.build(s, placeholders, target=target, target_host=target_host, binds=binds)
+    func = tvm.build(schedule, placeholders, target=target, target_host=target_host, binds=binds)
     dev = tvm.device(target)
     tensors = []
     for tensor in inputs:
@@ -55,3 +69,20 @@ def build_and_run(inputs, func, target, target_host, *args, **kwargs):
     func(*tensors)
 
     return tensors[-1].asnumpy()
+
+
+def get_block_shape():
+    return 8, 8, 32
+
+
+def get_conv2d_nhwc_shape(shape_nhwc, kernel_size, strides, padding, dilation, out_channels):
+    assert len(shape_nhwc) == 4
+    kernel = []
+    kernel.append((kernel_size[0] - 1) * dilation[0] + 1)
+    kernel.append((kernel_size[1] - 1) * dilation[1] + 1)
+    return (
+        shape_nhwc[0],
+        (shape_nhwc[1] - kernel[0] + padding[0] + padding[1]) // strides[0] + 1,
+        (shape_nhwc[2] - kernel[1] + padding[2] + padding[3]) // strides[1] + 1,
+        out_channels,
+    )
diff --git a/tests/python/contrib/test_hexagon/test_conv2d_blocked.py b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py
index a5a766b5b5df..e0b7fb20ab8e 100644
--- a/tests/python/contrib/test_hexagon/test_conv2d_blocked.py
+++ b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py
@@ -21,9 +21,14 @@
 from tvm import te
 from tvm import topi
 from tvm.topi import testing
-from .infrastructure import ceildiv
-from .infrastructure import get_packed_layout
-from .infrastructure import build_and_run
+from .infrastructure import (
+    ceildiv,
+    build_and_run,
+    get_block_shape,
+    get_conv2d_nhwc_shape,
+    get_packed_filter_layout,
+    get_packed_activation_layout,
+)
 
 import numpy as np
 import pytest
@@ -31,7 +36,7 @@
 
 def conv2d_logical(
     shape_nhwc,
-    shape_kcrs,
+    shape_oihw,
     kernel_size,
     stride,
     padding,
@@ -45,18 +50,15 @@ def conv2d_logical(
       Activation: nhwc8h8w32c
       Filter: oihw8i32o4i
     """
-    assert kernel_size == tuple(shape_kcrs[2:])
+    assert kernel_size == tuple(shape_oihw[2:])
 
-    block_shape = 8, 8, 32
+    block_shape = get_block_shape()
     block_H, block_W, block_C = block_shape
-    shape = get_packed_layout(shape_nhwc, block_shape)
-    logical_output_shape = (
-        shape_nhwc[0],
-        (shape_nhwc[1] - kernel_size[0] + padding[0] + padding[1]) // stride[0] + 1,
-        (shape_nhwc[2] - kernel_size[1] + padding[2] + padding[3]) // stride[0] + 1,
-        shape_nhwc[3],
+    shape = get_packed_activation_layout(shape_nhwc, block_shape)
+    logical_output_shape = get_conv2d_nhwc_shape(
+        shape_nhwc, kernel_size, stride, padding, [1, 1], shape_oihw[0]
     )
-    output_shape = get_packed_layout(logical_output_shape, block_shape)
+    output_shape = get_packed_activation_layout(logical_output_shape, block_shape)
 
     N, H, W, C = shape_nhwc
     X = te.placeholder(shape_nhwc, dtype=dtype)
@@ -75,18 +77,10 @@ def conv2d_logical(
     )
 
     # Filter shape using KCRS (OIHW) notation
-    K, C, R, S = shape_kcrs
+    K, C, R, S = shape_oihw
     filter_Ki, filter_Ci, filter_Cii = 32, 32, 4
-    shape_filter = [
-        ceildiv(K, filter_Ki),
-        ceildiv(C, filter_Ci),
-        R,
-        S,
-        filter_Ci // filter_Cii,
-        filter_Ki,
-        filter_Cii,
-    ]
-    filt = te.placeholder(shape_kcrs, dtype=dtype)
+    shape_filter = get_packed_filter_layout(K, C, R, S)
+    filt = te.placeholder(shape_oihw, dtype=dtype)
     # Channel padding to multiples of 32
     pad_c = (filter_Ci - (C % filter_Ci)) % filter_Ci
     pad_k = (filter_Ki - (K % filter_Ki)) % filter_Ki
@@ -163,7 +157,7 @@ def compute(n, ho, wo, ko, hi, wi, ki):
 
 def conv2d_packed_filter(
     shape_nhwc,
-    shape_filter,
+    shape_oihw8i32o4i,
     kernel_size,
     stride,
     padding,
@@ -176,18 +170,21 @@ def conv2d_packed_filter(
     packed layout oihw8i32o4i. The physical packed layout used
     for the activation is: nhwc8h8w32c
     """
-    assert kernel_size == tuple(shape_filter[2:4])
+    assert kernel_size == tuple(shape_oihw8i32o4i[2:4])
 
-    block_shape = 8, 8, 32
+    block_shape = get_block_shape()
     block_H, block_W, block_C = block_shape
-    shape = get_packed_layout(shape_nhwc, block_shape)
-    logical_output_shape = (
-        shape_nhwc[0],
-        (shape_nhwc[1] - kernel_size[0] + padding[0] + padding[1]) // stride[0] + 1,
-        (shape_nhwc[2] - kernel_size[1] + padding[2] + padding[3]) // stride[0] + 1,
-        shape_nhwc[3],
+    shape = get_packed_activation_layout(shape_nhwc, block_shape)
+    logical_output_shape = get_conv2d_nhwc_shape(
+        shape_nhwc,
+        kernel_size,
+        stride,
+        padding,
+        [1, 1],
+        shape_oihw8i32o4i[0] * shape_oihw8i32o4i[5],
     )
-    output_shape = get_packed_layout(logical_output_shape, block_shape)
+
+    output_shape = get_packed_activation_layout(logical_output_shape, block_shape)
 
     N, H, W, C = shape_nhwc
     X = te.placeholder(shape_nhwc, dtype=dtype)
@@ -199,7 +196,7 @@ def conv2d_packed_filter(
 
     X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0)
     # Calculate packed layout
-    packed_shape = get_packed_layout(X_pad.shape, block_shape)
+    packed_shape = get_packed_activation_layout(X_pad.shape, block_shape)
 
     X_packed = te.compute(
         packed_shape,
@@ -210,11 +207,11 @@ def conv2d_packed_filter(
 
     # Filter shape using KCRS (OIHW) notation
     filter_Ki, filter_Ci, filter_Cii = 32, 32, 4
-    assert shape_filter[-1] == filter_Cii
-    assert shape_filter[-2] == filter_Ki
-    assert shape_filter[-3] == filter_Ci // filter_Cii
+    assert shape_oihw8i32o4i[-1] == filter_Cii
+    assert shape_oihw8i32o4i[-2] == filter_Ki
+    assert shape_oihw8i32o4i[-3] == filter_Ci // filter_Cii
 
-    filt_packed = te.placeholder(shape_filter, dtype=dtype)
+    filt_packed = te.placeholder(shape_oihw8i32o4i, dtype=dtype)
 
     rh = te.reduce_axis((0, kernel_size[0]), name="rh")
     rw = te.reduce_axis((0, kernel_size[1]), name="rw")
@@ -285,7 +282,7 @@ def compute(n, ho, wo, ko, hi, wi, ki):
 
 def conv2d_packed_filter_nhwhwc(
     shape_nhwc,
-    shape_filter,
+    shape_oihw8i32o4i,
     kernel_size,
     stride,
     padding,
@@ -299,18 +296,20 @@ def conv2d_packed_filter_nhwhwc(
     for the activation is: nhw8h8wc
 
     """
-    assert kernel_size == tuple(shape_filter[2:4])
+    assert kernel_size == tuple(shape_oihw8i32o4i[2:4])
 
-    block_shape = 8, 8, 32
+    block_shape = get_block_shape()
     block_H, block_W, _ = block_shape
-    shape = get_packed_layout(shape_nhwc, block_shape, packed_C=False)
-    logical_output_shape = (
-        shape_nhwc[0],
-        (shape_nhwc[1] - kernel_size[0] + padding[0] + padding[1]) // stride[0] + 1,
-        (shape_nhwc[2] - kernel_size[1] + padding[2] + padding[3]) // stride[0] + 1,
-        shape_nhwc[3],
+    shape = get_packed_activation_layout(shape_nhwc, block_shape, packed_C=False)
+    logical_output_shape = get_conv2d_nhwc_shape(
+        shape_nhwc,
+        kernel_size,
+        stride,
+        padding,
+        [1, 1],
+        shape_oihw8i32o4i[0] * shape_oihw8i32o4i[5],
     )
-    output_shape = get_packed_layout(logical_output_shape, block_shape, packed_C=False)
+    output_shape = get_packed_activation_layout(logical_output_shape, block_shape, packed_C=False)
 
     N, H, W, C = shape_nhwc
     X = te.placeholder(shape_nhwc, dtype=dtype)
@@ -321,18 +320,18 @@ def conv2d_packed_filter_nhwhwc(
     pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W
     X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0)
     # Calculate packed layout
-    packed_shape = get_packed_layout(X_pad.shape, block_shape, packed_C=False)
+    packed_shape = get_packed_activation_layout(X_pad.shape, block_shape, packed_C=False)
     X_packed = te.compute(
         packed_shape, lambda n, ho, wo, hi, wi, c: X_pad[n, ho * block_H + hi, wo * block_W + wi, c]
     )
 
     # Filter shape using KCRS (OIHW) notation
     filter_Ki, filter_Ci, filter_Cii = 32, 32, 4
-    assert shape_filter[-1] == filter_Cii
-    assert shape_filter[-2] == filter_Ki
-    assert shape_filter[-3] == filter_Ci // filter_Cii
+    assert shape_oihw8i32o4i[-1] == filter_Cii
+    assert shape_oihw8i32o4i[-2] == filter_Ki
+    assert shape_oihw8i32o4i[-3] == filter_Ci // filter_Cii
 
-    filt_packed = te.placeholder(shape_filter, dtype=dtype)
+    filt_packed = te.placeholder(shape_oihw8i32o4i, dtype=dtype)
 
     rh = te.reduce_axis((0, kernel_size[0]), name="rh")
     rw = te.reduce_axis((0, kernel_size[1]), name="rw")
@@ -422,7 +421,7 @@ def test_conv2d(self, shape_nhwc, shape_oihw, kernel, stride, pad, dtype, target
             target,
             target,
             shape_nhwc=shape_nhwc,
-            shape_kcrs=shape_oihw,
+            shape_oihw=shape_oihw,
             kernel_size=(kernel, kernel),
             stride=(stride, stride),
             padding=(pad, pad, pad, pad),
@@ -461,7 +460,7 @@ def test_conv2d(
             target,
             target,
             shape_nhwc=shape_nhwc,
-            shape_filter=shape_oihw8i32o4i,
+            shape_oihw8i32o4i=shape_oihw8i32o4i,
             kernel_size=(kernel, kernel),
             stride=(stride, stride),
             padding=(pad, pad, pad, pad),
diff --git a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
index 75df6504611e..67af8d87f708 100644
--- a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
+++ b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
@@ -21,9 +21,13 @@
 from tvm import te
 from tvm import topi
 from tvm.topi import testing
-from .infrastructure import ceildiv
-from .infrastructure import get_packed_layout
-from .infrastructure import build_and_run
+from .infrastructure import (
+    ceildiv,
+    build_and_run,
+    get_block_shape,
+    get_packed_filter_layout,
+    get_packed_activation_layout,
+)
 
 import numpy as np
 import pytest
@@ -43,16 +47,16 @@ def maxpool2d_logical(
     activation is nhwc8h8w32c.
     """
 
-    block_shape = 8, 8, 32
+    block_shape = get_block_shape()
     block_H, block_W, block_C = block_shape
-    shape = get_packed_layout(shape_nhwc, block_shape)
+    shape = get_packed_activation_layout(shape_nhwc, block_shape)
     logical_output_shape = (
         shape_nhwc[0],
         (shape_nhwc[1] - window_shape[0] + padding[0] + padding[1]) // stride[0] + 1,
         (shape_nhwc[2] - window_shape[1] + padding[2] + padding[3]) // stride[0] + 1,
         shape_nhwc[3],
     )
-    output_shape = get_packed_layout(logical_output_shape, block_shape)
+    output_shape = get_packed_activation_layout(logical_output_shape, block_shape)
 
     N, H, W, C = shape_nhwc
     X = te.placeholder(shape_nhwc, dtype=dtype)