From 49fe19e597fa14bf221e5c15952139192efc8b30 Mon Sep 17 00:00:00 2001 From: Anirudh Sundar Date: Tue, 18 Oct 2022 14:16:01 +0530 Subject: [PATCH 1/2] [Hexagon] Add HVX quant conv2d implementation This patch adds a new HVX intrinsic implementation to perform quantized convolution. It assumes that the qnn.conv2d relay op is not canonicalized and all the quantization parameters (scales and zero points) are passed into the intrinsic implementation. It also uses the fixed point computation function defined in hexagon topi utils to compute a fixed point (combined) scale which is used to perform the final requantization before returning the quantized output. --- cmake/modules/Hexagon.cmake | 9 + .../tvm => src}/runtime/hexagon/ops/conv2d.h | 138 +++++++- src/runtime/hexagon/ops/conv2d_fp16_hvx.cc | 57 ++-- src/runtime/hexagon/ops/conv2d_quant_hvx.cc | 319 ++++++++++++++++++ src/runtime/hexagon/ops/conv_utils.cc | 170 ++++------ .../hexagon/hexagon_conv_utils_test.h | 102 ++++++ .../hexagon/hexagon_fp16_utils_tests.cc | 96 ++---- .../hexagon/hexagon_quant_utils_tests.cc | 224 ++++++++++++ .../contrib/test_hexagon/infrastructure.py | 2 +- .../topi/test_conv2d_quant_intrin.py | 261 ++++++++++++++ 10 files changed, 1172 insertions(+), 206 deletions(-) rename {include/tvm => src}/runtime/hexagon/ops/conv2d.h (55%) create mode 100644 src/runtime/hexagon/ops/conv2d_quant_hvx.cc create mode 100644 tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h create mode 100644 tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc create mode 100644 tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index 31cece8a19e0..887211893558 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -178,6 +178,15 @@ if(BUILD_FOR_HEXAGON) "${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc" ) + include_directories( + "${TVMRT_SOURCE_DIR}/hexagon/ops" + ) + + set_source_files_properties( + "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_quant_hvx.cc" + PROPERTIES COMPILE_FLAGS "-mhvx" + ) + set_source_files_properties( "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc" PROPERTIES COMPILE_FLAGS "-mhvx" diff --git a/include/tvm/runtime/hexagon/ops/conv2d.h b/src/runtime/hexagon/ops/conv2d.h similarity index 55% rename from include/tvm/runtime/hexagon/ops/conv2d.h rename to src/runtime/hexagon/ops/conv2d.h index d759149727e8..3501441f0a8c 100644 --- a/include/tvm/runtime/hexagon/ops/conv2d.h +++ b/src/runtime/hexagon/ops/conv2d.h @@ -20,6 +20,7 @@ #include #include +#include #include #ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ @@ -28,6 +29,7 @@ namespace tvm { namespace runtime { namespace hexagon { +namespace conv_utils { static constexpr auto hexagon_device = DLDevice{static_cast(kDLHexagon), 0}; // Standalone DLTensor: the standalone-ness means that this object owns the shape @@ -75,7 +77,7 @@ inline void* to_ptr(uintptr_t v) { return reinterpret_cast(v); } inline uintptr_t to_uint(void* ptr) { return reinterpret_cast(ptr); } -constexpr int xyc_to_sm_16b(int y, int x, int c) { +inline constexpr int yxc_to_sm_16b(int y, int x, int c) { // Map y,x,c coordinates within a block to the offset (in 16-bit elements) // from the beginning of the block in spatial-major layout. 
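  // A worked example of this mapping (illustrative): y=1, x=3, c=5 gives
  //   (1 << 7) | ((3 & 2) << 5) | (5 << 1) | (3 & 1) = 128 + 64 + 10 + 1 = 203,
  // i.e. element (h%8=1, (w%4)//2=1, c%32=5, w%2=1) of the 8x4x32 fp16 block.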
// 10-bit spatial mask: yyyxcccccx @@ -83,7 +85,23 @@ constexpr int xyc_to_sm_16b(int y, int x, int c) { return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); } -constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { +inline constexpr int yxc_to_sm_8b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 8-bit elements) + // from the beginning of the block in spatial-major layout. + // 10-bit spatial mask: yyyxxxccccc + return y << 8 | x << 5 | c; +} + +inline constexpr int hwio_to_sm_8b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 8-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiioooooii, where p..p are position bits. + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1c) << 5 | o << 2 | (i & 3); +} + +inline constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { // Map y,x,i,o coordinates within a chunk (assuming the origin at the // top-left spatial corner) to the offset (in 16-bit elements) from the // beginning of the chunk in spatial-major layout. @@ -123,6 +141,10 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { * The input is mapped into the below mentioned layout (notation similar to index map used for * transform layout): * + * For uint8_t type + * lambda n, h, w, c: n, h//8, w//8, c//32, AXIS_SEPARATOR, h%8, w%8, c%32 + * + * For uint16_t type * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2 * * where AXIS_SEPARATOR represents split up in the physical layout @@ -133,7 +155,48 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { * @param width * @param depth */ -void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth); +template +void blockize_hwc(void* out, void* inp_flat, int height, int width, int depth) { + int (*index_func)(int, int, int); + if constexpr (std::is_same_v) + index_func = yxc_to_sm_8b; + else if constexpr (std::is_same_v) + index_func = yxc_to_sm_16b; + else + LOG_ERROR << "blockize_hwc is only supported for uint8_t and uint16_t types"; + + auto inp_data = static_cast(inp_flat); + auto out_data = static_cast(out); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += block_height) { + for (int cx = 0; cx < width; cx += block_width) { + for (int cc = 0; cc < depth; cc += block_depth) { + auto block = reinterpret_cast(*out_data++); + int max_y = std::min(block_height, height - cy); + int max_x = std::min(block_width, width - cx); + int max_c = std::min(block_depth, depth - cc); + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + block[index_func(y, x, c)] = + inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)]; + } + for (int c = max_c; c < block_depth; ++c) block[index_func(y, x, c)] = 0; + } + for (int x = max_x; x < block_width; ++x) { + for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0; + } + } + + for (int y = max_y; y < block_height; ++y) + for (int x = 0; x < block_width; ++x) + for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0; + } // cc + } // cx + } // cy +} /** * @brief Convert back from non-contguous layout to a flat layout @@ -144,7 +207,42 @@ void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int dept * @param width * 
@param depth */ -void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth); +template +void deblockize_hwc(void* out_flat, void* inp, int height, int width, int depth) { + int (*index_func)(int, int, int); + if constexpr (std::is_same_v) + index_func = yxc_to_sm_8b; + else if constexpr (std::is_same_v) + index_func = yxc_to_sm_16b; + else + LOG_ERROR << "deblockize_hwc is only supported for uint8_t and uint16_t types"; + + uintptr_t* inp_data = static_cast(inp); + T* out_data = static_cast(out_flat); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += block_height) { + for (int cx = 0; cx < width; cx += block_width) { + for (int cc = 0; cc < depth; cc += block_depth) { + auto block = reinterpret_cast(*inp_data); + int max_y = std::min(block_height, height - cy); + int max_x = std::min(block_width, width - cx); + int max_c = std::min(block_depth, depth - cc); + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] = + block[index_func(y, x, c)]; + } + } + } + + inp_data++; + } + } + } +} /** * @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below: @@ -175,15 +273,42 @@ void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int de */ void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width, int idepth, int odepth); +void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width, + int idepth, int odepth); +template SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat, - bool copy_data); + bool copy_data) { + tvm::runtime::String vtcm_scope = "global.vtcm"; -int calculate_num_weight_chunks(int64_t* shape_hwio); + // Allocate blocks for activations. We will use the block pointers + // directly from the allocated area. 
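  // Typical instantiations, assuming the <T, block_height, block_width, block_depth>
  // template parameter order used elsewhere in this patch:
  //   prepare_nhwc<uint16_t, 8, 4, 32>(device_api, act_flat, /*copy_data=*/true);  // fp16, 8x4x32 blocks
  //   prepare_nhwc<uint8_t,  8, 8, 32>(device_api, act_flat, /*copy_data=*/true);  // uint8, 8x8x32 blocks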
+ int n = nhwc_flat->shape[0]; + int h = round_up(nhwc_flat->shape[1], block_height); + int w = round_up(nhwc_flat->shape[2], block_width); + int c = round_up(nhwc_flat->shape[3], block_depth); + int64_t shape_2d[2] = {(n * h * w * c) / (block_height * block_width * block_depth), + block_height * block_width * block_depth}; + void* nhwc_vtcm = + device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope); + if (copy_data) { + blockize_hwc( + nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], nhwc_flat->shape[3]); + } + + return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm, + {n, h / block_height, w / block_width, c / block_depth}); +} + +int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width, + int chunk_in_channel, int chunk_out_channel); SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, int num_chunks, void** ptr_table); +SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, + int num_chunks, void** ptr_table, int wgt_zp = 0); + template void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor& tensor) { if (auto* data_space = tensor.GetDataSpace()) { @@ -191,6 +316,7 @@ void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor& tensor) { } } +} // namespace conv_utils } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc index a478fbab352d..53ea0868ad2a 100644 --- a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc +++ b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc @@ -27,7 +27,7 @@ #include #include -#include "tvm/runtime/hexagon/ops/conv2d.h" +#include "conv2d.h" // Current limitations: // - N in NHWC must be 1 @@ -68,7 +68,7 @@ namespace hexagon { */ static inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi, int xio, int ci, int xii, const DLTensor& tensor) { - auto block_ptr = nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c); + auto block_ptr = conv_utils::nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c); auto block_offset = yi * 128 + xio * 64 + ci * 2 + xii; auto first_element_ptr = reinterpret_cast(block_ptr); return first_element_ptr + block_offset; @@ -279,10 +279,10 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) } int fx = (fw < wgt_chunk_thin_width) ? fw : ((fw - wgt_chunk_thin_width) % 4); int fy = fh % 8; - for (int c = 0; c < round_up(filt_idepth, 2); c += 2) { + for (int c = 0; c < conv_utils::round_up(filt_idepth, 2); c += 2) { int out_act_cc = c / 32; int ci = c % 32; - auto wgt_chunk = hwio_at(cr_filt, fch, fcw, out_act_cc, out_c); + auto wgt_chunk = conv_utils::hwio_at(cr_filt, fch, fcw, out_act_cc, out_c); // Find weight chunk offset ptr int max_x = (fcw == 0) ? 
wgt_chunk_thin_width : 4; @@ -306,7 +306,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) true_wo, ci, true_wi, cr_act); HVX_Vector act_vec = getInputVector(act_element_ptr); - auto wgt_chunk_offset = hwio_to_sm_16b(max_x, fy, fx, ci, 0); + auto wgt_chunk_offset = conv_utils::hwio_to_sm_16b(max_x, fy, fx, ci, 0); auto base_chunk_ptr = reinterpret_cast(wgt_chunk); auto chunk_ptr = base_chunk_ptr + wgt_chunk_offset; @@ -404,7 +404,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, int out_code, void* res_handle) { - namespace hexagonrt = tvm::runtime::hexagon; + namespace conv_utils = tvm::runtime::hexagon::conv_utils; ICHECK_EQ(num_args, 7) << "Unexpected number of arguments"; ICHECK_EQ(type_codes[0], kTVMDLTensorHandle) << "First argument is expected to be the input tensor"; // Input activations @@ -440,50 +440,55 @@ int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* << wgt_flat->shape[2] << "x" << wgt_flat->shape[3] << ", pad_top=" << pad_top << ", pad_left=" << pad_left; - auto* device_api = tvm::runtime::DeviceAPI::Get(hexagonrt::hexagon_device, false); + auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false); ICHECK(device_api != nullptr); tvm::runtime::String vtcm_scope = "global.vtcm"; - auto act_vtcm = hexagonrt::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); + auto act_vtcm = + conv_utils::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); ICHECK_NE(wgt_flat->shape[0], 0) << "Weights height should not be zero"; ICHECK_NE(wgt_flat->shape[1], 0) << "Weights width should not be zero"; ICHECK_NE(wgt_flat->shape[2], 0) << "Weights input channels should not be zero"; ICHECK_NE(wgt_flat->shape[3], 0) << "Weights output channels should not be zero"; - int num_wgt_chunks = hexagonrt::calculate_num_weight_chunks(wgt_flat->shape); + int num_wgt_chunks = conv_utils::calculate_num_weight_chunks( + wgt_flat->shape, /* chunk_height */ 8, /* chunk_width */ 4, /* chunk_in_channel */ 32, + /* chunk_out_channel */ 32); + LOG_INFO << "num_wgt_chunks: " << num_wgt_chunks; auto wgt_ptr_table = reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); - auto wgt_vtcm = hexagonrt::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table); + auto wgt_vtcm = conv_utils::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table); - auto out_vtcm = hexagonrt::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); + auto out_vtcm = + conv_utils::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); // Prepare zero_block int64_t block_nbytes = 2048; - void* zero_block = device_api->AllocDataSpace(hexagonrt::hexagon_device, 1, &block_nbytes, + void* zero_block = device_api->AllocDataSpace(conv_utils::hexagon_device, 1, &block_nbytes, tvm::runtime::DataType::UInt(8), vtcm_scope); memset(zero_block, 0, 2048); // FIXME: Setting bias to zero_block: this works for up to 256 output channels. 
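  // Note: 2048 bytes is exactly one 8x4x32 block of fp16 zeros
  // (8 * 4 * 32 elements * 2 bytes); the same block doubles as the zero bias below.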
auto bias_flat = - hexagonrt::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]); - auto act_shape = hexagonrt::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); - auto filt_shape = hexagonrt::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); - auto pad_shape = hexagonrt::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left}); - auto out_shape = hexagonrt::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); + conv_utils::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]); + auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); + auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); + auto pad_shape = conv_utils::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left}); + auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); bool relu = false; - hexagonrt::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, bias_flat, - filt_shape, pad_shape, relu, stride_h, stride_w, - hexagonrt::to_uint(zero_block)); + tvm::runtime::hexagon::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, + bias_flat, filt_shape, pad_shape, relu, stride_h, + stride_w, conv_utils::to_uint(zero_block)); - hexagonrt::deblockize_hwc_16b(out_flat->data, out_vtcm.data, out_flat->shape[1], - out_flat->shape[2], out_flat->shape[3]); + conv_utils::deblockize_hwc(out_flat->data, out_vtcm.data, out_flat->shape[1], + out_flat->shape[2], out_flat->shape[3]); - device_api->FreeDataSpace(hexagonrt::hexagon_device, zero_block); - hexagonrt::release(device_api, out_vtcm); - hexagonrt::release(device_api, wgt_vtcm); - hexagonrt::release(device_api, act_vtcm); + device_api->FreeDataSpace(conv_utils::hexagon_device, zero_block); + conv_utils::release(device_api, out_vtcm); + conv_utils::release(device_api, wgt_vtcm); + conv_utils::release(device_api, act_vtcm); return 0; } diff --git a/src/runtime/hexagon/ops/conv2d_quant_hvx.cc b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc new file mode 100644 index 000000000000..682eebb137c0 --- /dev/null +++ b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc @@ -0,0 +1,319 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include "conv2d.h" + +extern "C" int conv2d_packed_quant(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle); + +namespace tvm { +namespace runtime { +namespace hexagon { +inline uint8_t* getElementPtr_int8(int block_out_y, int block_out_x, int block_out_c, int yi, + int xi, int ci, const DLTensor& block) { + auto block_ptr = + tvm::runtime::hexagon::conv_utils::nhwc_at(block, 0, block_out_y, block_out_x, block_out_c); + const int width_stride = 32; + const int height_stride = width_stride * 8; + auto block_offset = yi * height_stride + xi * width_stride + ci; + auto first_element_ptr = reinterpret_cast(block_ptr); + return first_element_ptr + block_offset; +} + +inline int8_t* getWgtPtr_int8(int out_i, int out_o, int h, int w, int i, int o, + const DLTensor& wgt_vtcm, int width) { + auto data = static_cast(wgt_vtcm.data); + auto chunk = data[out_i * wgt_vtcm.shape[3] + out_o]; + auto base_chunk_ptr = reinterpret_cast(chunk); + auto wgt_chunk_offset = tvm::runtime::hexagon::conv_utils::hwio_to_sm_8b(width, h, w, i, o); + return base_chunk_ptr + wgt_chunk_offset; +} + +int32_t saturate_uint8(int32_t val) { return std::max(std::min(val, 255), 0); } + +int32_t saturate_int8(int32_t val) { return std::max(std::min(val, 127), -128); } + +/** + * @brief Compute the quantized convolution along with requantize with output quantization params to + * get uint8 outputs + * + * The quantized convolution is represented by the below equation + * out_scale(out_q - out_zp) = Σr,s,c(act_scale(act_q[n,h+r,w+s,c] - act_zp) * + * wgt_scale(wgt_q[r,s,c,o] - wgt_zp)) + * => out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_q[r,s,c,o] - wgt_zp)) + * * (act_scale*wgt_scale/out_scale) + out_zp + * out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_zp_q[r,s,c,o])) * + * (act_scale*wgt_scale/out_scale) + out_zp, where wgt_zp_q = (wgt_q[r,s,c,o] - wgt_zp) + * + * Assumptions/Limitations: + * - Strided convolution is not yet supported so the stride variables are unused + * + * @param cr_out blockized output tensor with zeros already filled in + * @param cr_act blockized activations + * @param cr_filt Chunkified weights as returned from output of prepare_hwio + * @param out_shape Original output shape of the tensor before blockization + * @param act_shape Original input shape + * @param filt_shape Original filter shape + * @param act_scale Quantization scale for activation + * @param act_zp Activations zero point + * @param wgt_scale Quantization scale for weights + * @param wgt_zp Weights zero point + * @param out_scale Quantization scale for output + * @param out_zp Output zero point + * @param fixed_final_scale Fixed point value of final_scale= (act_scale*wgt_scale/out_scale) + * @param scale_factor Scale factor for the fixed_final_scale + */ +void conv_layer_int8_hvx_whole(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) + const DLTensor& cr_filt, const DLTensor& out_shape, + const DLTensor& act_shape, const DLTensor& filt_shape, + float act_scale, int act_zp, float wgt_scale, int wgt_zp, + float out_scale, int out_zp, int fixed_final_scale, + int scale_factor) { + namespace conv_utils = tvm::runtime::hexagon::conv_utils; + int filt_height = filt_shape.shape[0]; + int filt_width = filt_shape.shape[1]; + int filt_idepth = filt_shape.shape[2]; + + int a_depth = cr_act.shape[3]; + + int o_height = cr_out.shape[1]; + int o_width = cr_out.shape[2]; + int o_depth = cr_out.shape[3]; + + int out_height = 
out_shape.shape[1]; + int out_width = out_shape.shape[2]; + + uint8_t act_zp_u8 = static_cast(act_zp); + int8_t wgt_zp_i8 = static_cast(wgt_zp); + + HVX_Vector act_zp_vec = Q6_Vb_vsplat_R(act_zp_u8); + HVX_Vector wgt_zp_vec = Q6_Vb_vsplat_R(wgt_zp_i8); + HVX_VectorPair wgt_zp_vec_pair = Q6_Wh_vsxt_Vb(wgt_zp_vec); + + ICHECK_EQ(a_depth, cr_filt.shape[2]) << "input depth should match weights input channels"; + ICHECK_EQ(o_depth, cr_filt.shape[3]) << "output depth should match the weights output channel"; + + uint32_t scale_u = static_cast(fixed_final_scale); + HVX_Vector scale_vec = Q6_V_vsplat_R(scale_u); + uint32_t new_scale_factor = static_cast(scale_factor - 16); + HVX_Vector out_zp_vec = Q6_V_vsplat_R(out_zp); + + auto computeOutVec = [&cr_act, &cr_filt, &act_zp_vec, &wgt_zp_vec_pair, &out_zp_vec, &scale_vec, + new_scale_factor, filt_height, filt_width, + filt_idepth](int out_h, int out_w, int out_c, int h, int w) -> HVX_Vector { + HVX_Vector out_vec = Q6_V_vzero(); + for (int fh = 0; fh < filt_height; ++fh) { + for (int fw = 0; fw < filt_width; ++fw) { + for (int c = 0; c < conv_utils::round_up(filt_idepth, 4); c += 4) { + int act_h = out_h * 8 + h + fh; + int act_ho = act_h / 8; + int act_hi = act_h % 8; + + int act_w = out_w * 8 + w + fw; + int act_wo = act_w / 8; + int act_wi = act_w % 8; + + int act_co = c / 32; + int act_ci = c % 32; + + uint8_t* act_ptr = + getElementPtr_int8(act_ho, act_wo, act_co, act_hi, act_wi, act_ci, cr_act); + + uint32_t four_act_elems = *reinterpret_cast(act_ptr); + HVX_Vector act_vec = Q6_V_vsplat_R(four_act_elems); + int8_t* wgt_ptr = getWgtPtr_int8(act_co, out_c, fh, fw, act_ci, 0, cr_filt, filt_width); + + HVX_Vector* wgt_vec_ptr = reinterpret_cast(wgt_ptr); + HVX_Vector wgt_vec = *wgt_vec_ptr; + + HVX_VectorPair act_vec_zp_diff = Q6_Wh_vsub_VubVub(act_vec, act_zp_vec); + HVX_VectorPair wgt_i16_vec_nodiff = Q6_Wh_vsxt_Vb(wgt_vec); + HVX_VectorPair wgt_i16_vec = Q6_Wh_vsub_WhWh_sat(wgt_i16_vec_nodiff, wgt_zp_vec_pair); + + out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_lo_W(act_vec_zp_diff), + Q6_V_lo_W(wgt_i16_vec)); + out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_hi_W(act_vec_zp_diff), + Q6_V_hi_W(wgt_i16_vec)); + } + } + } + HVX_Vector mul_vec = Q6_Vw_vmpye_VwVuh(out_vec, scale_vec); + HVX_Vector scaled_vec = Q6_Vw_vasr_VwR(mul_vec, new_scale_factor); + HVX_Vector sum_vec = Q6_Vw_vadd_VwVw(scaled_vec, out_zp_vec); + return sum_vec; + }; + + auto saturateAndStore = [&cr_out, &computeOutVec](int out_h, int out_w, int out_c, int h, int w) { + uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out); + HVX_Vector* out_vec_ptr = reinterpret_cast(out_ptr); + HVX_Vector out_vec1, out_vec2, out_vec3, out_vec4, out_vec; + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); + out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2); + out_vec4 = computeOutVec(out_h, out_w, out_c, h, w + 3); + + HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); + HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(out_vec4, out_vec3); + out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1); + *out_vec_ptr = out_vec; + }; + + for (int out_c = 0; out_c < o_depth; ++out_c) { + for (int out_h = 0; out_h < o_height; ++out_h) { + int max_y = std::min(8, out_height - out_h * 8); + for (int out_w = 0; out_w < o_width; ++out_w) { + int max_x = std::min(8, out_width - out_w * 8); + for (int h = 0; h < max_y; ++h) { + if (max_x == 8) { + for (int w = 0; w < max_x; w += 4) { + 
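                // saturateAndStore packs four adjacent width positions (w .. w+3),
                // 32 output channels each, into one 128-byte uint8 HVX vector.
                // Scalar sketch of the requantize step performed in computeOutVec:
                //   acc   = sum_{r,s,c} (act_q - act_zp) * (wgt_q - wgt_zp)
                //   out_q = ((acc * fixed_final_scale) >> scale_factor) + out_zp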
saturateAndStore(out_h, out_w, out_c, h, w); + } + } else { + int w = 0; + if (max_x >= 4) { + saturateAndStore(out_h, out_w, out_c, h, w); + w = 4; + } + uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out); + HVX_Vector* out_vec_ptr = reinterpret_cast(out_ptr); + HVX_Vector out_vec1, out_vec2, out_vec3, out_vec; + if (max_x % 4 == 1) { + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec1); + out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec); + *out_vec_ptr = out_vec; + } else if (max_x % 4 == 2) { + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); + HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); + out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec); + *out_vec_ptr = out_vec; + } else if (max_x % 4 == 3) { + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); + out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2); + HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); + HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec3); + out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1); + *out_vec_ptr = out_vec; + } + } + } + } + } + } +} + +} // namespace hexagon +} // namespace runtime +} // namespace tvm + +int conv2d_packed_quant(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle) { + namespace conv_utils = tvm::runtime::hexagon::conv_utils; + ICHECK_EQ(num_args, 13) << "Unexpected number of arguments"; + ICHECK_EQ(type_codes[0], kTVMDLTensorHandle) + << "First argument is expected to be the input tensor"; // Input activations + ICHECK_EQ(type_codes[1], kTVMDLTensorHandle) + << "Second argument is expected to be the weights tensor"; // Weights + ICHECK_EQ(type_codes[2], kDLFloat) << "Third argument is expected to be the activation scale"; + ICHECK_EQ(type_codes[3], kDLInt) << "Fourth argument is expected to be the activation zero point"; + ICHECK_EQ(type_codes[4], kDLFloat) << "Fifth argument is expected to be the weight scale"; + ICHECK_EQ(type_codes[5], kDLInt) << "Sixth argument is expected to be the weight zero point"; + ICHECK_EQ(type_codes[6], kDLFloat) << "Seventh argument is expected to be the output scale"; + ICHECK_EQ(type_codes[7], kDLInt) << "Eigth argument is expected to be the output zero point"; + ICHECK_EQ(type_codes[8], kDLInt) << "Nineth argument is expected to be the stride_h"; // stride_h + ICHECK_EQ(type_codes[9], kDLInt) << "Tenth argument is expected to be the stride_w"; // stride_w + ICHECK_EQ(type_codes[10], kDLInt) << "Eleventh argument is expected to be fixed final scale"; + ICHECK_EQ(type_codes[11], kDLInt) << "Twelfth argument is expected to be scale factor"; + ICHECK_EQ(type_codes[12], kTVMDLTensorHandle) + << "Thirteenth argument is expected to be the output tensor"; // output + + auto* act_flat = static_cast(args[0].v_handle); + auto* wgt_flat = static_cast(args[1].v_handle); + auto* out_flat = static_cast(args[12].v_handle); + + // Temporary assertion until multiple batches are supported + ICHECK_EQ(act_flat->shape[0], 1) << "Input batch size more than 1 is not supported yet"; + + // Temporary assertion until multiple batches are supported + ICHECK_EQ(out_flat->shape[0], 1) << "Output batch size more than 1 is not supported yet"; + + float act_scale = args[2].v_float64; + int act_zp = args[3].v_int64; + LOG_INFO << "act_scale: " << act_scale << 
", act_zp: " << act_zp; + + float wgt_scale = args[4].v_float64; + int wgt_zp = args[5].v_int64; + LOG_INFO << "wgt_scale: " << wgt_scale << ", wgt_zp: " << wgt_zp; + + float out_scale = args[6].v_float64; + int out_zp = args[7].v_int64; + LOG_INFO << "out_scale: " << out_scale << ", out_zp: " << out_zp; + + int stride_h = args[8].v_int64; + int stride_w = args[9].v_int64; + LOG_INFO << "stride_h: " << stride_h << ", stride_w: " << stride_w; + + int fixed_final_scale = args[10].v_int64; + int scale_factor = args[11].v_int64; + LOG_INFO << "fixed_final_scale: " << fixed_final_scale << ", scale_factor: " << scale_factor; + + auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false); + ICHECK(device_api != nullptr); + tvm::runtime::String vtcm_scope = "global.vtcm"; + + auto act_vtcm = + conv_utils::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); + + int num_wgt_chunks = conv_utils::calculate_num_weight_chunks( + wgt_flat->shape, /* chunk_height */ wgt_flat->shape[0], + /* chunk_width */ wgt_flat->shape[1], /* chunk_in_channel */ 32, /* chunk_out_channel */ 32); + auto wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + + auto wgt_vtcm = + conv_utils::prepare_hwio_8b(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table, wgt_zp); + + auto out_vtcm = + conv_utils::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); + + auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); + auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); + auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); + + tvm::runtime::hexagon::conv_layer_int8_hvx_whole( + out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, filt_shape, act_scale, act_zp, wgt_scale, + wgt_zp, out_scale, out_zp, fixed_final_scale, scale_factor); + + conv_utils::deblockize_hwc(out_flat->data, out_vtcm.data, out_flat->shape[1], + out_flat->shape[2], out_flat->shape[3]); + + conv_utils::release(device_api, out_vtcm); + conv_utils::release(device_api, wgt_vtcm); + conv_utils::release(device_api, act_vtcm); + + return 0; +} diff --git a/src/runtime/hexagon/ops/conv_utils.cc b/src/runtime/hexagon/ops/conv_utils.cc index b10f7cc315b2..a40e23e463d4 100644 --- a/src/runtime/hexagon/ops/conv_utils.cc +++ b/src/runtime/hexagon/ops/conv_utils.cc @@ -17,96 +17,69 @@ * under the License. */ -#include "tvm/runtime/hexagon/ops/conv2d.h" +#include + +#include "conv2d.h" namespace tvm { namespace runtime { namespace hexagon { +namespace conv_utils { /** - * @brief Function to "blockize" the flat input data - * The term "blockize" is used to mention that the data is stored in non-contiguous blocks + * @brief Convert the layout of weights from flat to "chunked". 
The term chunked is explained below: * - * The input is mapped into the below mentioned layout (notation similar to index map used for - * transform layout): + * Weights are packed into the below mentioned layout (notation similar to index map): + * Since weights cannot be exactly represented into a index map notation, the + * base split up is mentioned below with a few deviations * - * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2 + * lambda h, w, i, o: o//32, i//32, h, w, (i%32)//4, o%32, i%4 * - * where AXIS_SEPARATOR represents split up in the physical layout + * The deviations are: + * - w is actually stored in the right to left order, as in 3,2,1,0 instead of 0,1,2,3 * - * @param out Pre-allocated output memory pointer - * @param inp_flat Flat input data pointer - * @param height - * @param width - * @param depth - */ -void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth) { - auto inp_data = static_cast(inp_flat); - auto out_data = static_cast(out); - const int stride_x = depth; - const int stride_y = stride_x * width; - - for (int cy = 0; cy < height; cy += 8) { - for (int cx = 0; cx < width; cx += 4) { - for (int cc = 0; cc < depth; cc += 32) { - auto block = reinterpret_cast(*out_data++); - int max_y = std::min(8, height - cy); - int max_x = std::min(4, width - cx); - int max_c = std::min(32, depth - cc); - for (int y = 0; y < max_y; ++y) { - for (int x = 0; x < max_x; ++x) { - for (int c = 0; c < max_c; ++c) { - block[xyc_to_sm_16b(y, x, c)] = - inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)]; - } - for (int c = max_c; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; - } - for (int x = max_x; x < 4; ++x) { - for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; - } - } - - for (int y = max_y; y < 8; ++y) - for (int x = 0; x < 4; ++x) - for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; - } // cc - } // cx - } // cy -} - -/** - * @brief Convert back from non-contguous layout to a flat layout - * - * @param out_flat Pre-allocated output memory pointer - * @param inp Blockized input data pointer + * @param out_ptr Base pointer table to be filled with the list of pointers to the first addresses + * of the "chunked" weights + * @param out_ptr_size The number of chunks + * @param out Pointer to pre-allocated output memory + * @param inp Pointer to flat input data * @param height * @param width - * @param depth + * @param idepth + * @param odepth */ -void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth) { - uintptr_t* inp_data = static_cast(inp); - uint16_t* out_data = static_cast(out_flat); - const int stride_x = depth; +void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width, + int idepth, int odepth, int wgt_zp) { + auto inp_data = static_cast(inp); + auto out_data = static_cast(out); + const int stride_i = odepth; + const int stride_x = stride_i * idepth; const int stride_y = stride_x * width; - for (int cy = 0; cy < height; cy += 8) { - for (int cx = 0; cx < width; cx += 4) { - for (int cc = 0; cc < depth; cc += 32) { - auto block = reinterpret_cast(*inp_data); - int max_y = std::min(8, height - cy); - int max_x = std::min(4, width - cx); - int max_c = std::min(32, depth - cc); - for (int y = 0; y < max_y; ++y) { - for (int x = 0; x < max_x; ++x) { - for (int c = 0; c < max_c; ++c) { - out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] = - block[xyc_to_sm_16b(y, x, c)]; + for (int ci = 0; ci < 
idepth; ci += 32) { + for (int co = 0; co < odepth; co += 32) { + int max_i = std::min(32, idepth - ci); + int max_o = std::min(32, odepth - co); + + auto chunk = out_data; + for (int y = 0; y < height; ++y) { + for (int x = width - 1; x >= 0; --x) { + for (int i = 0; i < max_i; ++i) { + for (int o = 0; o < max_o; ++o) { + chunk[hwio_to_sm_8b(width, y, x, i, o)] = + inp_data[y * stride_y + x * stride_x + (ci + i) * stride_i + (co + o)]; } + for (int o = max_o; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp; } + for (int i = max_i; i < 32; ++i) + for (int o = 0; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp; } - - inp_data++; } + + *out_ptr++ = chunk; + out_data += height * width * 32 * 32; + out_ptr_size--; + assert(out_ptr_size >= 0); } } } @@ -184,25 +157,27 @@ void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, i } } -SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat, - bool copy_data) { +std::tuple getHWIO(const DLTensor* hwio_flat) { + int h = hwio_flat->shape[0]; + int w = hwio_flat->shape[1]; + int i = round_up(hwio_flat->shape[2], 32); + int o = round_up(hwio_flat->shape[3], 32); + return std::make_tuple(h, w, i, o); +} + +SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, + int num_chunks, void** ptr_table, int wgt_zp) { tvm::runtime::String vtcm_scope = "global.vtcm"; - // Allocate blocks for activations. We will use the block pointers - // directly from the allocated area. - int n = nhwc_flat->shape[0]; - int h = round_up(nhwc_flat->shape[1], 8); - int w = round_up(nhwc_flat->shape[2], 4); - int c = round_up(nhwc_flat->shape[3], 32); - int64_t shape_2d[2] = {(n * h * w * c) / (8 * 4 * 32), 8 * 4 * 32}; - void* nhwc_vtcm = - device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope); - if (copy_data) { - blockize_hwc_16b(nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], - nhwc_flat->shape[3]); - } + auto [h, w, i, o] = getHWIO(hwio_flat); + int64_t shape_1d[] = {h * w * i * o}; + void* hwio_vtcm = + device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope); - return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm, {n, h / 8, w / 4, c / 32}); + chunkify_hwio_8b(ptr_table, num_chunks, hwio_vtcm, hwio_flat->data, hwio_flat->shape[0], + hwio_flat->shape[1], hwio_flat->shape[2], hwio_flat->shape[3], wgt_zp); + + return SDLTensor<4>(ptr_table, hwio_flat->dtype, hwio_vtcm, {1, 1, i / 32, o / 32}); } SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, @@ -214,10 +189,7 @@ SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* h // height- or width-wise, so filter chunks may have different sizes. // A filter chunk is a block of size HxWx32x32, where H, W are at most // height and width of a block respectively. 
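  // (The 8-bit path above, by contrast, keeps the full HxW extent in a single
  // chunk per 32x32 (i, o) channel pair, so prepare_hwio_8b returns a
  // {1, 1, i/32, o/32} pointer table.)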
- int h = hwio_flat->shape[0]; - int w = hwio_flat->shape[1]; - int i = round_up(hwio_flat->shape[2], 32); - int o = round_up(hwio_flat->shape[3], 32); + auto [h, w, i, o] = getHWIO(hwio_flat); int64_t shape_1d[] = {h * w * i * o}; void* hwio_vtcm = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope); @@ -229,15 +201,19 @@ SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* h {round_up(h, 8) / 8, round_up(w, 4) / 4, i / 32, o / 32}); } -int calculate_num_weight_chunks(int64_t* shape_hwio) { - int h = round_up(shape_hwio[0], 8); - int w = round_up(shape_hwio[1], 4); - int i = round_up(shape_hwio[2], 32); - int o = round_up(shape_hwio[3], 32); +int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width, + int chunk_in_channel, int chunk_out_channel) { + // Define slower roundup that doesn't assume multiplier 'p' to be power of 2 + auto roundup = [](int v, int p) { return (v + p - 1) - ((v + p - 1) % p); }; + int h = roundup(shape_hwio[0], chunk_height); + int w = roundup(shape_hwio[1], chunk_width); + int i = roundup(shape_hwio[2], chunk_in_channel); + int o = roundup(shape_hwio[3], chunk_out_channel); - return (h * w * i * o) / (8 * 4 * 32 * 32); + return (h * w * i * o) / (chunk_height * chunk_width * chunk_in_channel * chunk_out_channel); } +} // namespace conv_utils } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h new file mode 100644 index 000000000000..07e15966863e --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef TVM_TESTS_CPPRUNTIME_HEXAGON_HEXAGON_CONV_UTILS_H +#define TVM_TESTS_CPPRUNTIME_HEXAGON_HEXAGON_CONV_UTILS_H + +#include +#include + +#include + +#include "conv2d.h" + +using namespace tvm::runtime::hexagon::conv_utils; + +template +class HexagonUtilsTest : public ::testing::Test { + public: + void SetUp() override { + vtcm_scope = "global.vtcm"; + device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false); + float16.code = kDLFloat; + float16.bits = 16; + float16.lanes = 1; + + uint8.code = kDLUInt; + uint8.bits = 8; + uint8.lanes = 1; + + int8.code = kDLInt; + int8.bits = 8; + int8.lanes = 1; + } + + void setupTensor(std::tuple shape, DLDataType dtype) { + auto [s1, s2, s3, s4] = shape; + tensor_shape[0] = s1; + tensor_shape[1] = s2; + tensor_shape[2] = s3; + tensor_shape[3] = s4; + int64_t shape_1d[1] = {s1 * s2 * s3 * s4}; + + flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, dtype, vtcm_scope); + flat_mem_data = static_cast(flat_mem); + fill_vals(flat_mem_data, shape_1d[0]); + + flat_tensor.data = flat_mem; + flat_tensor.device = hexagon_device; + flat_tensor.ndim = 4; + flat_tensor.dtype = dtype; + flat_tensor.shape = tensor_shape; + flat_tensor.strides = nullptr; + flat_tensor.byte_offset = 0; + } + + void TearDownTensor() { + if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem); + } + + static void fill_vals(T* arr, int size) { + // Testing with uint16 instead of float16 as generating random float16 is not easy within c++ + auto max = std::numeric_limits::max(); + srand(std::time(0)); + for (int i = 0; i < size; ++i) { + arr[i] = static_cast(std::rand() % max); + } + } + + static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) { + int h = shape[1]; + int w = shape[2]; + int c = shape[3]; + return cc + c * (ww + w * (hh + h * (nn))); + } + + DLTensor flat_tensor; + void* flat_mem; + T* flat_mem_data; + tvm::runtime::DeviceAPI* device_api; + tvm::runtime::String vtcm_scope; + DLDataType float16; + DLDataType int8, uint8; + int64_t tensor_shape[4]; +}; + +#endif diff --git a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc index 3b922fa6c2a8..5f2ef490d020 100644 --- a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc @@ -26,73 +26,14 @@ #include #include -#include "tvm/runtime/hexagon/ops/conv2d.h" +#include "conv2d.h" +#include "hexagon_conv_utils_test.h" using namespace tvm::runtime::hexagon; -class HexagonUtilsTest : public ::testing::Test { - public: - void SetUp() override { - vtcm_scope = "global.vtcm"; - device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false); - float16.code = kDLFloat; - float16.bits = 16; - float16.lanes = 1; - } - - void setupTensor(std::tuple shape) { - auto [s1, s2, s3, s4] = shape; - tensor_shape[0] = s1; - tensor_shape[1] = s2; - tensor_shape[2] = s3; - tensor_shape[3] = s4; - int64_t shape_1d[1] = {s1 * s2 * s3 * s4}; - - flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope); - flat_mem_data = static_cast(flat_mem); - fill_vals(flat_mem_data, shape_1d[0]); - - flat_tensor.data = flat_mem; - flat_tensor.device = hexagon_device; - flat_tensor.ndim = 4; - flat_tensor.dtype = float16; - flat_tensor.shape = tensor_shape; - flat_tensor.strides = nullptr; - flat_tensor.byte_offset = 0; - } - - void TearDownTensor() { - if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem); - } - - static void 
fill_vals(uint16_t* arr, int size) { - // Testing with uint16 instead of float16 as generating random float16 is not easy within c++ - uint16_t max = UINT16_MAX; - srand(std::time(0)); - for (int i = 0; i < size; ++i) { - arr[i] = static_cast(std::rand() % max); - } - } - - static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) { - int h = shape[1]; - int w = shape[2]; - int c = shape[3]; - return cc + c * (ww + w * (hh + h * (nn))); - } - - DLTensor flat_tensor; - void* flat_mem; - uint16_t* flat_mem_data; - tvm::runtime::DeviceAPI* device_api; - tvm::runtime::String vtcm_scope; - DLDataType float16; - int64_t tensor_shape[4]; -}; - // Parameterized test fixture with 4 params representing n, h, w, c class HexagonUtilsActivationsBlockizeTest - : public HexagonUtilsTest, + : public HexagonUtilsTest, public ::testing::WithParamInterface, std::tuple>> {}; @@ -122,11 +63,12 @@ INSTANTIATE_TEST_SUITE_P( TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) { auto shape = std::make_tuple(1, 14, 7, 60); auto [n, h, w, c] = shape; - setupTensor(shape); + setupTensor(shape, float16); // // copy_data is set to false here as there's a separate test for blockize when copy_data // becomes true - auto blocked_tensor = prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); + auto blocked_tensor = + prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); EXPECT_EQ(blocked_tensor.shape[0], n); EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8); @@ -139,7 +81,7 @@ TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) { TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { auto shape_tuple = std::get<0>(GetParam()); - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); auto [n, h, w, c] = shape_tuple; int64_t shape[] = {n, h, w, c}; @@ -150,7 +92,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope); int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 4, c_rounded / 32}; - blockize_hwc_16b(blocked_mem, flat_mem, h, w, c); + blockize_hwc(blocked_mem, flat_mem, h, w, c); std::function flatten = HexagonUtilsActivationsBlockizeTest::flattened_idx; @@ -159,7 +101,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { auto* blocks = static_cast(blocked_mem); int blockIdx = flatten(nn, hh / 8, ww / 4, cc / 32, blocked_shape); uint16_t* block = reinterpret_cast(blocks[blockIdx]); - return block[xyc_to_sm_16b(hh % 8, ww % 4, cc % 32)]; + return block[yxc_to_sm_16b(hh % 8, ww % 4, cc % 32)]; }; auto [nn, hh, ww, cc] = std::get<1>(GetParam()); @@ -172,7 +114,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { auto shape_tuple = std::get<0>(GetParam()); - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); auto [n, h, w, c] = shape_tuple; int64_t shape[] = {n, h, w, c}; int64_t shape_1d[1] = {n * h * w * c}; @@ -183,11 +125,11 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 4 * 32), 8 * 4 * 32}; void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope); - blockize_hwc_16b(blocked_mem, flat_mem, h, w, c); + blockize_hwc(blocked_mem, flat_mem, h, w, c); void* deblocked_flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope); - deblockize_hwc_16b(deblocked_flat_mem, blocked_mem, h, 
w, c); + deblockize_hwc(deblocked_flat_mem, blocked_mem, h, w, c); auto* deblocked_flat_mem_data = static_cast(deblocked_flat_mem); auto [nn, hh, ww, cc] = std::get<1>(GetParam()); @@ -201,7 +143,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { } class HexagonUtilsWeightsChunkifyTest - : public HexagonUtilsTest, + : public HexagonUtilsTest, public ::testing::WithParamInterface, std::tuple>> {}; @@ -231,7 +173,9 @@ INSTANTIATE_TEST_SUITE_P( TEST_F(HexagonUtilsWeightsChunkifyTest, calculate_num_weight_chunks) { int64_t shape[] = {3, 3, 40, 40}; - int num_wgt_chunks = calculate_num_weight_chunks(shape); + int num_wgt_chunks = + calculate_num_weight_chunks(shape, /* chunk_height */ 8, /* chunk_width */ 4, + /* chunk_in_channel */ 32, /* chunk_out_channel */ 32); EXPECT_EQ(num_wgt_chunks, 4); } @@ -239,11 +183,11 @@ TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) { int64_t shape[] = {3, 3, 40, 40}; auto [h, w, i, o] = shape; auto shape_tuple = std::make_tuple(h, w, i, o); - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); // copy_data is set to false here as there's a separate test for blockize when copy_data becomes // true - auto num_wgt_chunks = calculate_num_weight_chunks(shape); + auto num_wgt_chunks = calculate_num_weight_chunks(shape, 8, 4, 32, 32); auto wgt_ptr_table = reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); @@ -260,10 +204,10 @@ TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) { TEST_P(HexagonUtilsWeightsChunkifyTest, chunkify_hwio_16b) { auto [shape_tuple, indices] = GetParam(); auto [h, w, i, o] = shape_tuple; - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); int64_t shape[] = {h, w, i, o}; - auto num_wgt_chunks = calculate_num_weight_chunks(shape); + auto num_wgt_chunks = calculate_num_weight_chunks(shape, 8, 4, 32, 32); auto wgt_ptr_table = reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); diff --git a/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc new file mode 100644 index 000000000000..449c69736050 --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "conv2d.h" +#include "hexagon_conv_utils_test.h" + +using namespace tvm::runtime::hexagon::conv_utils; + +// Parameterized test fixture with 4 params representing n, h, w, c +class HexagonUtilsQuantActivationsBlockizeTest + : public HexagonUtilsTest, + public ::testing::WithParamInterface, std::tuple>> {}; + +// TODO (quic-sanirudh): See if we can test with random generated indices +INSTANTIATE_TEST_SUITE_P( + BlockizeDeblockizeTestFixtures, HexagonUtilsQuantActivationsBlockizeTest, + ::testing::Combine(::testing::Values(std::make_tuple(1, 14, 7, 60)), + ::testing::Values(std::make_tuple(0, 0, 0, 0), // first element + std::make_tuple(0, 7, 3, 31), // last element + // Remaining are random element tests + std::make_tuple(0, 13, 6, 59), + std::make_tuple(0, 0, 0, 32), std::make_tuple(0, 0, 4, 32), + std::make_tuple(0, 2, 3, 4), std::make_tuple(0, 5, 6, 7), + std::make_tuple(0, 10, 4, 12))), + [](const ::testing::TestParamInfo& info) { + // Can use info.param here to generate the test suffix + auto indices = std::get<1>(info.param); + int h = std::get<1>(indices); + int w = std::get<2>(indices); + int c = std::get<3>(indices); + // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0 + std::string name = + "hwc" + std::to_string(h) + "x" + std::to_string(w) + "x" + std::to_string(c); + return name; + }); + +TEST_F(HexagonUtilsQuantActivationsBlockizeTest, prepare_nhwc) { + auto shape = std::make_tuple(1, 14, 7, 60); + auto [n, h, w, c] = shape; + setupTensor(shape, uint8); + + // // copy_data is set to false here as there's a separate test for blockize when copy_data + // becomes true + auto blocked_tensor = + prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); + + EXPECT_EQ(blocked_tensor.shape[0], n); + EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8); + EXPECT_EQ(blocked_tensor.shape[2], round_up(w, 8) / 8); + EXPECT_EQ(blocked_tensor.shape[3], round_up(c, 32) / 32); + + TearDownTensor(); + release(device_api, blocked_tensor); +} + +TEST_P(HexagonUtilsQuantActivationsBlockizeTest, blockize_hwc_8b) { + auto shape_tuple = std::get<0>(GetParam()); + setupTensor(shape_tuple, uint8); + auto [n, h, w, c] = shape_tuple; + int64_t shape[] = {n, h, w, c}; + + int h_rounded = round_up(h, 8); + int w_rounded = round_up(w, 8); + int c_rounded = round_up(c, 32); + int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 8 * 32), 8 * 8 * 32}; + + void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, uint8, vtcm_scope); + int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 8, c_rounded / 32}; + blockize_hwc(blocked_mem, flat_mem, h, w, c); + + std::function flatten = + HexagonUtilsQuantActivationsBlockizeTest::flattened_idx; + + auto getBlockedElem = [&blocked_shape, blocked_mem, flatten](int nn, int hh, int ww, int cc) { + auto* blocks = static_cast(blocked_mem); + int blockIdx = flatten(nn, hh / 8, ww / 8, cc / 32, blocked_shape); + uint8_t* block = reinterpret_cast(blocks[blockIdx]); + return block[yxc_to_sm_8b(hh % 8, ww % 8, cc % 32)]; + }; + + auto [nn, hh, ww, cc] = std::get<1>(GetParam()); + + EXPECT_EQ(flat_mem_data[flattened_idx(nn, hh, ww, cc, shape)], getBlockedElem(nn, hh, ww, cc)); + + TearDownTensor(); + device_api->FreeDataSpace(hexagon_device, blocked_mem); +} + +TEST_P(HexagonUtilsQuantActivationsBlockizeTest, deblockize_hwc_8b) { + auto shape_tuple = std::get<0>(GetParam()); + setupTensor(shape_tuple, uint8); + auto [n, h, w, c] 
= shape_tuple; + int64_t shape[] = {n, h, w, c}; + int64_t shape_1d[1] = {n * h * w * c}; + + int h_rounded = round_up(h, 8); + int w_rounded = round_up(w, 8); + int c_rounded = round_up(c, 32); + int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 8 * 32), 8 * 8 * 32}; + + void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, uint8, vtcm_scope); + blockize_hwc(blocked_mem, flat_mem, h, w, c); + + void* deblocked_flat_mem = + device_api->AllocDataSpace(hexagon_device, 1, shape_1d, uint8, vtcm_scope); + deblockize_hwc(deblocked_flat_mem, blocked_mem, h, w, c); + auto* deblocked_flat_mem_data = static_cast(deblocked_flat_mem); + + auto [nn, hh, ww, cc] = std::get<1>(GetParam()); + + auto idx = flattened_idx(nn, hh, ww, cc, shape); + EXPECT_EQ(flat_mem_data[idx], deblocked_flat_mem_data[idx]); + + TearDownTensor(); + device_api->FreeDataSpace(hexagon_device, blocked_mem); + device_api->FreeDataSpace(hexagon_device, deblocked_flat_mem); +} + +class HexagonUtilsQuantWeightsChunkifyTest + : public HexagonUtilsTest, + public ::testing::WithParamInterface, std::tuple>> {}; + +INSTANTIATE_TEST_SUITE_P( + ChunkifyDechunkifyTests, HexagonUtilsQuantWeightsChunkifyTest, + ::testing::Combine(::testing::Values(std::make_tuple(3, 3, 40, 40)), + ::testing::Values(std::make_tuple(0, 0, 0, 0), // first element + std::make_tuple(2, 2, 39, 39), // Last element + // Remaining are random element tests + std::make_tuple(1, 1, 28, 33), + std::make_tuple(1, 2, 8, 38), + std::make_tuple(1, 0, 12, 15), + std::make_tuple(2, 1, 9, 22), std::make_tuple(0, 2, 6, 7), + std::make_tuple(1, 2, 3, 4))), + [](const ::testing::TestParamInfo& info) { + // Can use info.param here to generate the test suffix + auto indices = std::get<1>(info.param); + int h = std::get<0>(indices); + int w = std::get<1>(indices); + int i = std::get<2>(indices); + int o = std::get<3>(indices); + // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0 + std::string name = "hwio" + std::to_string(h) + std::to_string(w) + "x" + std::to_string(i) + + "x" + std::to_string(o); + return name; + }); + +TEST_F(HexagonUtilsQuantWeightsChunkifyTest, calculate_num_weight_chunks) { + int64_t shape[] = {3, 3, 40, 40}; + int num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32); + EXPECT_EQ(num_wgt_chunks, 4); +} + +TEST_F(HexagonUtilsQuantWeightsChunkifyTest, prepare_hwio) { + int64_t shape[] = {3, 3, 40, 40}; + auto [h, w, i, o] = shape; + auto shape_tuple = std::make_tuple(h, w, i, o); + setupTensor(shape_tuple, int8); + + // copy_data is set to false here as there's a separate test for blockize when copy_data becomes + // true + auto num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32); + auto wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + auto chunked_tensor = prepare_hwio_8b(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); + + EXPECT_EQ(chunked_tensor.shape[0], 1); + EXPECT_EQ(chunked_tensor.shape[1], 1); + EXPECT_EQ(chunked_tensor.shape[2], round_up(i, 32) / 32); + EXPECT_EQ(chunked_tensor.shape[3], round_up(o, 32) / 32); + + release(device_api, chunked_tensor); + TearDownTensor(); +} + +TEST_P(HexagonUtilsQuantWeightsChunkifyTest, chunkify_hwio_8b) { + auto [shape_tuple, indices] = GetParam(); + auto [h, w, i, o] = shape_tuple; + setupTensor(shape_tuple, int8); + int64_t shape[] = {h, w, i, o}; + + auto num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32); + auto 
wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + auto chunked_tensor = prepare_hwio_8b(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); + + auto getChunkedElem = [width = w, chunked_tensor](int hh, int ww, int ii, int oo) { + auto data = static_cast(chunked_tensor.data); + auto chunk = data[ii / 32 * chunked_tensor.shape[3] + oo / 32]; + auto chunk_int8 = reinterpret_cast(chunk); + return chunk_int8[hwio_to_sm_8b(width, hh, ww, ii % 32, oo % 32)]; + }; + + auto [hh, ww, ii, oo] = indices; + + EXPECT_EQ(flat_mem_data[flattened_idx(hh, ww, ii, oo, shape)], getChunkedElem(hh, ww, ii, oo)); + release(device_api, chunked_tensor); +} diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index c03701f83ccc..5b13513c0fb3 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -320,7 +320,7 @@ def quantize_np(arr_np: numpy.ndarray, dtype: str): scale = (fmax - fmin) / (qmax - qmin) zero_point = numpy.rint((fmax * qmin - fmin * qmax) / (fmax - fmin)).astype("int32") - quant_np = (arr_np / scale + zero_point).astype(dtype) + quant_np = numpy.clip(((arr_np / scale).round() + zero_point), qmin, qmax).astype(dtype) return quant_np, scale, zero_point diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py new file mode 100644 index 000000000000..c26e6142ba5c --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py @@ -0,0 +1,261 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
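The test module below hands conv2d_packed_quant a single fixed-point multiplier (fixed_final_scale, scale_factor) derived from the activation, weight, and output scales. As a rough NumPy reference for the requantization arithmetic being exercised -- a sketch only, not the HVX code path; the helper names are illustrative, it assumes the usual zero-point-corrected int32 accumulation, and it assumes get_fixed_point_value returns a (value, shift) pair with value * 2**-shift approximating the float scale:

    import numpy as np

    def conv_accumulator(act_q_patch, wgt_q_taps, act_zp, wgt_zp):
        """Zero-point-corrected int32 accumulation over one (kh, kw, ic) window."""
        return np.sum(
            (act_q_patch.astype(np.int32) - act_zp) * (wgt_q_taps.astype(np.int32) - wgt_zp)
        )

    def requantize_accumulator(acc_i32, fixed_final_scale, scale_factor, out_zp):
        """Model the final requantization of one int32 conv accumulator to uint8."""
        # final_scale ~= act_scale * wgt_scale / out_scale, carried as a fixed-point pair
        final_scale = fixed_final_scale * (2.0 ** -scale_factor)
        return np.uint8(np.clip(np.rint(acc_i32 * final_scale) + out_zp, 0, 255))

Under that model the intrinsic's uint8 output should track the reference output quantized by the clipped, rounded quantize_np above to within a couple of quantization steps, which is why the test below compares with rtol=0, atol=2.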
+ +""" Test quantized conv2d HVX intrinsic implementation""" + +import numpy as np + +import tvm +import tvm.contrib.hexagon +from tvm.topi.hexagon.utils import get_fixed_point_value +from tvm.topi.testing import conv2d_nhwc_python + +from ..infrastructure import get_hexagon_target, quantize_np + + +def build_conv2d(target): + """Build and return the conv2d IRModule that calls the intrinsic implementation""" + act_n, act_h, act_w, act_c = ( + tvm.te.var("an"), + tvm.te.var("ah"), + tvm.te.var("aw"), + tvm.te.var("ac"), + ) + filt_h, filt_w, filt_o = tvm.te.var("filt_h"), tvm.te.var("filt_w"), tvm.te.var("filt_o") + act_scale, act_zp = tvm.te.var("act_scale", dtype="float32"), tvm.te.var("act_zp") + wgt_scale, wgt_zp = tvm.te.var("wgt_scale", dtype="float32"), tvm.te.var("wgt_zp") + out_scale, out_zp = tvm.te.var("out_scale", dtype="float32"), tvm.te.var("out_zp") + fixed_final_scale, scale_factor = tvm.te.var("fixed_final_scale", dtype="int32"), tvm.te.var( + "scale_factor" + ) + stride_h, stride_w = tvm.te.var("stride_h"), tvm.te.var("stride_w") + + act_flat = tvm.te.placeholder( + shape=(act_n, act_h, act_w, act_c), dtype="uint8", name="act_flat" + ) + wgt_flat = tvm.te.placeholder( + shape=(filt_h, filt_w, act_c, filt_o), dtype="int8", name="wgt_flat" + ) + + out_flat = tvm.te.extern( + shape=(act_n, (act_h - filt_h) // stride_h + 1, (act_w - filt_w) // stride_w + 1, filt_o), + inputs=[act_flat, wgt_flat], + fcompute=lambda ins, outs: tvm.tir.call_cpacked( + "conv2d_packed_quant", # Function from TVM runtime + ins[0], + ins[1], + act_scale, + act_zp, + wgt_scale, + wgt_zp, + out_scale, + out_zp, + stride_h, + stride_w, + fixed_final_scale, + scale_factor, + outs[0], + tvm.runtime.const(0), # resource_handle (unused) + ), + dtype="uint8", + ) + + s = tvm.te.create_schedule(out_flat.op) + + func_name = "conv2d_quant_hvx" + module = tvm.build( + s, + [ + act_flat, + wgt_flat, + act_scale, + act_zp, + wgt_scale, + wgt_zp, + out_scale, + out_zp, + stride_h, + stride_w, + fixed_final_scale, + scale_factor, + out_flat, + ], + target=target, + name=func_name, + ) + + return module + + +def gen_config(params): + """Utility function to generate useful ids for shape_parameters""" + + dims = lambda vals: "x".join(map(str, vals)) + + config = {} + for param in params: + act_shape, wgt_shape, inp_stride = param + name = f"nhwc{dims(act_shape)}-hwio{dims(wgt_shape)}-stride{dims(inp_stride)}" + config[name] = param + + return config + + +class TestQuantConv2dIntrin: + """Test Quantized Conv2d Intrin class""" + + shape_parameters = [ + [ + (1, 5, 5, 33), + (3, 3, 33, 33), + (1, 1), + ], + [ + (1, 9, 8, 64), + (3, 3, 64, 64), + (1, 1), + ], + [ + (1, 11, 16, 64), + (3, 3, 64, 32), + (1, 1), + ], + [ + (1, 24, 8, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 4, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 5, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 6, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 7, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 8, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 9, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 10, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 11, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 4, 5), + (3, 3, 5, 3), + (1, 1), + ], + ] + + config = gen_config(shape_parameters) + act_shape, wgt_shape, inp_stride = tvm.testing.parameters(*config.values(), ids=config.keys()) + inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"]) + + @tvm.testing.requires_hexagon + def test_conv2d_quant(self, act_shape, wgt_shape, inp_stride, 
hexagon_session): + """Test quantized conv2d intrinsic implementation""" + assert act_shape[3] == wgt_shape[2] + + # Currently, input offset does not affect the output shape + def get_out_shape(ash, wsh, inp_stride): + assert ash[3] == wsh[2] + osh = ( + ash[0], + (ash[1] - wsh[0]) // inp_stride[0] + 1, + (ash[2] - wsh[1]) // inp_stride[1] + 1, + wsh[3], + ) + assert all(x > 0 for x in osh) + return osh + + act_f = np.random.uniform(-1.5, 1.0, size=act_shape).astype("float32") + wgt_f = np.random.uniform(-1.5, 1.0, size=wgt_shape).astype("float32") + + # Quantize activations with quantize_np + act_q, act_scale, act_zp = quantize_np(act_f, dtype="uint8") + act_q = act_q.reshape(act_f.shape) + + # Quantize weights with quantize_np + wgt_q, wgt_scale, wgt_zp = quantize_np(wgt_f, dtype="int8") + wgt_q = wgt_q.reshape(wgt_f.shape) + + # Generate reference output + ref_out = conv2d_nhwc_python(act_f, wgt_f, stride=inp_stride, padding="VALID") + + ref_out_q, out_scale, out_zp = quantize_np(ref_out, dtype="uint8") + ref_out_q = ref_out_q.reshape(ref_out.shape) + + final_scale = act_scale * wgt_scale / out_scale + fixed_final_scale, scale_factor = get_fixed_point_value(final_scale) + + module = build_conv2d(get_hexagon_target("v69")) + mod = hexagon_session.load_module(module) + + output_shape = get_out_shape(act_shape, wgt_shape, inp_stride) + + output = tvm.nd.array( + np.zeros(output_shape, dtype="uint8"), + device=hexagon_session.device, + ) + mod( + tvm.nd.array(act_q, device=hexagon_session.device), + tvm.nd.array(wgt_q, device=hexagon_session.device), + act_scale, + act_zp, + wgt_scale, + wgt_zp, + out_scale, + out_zp, + inp_stride[0], # stride_height + inp_stride[1], # stride_width + fixed_final_scale, + scale_factor, + output, + ) + + out_q = output.numpy() + + tvm.testing.assert_allclose(out_q, ref_out_q, rtol=0, atol=2) + + +if __name__ == "__main__": + tvm.testing.main() From 71a89934c4df0fa9f14fcffe8a9ea7d4c73d4a48 Mon Sep 17 00:00:00 2001 From: Anirudh Sundar Date: Tue, 29 Nov 2022 22:26:01 +0530 Subject: [PATCH 2/2] Remove inline keywords and add debug asserts --- src/runtime/hexagon/ops/conv2d.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/runtime/hexagon/ops/conv2d.h b/src/runtime/hexagon/ops/conv2d.h index 3501441f0a8c..76c6cccff73d 100644 --- a/src/runtime/hexagon/ops/conv2d.h +++ b/src/runtime/hexagon/ops/conv2d.h @@ -77,42 +77,49 @@ inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); } inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); } -inline constexpr int yxc_to_sm_16b(int y, int x, int c) { +constexpr int yxc_to_sm_16b(int y, int x, int c) { // Map y,x,c coordinates within a block to the offset (in 16-bit elements) // from the beginning of the block in spatial-major layout. // 10-bit spatial mask: yyyxcccccx assert(y >= 0 && x >= 0 && c >= 0); + assert(y < 8 && x < 4 && c < 32); return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); } -inline constexpr int yxc_to_sm_8b(int y, int x, int c) { +constexpr int yxc_to_sm_8b(int y, int x, int c) { // Map y,x,c coordinates within a block to the offset (in 8-bit elements) // from the beginning of the block in spatial-major layout.
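// As a concrete spot-check of this mapping: within one 8x8x32-byte block, the
// element at y = 2, x = 5, c = 7 lands at offset
// (2 << 8) | (5 << 5) | 7 = 512 + 160 + 7 = 679 (a worked example of the
// formula returned just below).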
// 10-bit spatial mask: yyyxxxccccc + assert(y >= 0 && x >= 0 && c >= 0); + assert(y < 8 && x < 8 && c < 32); return y << 8 | x << 5 | c; } -inline constexpr int hwio_to_sm_8b(int width, int y, int x, int i, int o) { +constexpr int hwio_to_sm_8b(int width, int y, int x, int i, int o) { // Map y,x,i,o coordinates within a chunk (assuming the origin at the // top-left spatial corner) to the offset (in 8-bit elements) from the // beginning of the chunk in spatial-major layout. // Spatial mask: p..piiioooooii, where p..p are position bits. + assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + assert(i < 32 && o < 32); int p = y * width + (width - 1 - x); return p << 10 | (i & 0x1c) << 5 | o << 2 | (i & 3); } -inline constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { +constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { // Map y,x,i,o coordinates within a chunk (assuming the origin at the // top-left spatial corner) to the offset (in 16-bit elements) from the // beginning of the chunk in spatial-major layout. // Spatial mask: p..piiiioooooi, where p..p are position bits. assert(width >= 1); assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + assert(i < 32 && o < 32); int p = y * width + (width - 1 - x); return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); } -inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } +constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } // Returns the block address at the given index // Assumptions