diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index 31cece8a19e0..887211893558 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -178,6 +178,15 @@ if(BUILD_FOR_HEXAGON) "${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc" ) + include_directories( + "${TVMRT_SOURCE_DIR}/hexagon/ops" + ) + + set_source_files_properties( + "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_quant_hvx.cc" + PROPERTIES COMPILE_FLAGS "-mhvx" + ) + set_source_files_properties( "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc" PROPERTIES COMPILE_FLAGS "-mhvx" diff --git a/include/tvm/runtime/hexagon/ops/conv2d.h b/src/runtime/hexagon/ops/conv2d.h similarity index 54% rename from include/tvm/runtime/hexagon/ops/conv2d.h rename to src/runtime/hexagon/ops/conv2d.h index d759149727e8..76c6cccff73d 100644 --- a/include/tvm/runtime/hexagon/ops/conv2d.h +++ b/src/runtime/hexagon/ops/conv2d.h @@ -20,6 +20,7 @@ #include #include +#include #include #ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ @@ -28,6 +29,7 @@ namespace tvm { namespace runtime { namespace hexagon { +namespace conv_utils { static constexpr auto hexagon_device = DLDevice{static_cast(kDLHexagon), 0}; // Standalone DLTensor: the standalone-ness means that this object owns the shape @@ -75,14 +77,36 @@ inline void* to_ptr(uintptr_t v) { return reinterpret_cast(v); } inline uintptr_t to_uint(void* ptr) { return reinterpret_cast(ptr); } -constexpr int xyc_to_sm_16b(int y, int x, int c) { +constexpr int yxc_to_sm_16b(int y, int x, int c) { // Map y,x,c coordinates within a block to the offset (in 16-bit elements) // from the beginning of the block in spatial-major layout. // 10-bit spatial mask: yyyxcccccx assert(y >= 0 && x >= 0 && c >= 0); + assert(y < 8 && x < 4 && c < 32); return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); } +constexpr int yxc_to_sm_8b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 8-bit elements) + // from the beginning of the block in spatial-major layout. + // 10-bit spatial mask: yyyxxxccccc + assert(y >= 0 && x >= 0 && c >= 0); + assert(y < 8 && x < 8 && c < 32); + return y << 8 | x << 5 | c; +} + +constexpr int hwio_to_sm_8b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 8-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiioooooii, where p..p are position bits. + assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + assert(i < 32 && o < 32); + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1c) << 5 | o << 2 | (i & 3); +} + constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { // Map y,x,i,o coordinates within a chunk (assuming the origin at the // top-left spatial corner) to the offset (in 16-bit elements) from the @@ -90,11 +114,12 @@ constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { // Spatial mask: p..piiiioooooi, where p..p are position bits. 
assert(width >= 1); assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + assert(i < 32 && o < 32); int p = y * width + (width - 1 - x); return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); } -inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } +constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } // Returns the block address at the given index // Assumptions @@ -123,6 +148,10 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { * The input is mapped into the below mentioned layout (notation similar to index map used for * transform layout): * + * For uint8_t type + * lambda n, h, w, c: n, h//8, w//8, c//32, AXIS_SEPARATOR, h%8, w%8, c%32 + * + * For uint16_t type * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2 * * where AXIS_SEPARATOR represents split up in the physical layout @@ -133,7 +162,48 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { * @param width * @param depth */ -void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth); +template +void blockize_hwc(void* out, void* inp_flat, int height, int width, int depth) { + int (*index_func)(int, int, int); + if constexpr (std::is_same_v) + index_func = yxc_to_sm_8b; + else if constexpr (std::is_same_v) + index_func = yxc_to_sm_16b; + else + LOG_ERROR << "blockize_hwc is only supported for uint8_t and uint16_t types"; + + auto inp_data = static_cast(inp_flat); + auto out_data = static_cast(out); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += block_height) { + for (int cx = 0; cx < width; cx += block_width) { + for (int cc = 0; cc < depth; cc += block_depth) { + auto block = reinterpret_cast(*out_data++); + int max_y = std::min(block_height, height - cy); + int max_x = std::min(block_width, width - cx); + int max_c = std::min(block_depth, depth - cc); + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + block[index_func(y, x, c)] = + inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)]; + } + for (int c = max_c; c < block_depth; ++c) block[index_func(y, x, c)] = 0; + } + for (int x = max_x; x < block_width; ++x) { + for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0; + } + } + + for (int y = max_y; y < block_height; ++y) + for (int x = 0; x < block_width; ++x) + for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0; + } // cc + } // cx + } // cy +} /** * @brief Convert back from non-contguous layout to a flat layout @@ -144,7 +214,42 @@ void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int dept * @param width * @param depth */ -void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth); +template +void deblockize_hwc(void* out_flat, void* inp, int height, int width, int depth) { + int (*index_func)(int, int, int); + if constexpr (std::is_same_v) + index_func = yxc_to_sm_8b; + else if constexpr (std::is_same_v) + index_func = yxc_to_sm_16b; + else + LOG_ERROR << "deblockize_hwc is only supported for uint8_t and uint16_t types"; + + uintptr_t* inp_data = static_cast(inp); + T* out_data = static_cast(out_flat); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += block_height) { + for (int cx = 0; cx < width; cx += block_width) { + for (int cc = 0; cc < depth; cc += block_depth) { + auto block = reinterpret_cast(*inp_data); + int 
max_y = std::min(block_height, height - cy); + int max_x = std::min(block_width, width - cx); + int max_c = std::min(block_depth, depth - cc); + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] = + block[index_func(y, x, c)]; + } + } + } + + inp_data++; + } + } + } +} /** * @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below: @@ -175,15 +280,42 @@ void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int de */ void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width, int idepth, int odepth); +void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width, + int idepth, int odepth); +template SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat, - bool copy_data); + bool copy_data) { + tvm::runtime::String vtcm_scope = "global.vtcm"; + + // Allocate blocks for activations. We will use the block pointers + // directly from the allocated area. + int n = nhwc_flat->shape[0]; + int h = round_up(nhwc_flat->shape[1], block_height); + int w = round_up(nhwc_flat->shape[2], block_width); + int c = round_up(nhwc_flat->shape[3], block_depth); + int64_t shape_2d[2] = {(n * h * w * c) / (block_height * block_width * block_depth), + block_height * block_width * block_depth}; + void* nhwc_vtcm = + device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope); + if (copy_data) { + blockize_hwc( + nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], nhwc_flat->shape[3]); + } -int calculate_num_weight_chunks(int64_t* shape_hwio); + return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm, + {n, h / block_height, w / block_width, c / block_depth}); +} + +int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width, + int chunk_in_channel, int chunk_out_channel); SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, int num_chunks, void** ptr_table); +SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, + int num_chunks, void** ptr_table, int wgt_zp = 0); + template void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor& tensor) { if (auto* data_space = tensor.GetDataSpace()) { @@ -191,6 +323,7 @@ void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor& tensor) { } } +} // namespace conv_utils } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc index a478fbab352d..53ea0868ad2a 100644 --- a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc +++ b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc @@ -27,7 +27,7 @@ #include #include -#include "tvm/runtime/hexagon/ops/conv2d.h" +#include "conv2d.h" // Current limitations: // - N in NHWC must be 1 @@ -68,7 +68,7 @@ namespace hexagon { */ static inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi, int xio, int ci, int xii, const DLTensor& tensor) { - auto block_ptr = nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c); + auto block_ptr = conv_utils::nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c); auto block_offset = yi * 128 + xio * 64 + ci * 2 + xii; auto first_element_ptr = reinterpret_cast(block_ptr); return first_element_ptr + block_offset; @@ -279,10 
+279,10 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) } int fx = (fw < wgt_chunk_thin_width) ? fw : ((fw - wgt_chunk_thin_width) % 4); int fy = fh % 8; - for (int c = 0; c < round_up(filt_idepth, 2); c += 2) { + for (int c = 0; c < conv_utils::round_up(filt_idepth, 2); c += 2) { int out_act_cc = c / 32; int ci = c % 32; - auto wgt_chunk = hwio_at(cr_filt, fch, fcw, out_act_cc, out_c); + auto wgt_chunk = conv_utils::hwio_at(cr_filt, fch, fcw, out_act_cc, out_c); // Find weight chunk offset ptr int max_x = (fcw == 0) ? wgt_chunk_thin_width : 4; @@ -306,7 +306,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) true_wo, ci, true_wi, cr_act); HVX_Vector act_vec = getInputVector(act_element_ptr); - auto wgt_chunk_offset = hwio_to_sm_16b(max_x, fy, fx, ci, 0); + auto wgt_chunk_offset = conv_utils::hwio_to_sm_16b(max_x, fy, fx, ci, 0); auto base_chunk_ptr = reinterpret_cast(wgt_chunk); auto chunk_ptr = base_chunk_ptr + wgt_chunk_offset; @@ -404,7 +404,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, int out_code, void* res_handle) { - namespace hexagonrt = tvm::runtime::hexagon; + namespace conv_utils = tvm::runtime::hexagon::conv_utils; ICHECK_EQ(num_args, 7) << "Unexpected number of arguments"; ICHECK_EQ(type_codes[0], kTVMDLTensorHandle) << "First argument is expected to be the input tensor"; // Input activations @@ -440,50 +440,55 @@ int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* << wgt_flat->shape[2] << "x" << wgt_flat->shape[3] << ", pad_top=" << pad_top << ", pad_left=" << pad_left; - auto* device_api = tvm::runtime::DeviceAPI::Get(hexagonrt::hexagon_device, false); + auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false); ICHECK(device_api != nullptr); tvm::runtime::String vtcm_scope = "global.vtcm"; - auto act_vtcm = hexagonrt::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); + auto act_vtcm = + conv_utils::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); ICHECK_NE(wgt_flat->shape[0], 0) << "Weights height should not be zero"; ICHECK_NE(wgt_flat->shape[1], 0) << "Weights width should not be zero"; ICHECK_NE(wgt_flat->shape[2], 0) << "Weights input channels should not be zero"; ICHECK_NE(wgt_flat->shape[3], 0) << "Weights output channels should not be zero"; - int num_wgt_chunks = hexagonrt::calculate_num_weight_chunks(wgt_flat->shape); + int num_wgt_chunks = conv_utils::calculate_num_weight_chunks( + wgt_flat->shape, /* chunk_height */ 8, /* chunk_width */ 4, /* chunk_in_channel */ 32, + /* chunk_out_channel */ 32); + LOG_INFO << "num_wgt_chunks: " << num_wgt_chunks; auto wgt_ptr_table = reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); - auto wgt_vtcm = hexagonrt::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table); + auto wgt_vtcm = conv_utils::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table); - auto out_vtcm = hexagonrt::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); + auto out_vtcm = + conv_utils::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); // Prepare zero_block int64_t block_nbytes = 2048; - void* zero_block = device_api->AllocDataSpace(hexagonrt::hexagon_device, 1, &block_nbytes, + void* zero_block = device_api->AllocDataSpace(conv_utils::hexagon_device, 1, &block_nbytes, tvm::runtime::DataType::UInt(8), vtcm_scope); memset(zero_block, 0, 
2048); // FIXME: Setting bias to zero_block: this works for up to 256 output channels. auto bias_flat = - hexagonrt::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]); - auto act_shape = hexagonrt::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); - auto filt_shape = hexagonrt::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); - auto pad_shape = hexagonrt::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left}); - auto out_shape = hexagonrt::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); + conv_utils::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]); + auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); + auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); + auto pad_shape = conv_utils::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left}); + auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); bool relu = false; - hexagonrt::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, bias_flat, - filt_shape, pad_shape, relu, stride_h, stride_w, - hexagonrt::to_uint(zero_block)); + tvm::runtime::hexagon::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, + bias_flat, filt_shape, pad_shape, relu, stride_h, + stride_w, conv_utils::to_uint(zero_block)); - hexagonrt::deblockize_hwc_16b(out_flat->data, out_vtcm.data, out_flat->shape[1], - out_flat->shape[2], out_flat->shape[3]); + conv_utils::deblockize_hwc(out_flat->data, out_vtcm.data, out_flat->shape[1], + out_flat->shape[2], out_flat->shape[3]); - device_api->FreeDataSpace(hexagonrt::hexagon_device, zero_block); - hexagonrt::release(device_api, out_vtcm); - hexagonrt::release(device_api, wgt_vtcm); - hexagonrt::release(device_api, act_vtcm); + device_api->FreeDataSpace(conv_utils::hexagon_device, zero_block); + conv_utils::release(device_api, out_vtcm); + conv_utils::release(device_api, wgt_vtcm); + conv_utils::release(device_api, act_vtcm); return 0; } diff --git a/src/runtime/hexagon/ops/conv2d_quant_hvx.cc b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc new file mode 100644 index 000000000000..682eebb137c0 --- /dev/null +++ b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc @@ -0,0 +1,319 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include "conv2d.h" + +extern "C" int conv2d_packed_quant(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle); + +namespace tvm { +namespace runtime { +namespace hexagon { +inline uint8_t* getElementPtr_int8(int block_out_y, int block_out_x, int block_out_c, int yi, + int xi, int ci, const DLTensor& block) { + auto block_ptr = + tvm::runtime::hexagon::conv_utils::nhwc_at(block, 0, block_out_y, block_out_x, block_out_c); + const int width_stride = 32; + const int height_stride = width_stride * 8; + auto block_offset = yi * height_stride + xi * width_stride + ci; + auto first_element_ptr = reinterpret_cast(block_ptr); + return first_element_ptr + block_offset; +} + +inline int8_t* getWgtPtr_int8(int out_i, int out_o, int h, int w, int i, int o, + const DLTensor& wgt_vtcm, int width) { + auto data = static_cast(wgt_vtcm.data); + auto chunk = data[out_i * wgt_vtcm.shape[3] + out_o]; + auto base_chunk_ptr = reinterpret_cast(chunk); + auto wgt_chunk_offset = tvm::runtime::hexagon::conv_utils::hwio_to_sm_8b(width, h, w, i, o); + return base_chunk_ptr + wgt_chunk_offset; +} + +int32_t saturate_uint8(int32_t val) { return std::max(std::min(val, 255), 0); } + +int32_t saturate_int8(int32_t val) { return std::max(std::min(val, 127), -128); } + +/** + * @brief Compute the quantized convolution along with requantize with output quantization params to + * get uint8 outputs + * + * The quantized convolution is represented by the below equation + * out_scale(out_q - out_zp) = Σr,s,c(act_scale(act_q[n,h+r,w+s,c] - act_zp) * + * wgt_scale(wgt_q[r,s,c,o] - wgt_zp)) + * => out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_q[r,s,c,o] - wgt_zp)) + * * (act_scale*wgt_scale/out_scale) + out_zp + * out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_zp_q[r,s,c,o])) * + * (act_scale*wgt_scale/out_scale) + out_zp, where wgt_zp_q = (wgt_q[r,s,c,o] - wgt_zp) + * + * Assumptions/Limitations: + * - Strided convolution is not yet supported so the stride variables are unused + * + * @param cr_out blockized output tensor with zeros already filled in + * @param cr_act blockized activations + * @param cr_filt Chunkified weights as returned from output of prepare_hwio + * @param out_shape Original output shape of the tensor before blockization + * @param act_shape Original input shape + * @param filt_shape Original filter shape + * @param act_scale Quantization scale for activation + * @param act_zp Activations zero point + * @param wgt_scale Quantization scale for weights + * @param wgt_zp Weights zero point + * @param out_scale Quantization scale for output + * @param out_zp Output zero point + * @param fixed_final_scale Fixed point value of final_scale= (act_scale*wgt_scale/out_scale) + * @param scale_factor Scale factor for the fixed_final_scale + */ +void conv_layer_int8_hvx_whole(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) + const DLTensor& cr_filt, const DLTensor& out_shape, + const DLTensor& act_shape, const DLTensor& filt_shape, + float act_scale, int act_zp, float wgt_scale, int wgt_zp, + float out_scale, int out_zp, int fixed_final_scale, + int scale_factor) { + namespace conv_utils = tvm::runtime::hexagon::conv_utils; + int filt_height = filt_shape.shape[0]; + int filt_width = filt_shape.shape[1]; + int filt_idepth = filt_shape.shape[2]; + + int a_depth = cr_act.shape[3]; + + int o_height = cr_out.shape[1]; + int o_width = cr_out.shape[2]; + int o_depth = cr_out.shape[3]; + + int out_height = 
out_shape.shape[1]; + int out_width = out_shape.shape[2]; + + uint8_t act_zp_u8 = static_cast(act_zp); + int8_t wgt_zp_i8 = static_cast(wgt_zp); + + HVX_Vector act_zp_vec = Q6_Vb_vsplat_R(act_zp_u8); + HVX_Vector wgt_zp_vec = Q6_Vb_vsplat_R(wgt_zp_i8); + HVX_VectorPair wgt_zp_vec_pair = Q6_Wh_vsxt_Vb(wgt_zp_vec); + + ICHECK_EQ(a_depth, cr_filt.shape[2]) << "input depth should match weights input channels"; + ICHECK_EQ(o_depth, cr_filt.shape[3]) << "output depth should match the weights output channel"; + + uint32_t scale_u = static_cast(fixed_final_scale); + HVX_Vector scale_vec = Q6_V_vsplat_R(scale_u); + uint32_t new_scale_factor = static_cast(scale_factor - 16); + HVX_Vector out_zp_vec = Q6_V_vsplat_R(out_zp); + + auto computeOutVec = [&cr_act, &cr_filt, &act_zp_vec, &wgt_zp_vec_pair, &out_zp_vec, &scale_vec, + new_scale_factor, filt_height, filt_width, + filt_idepth](int out_h, int out_w, int out_c, int h, int w) -> HVX_Vector { + HVX_Vector out_vec = Q6_V_vzero(); + for (int fh = 0; fh < filt_height; ++fh) { + for (int fw = 0; fw < filt_width; ++fw) { + for (int c = 0; c < conv_utils::round_up(filt_idepth, 4); c += 4) { + int act_h = out_h * 8 + h + fh; + int act_ho = act_h / 8; + int act_hi = act_h % 8; + + int act_w = out_w * 8 + w + fw; + int act_wo = act_w / 8; + int act_wi = act_w % 8; + + int act_co = c / 32; + int act_ci = c % 32; + + uint8_t* act_ptr = + getElementPtr_int8(act_ho, act_wo, act_co, act_hi, act_wi, act_ci, cr_act); + + uint32_t four_act_elems = *reinterpret_cast(act_ptr); + HVX_Vector act_vec = Q6_V_vsplat_R(four_act_elems); + int8_t* wgt_ptr = getWgtPtr_int8(act_co, out_c, fh, fw, act_ci, 0, cr_filt, filt_width); + + HVX_Vector* wgt_vec_ptr = reinterpret_cast(wgt_ptr); + HVX_Vector wgt_vec = *wgt_vec_ptr; + + HVX_VectorPair act_vec_zp_diff = Q6_Wh_vsub_VubVub(act_vec, act_zp_vec); + HVX_VectorPair wgt_i16_vec_nodiff = Q6_Wh_vsxt_Vb(wgt_vec); + HVX_VectorPair wgt_i16_vec = Q6_Wh_vsub_WhWh_sat(wgt_i16_vec_nodiff, wgt_zp_vec_pair); + + out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_lo_W(act_vec_zp_diff), + Q6_V_lo_W(wgt_i16_vec)); + out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_hi_W(act_vec_zp_diff), + Q6_V_hi_W(wgt_i16_vec)); + } + } + } + HVX_Vector mul_vec = Q6_Vw_vmpye_VwVuh(out_vec, scale_vec); + HVX_Vector scaled_vec = Q6_Vw_vasr_VwR(mul_vec, new_scale_factor); + HVX_Vector sum_vec = Q6_Vw_vadd_VwVw(scaled_vec, out_zp_vec); + return sum_vec; + }; + + auto saturateAndStore = [&cr_out, &computeOutVec](int out_h, int out_w, int out_c, int h, int w) { + uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out); + HVX_Vector* out_vec_ptr = reinterpret_cast(out_ptr); + HVX_Vector out_vec1, out_vec2, out_vec3, out_vec4, out_vec; + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); + out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2); + out_vec4 = computeOutVec(out_h, out_w, out_c, h, w + 3); + + HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); + HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(out_vec4, out_vec3); + out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1); + *out_vec_ptr = out_vec; + }; + + for (int out_c = 0; out_c < o_depth; ++out_c) { + for (int out_h = 0; out_h < o_height; ++out_h) { + int max_y = std::min(8, out_height - out_h * 8); + for (int out_w = 0; out_w < o_width; ++out_w) { + int max_x = std::min(8, out_width - out_w * 8); + for (int h = 0; h < max_y; ++h) { + if (max_x == 8) { + for (int w = 0; w < max_x; w += 4) { + 
saturateAndStore(out_h, out_w, out_c, h, w); + } + } else { + int w = 0; + if (max_x >= 4) { + saturateAndStore(out_h, out_w, out_c, h, w); + w = 4; + } + uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out); + HVX_Vector* out_vec_ptr = reinterpret_cast(out_ptr); + HVX_Vector out_vec1, out_vec2, out_vec3, out_vec; + if (max_x % 4 == 1) { + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec1); + out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec); + *out_vec_ptr = out_vec; + } else if (max_x % 4 == 2) { + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); + HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); + out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec); + *out_vec_ptr = out_vec; + } else if (max_x % 4 == 3) { + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); + out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2); + HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); + HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec3); + out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1); + *out_vec_ptr = out_vec; + } + } + } + } + } + } +} + +} // namespace hexagon +} // namespace runtime +} // namespace tvm + +int conv2d_packed_quant(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle) { + namespace conv_utils = tvm::runtime::hexagon::conv_utils; + ICHECK_EQ(num_args, 13) << "Unexpected number of arguments"; + ICHECK_EQ(type_codes[0], kTVMDLTensorHandle) + << "First argument is expected to be the input tensor"; // Input activations + ICHECK_EQ(type_codes[1], kTVMDLTensorHandle) + << "Second argument is expected to be the weights tensor"; // Weights + ICHECK_EQ(type_codes[2], kDLFloat) << "Third argument is expected to be the activation scale"; + ICHECK_EQ(type_codes[3], kDLInt) << "Fourth argument is expected to be the activation zero point"; + ICHECK_EQ(type_codes[4], kDLFloat) << "Fifth argument is expected to be the weight scale"; + ICHECK_EQ(type_codes[5], kDLInt) << "Sixth argument is expected to be the weight zero point"; + ICHECK_EQ(type_codes[6], kDLFloat) << "Seventh argument is expected to be the output scale"; + ICHECK_EQ(type_codes[7], kDLInt) << "Eigth argument is expected to be the output zero point"; + ICHECK_EQ(type_codes[8], kDLInt) << "Nineth argument is expected to be the stride_h"; // stride_h + ICHECK_EQ(type_codes[9], kDLInt) << "Tenth argument is expected to be the stride_w"; // stride_w + ICHECK_EQ(type_codes[10], kDLInt) << "Eleventh argument is expected to be fixed final scale"; + ICHECK_EQ(type_codes[11], kDLInt) << "Twelfth argument is expected to be scale factor"; + ICHECK_EQ(type_codes[12], kTVMDLTensorHandle) + << "Thirteenth argument is expected to be the output tensor"; // output + + auto* act_flat = static_cast(args[0].v_handle); + auto* wgt_flat = static_cast(args[1].v_handle); + auto* out_flat = static_cast(args[12].v_handle); + + // Temporary assertion until multiple batches are supported + ICHECK_EQ(act_flat->shape[0], 1) << "Input batch size more than 1 is not supported yet"; + + // Temporary assertion until multiple batches are supported + ICHECK_EQ(out_flat->shape[0], 1) << "Output batch size more than 1 is not supported yet"; + + float act_scale = args[2].v_float64; + int act_zp = args[3].v_int64; + LOG_INFO << "act_scale: " << act_scale << 
", act_zp: " << act_zp; + + float wgt_scale = args[4].v_float64; + int wgt_zp = args[5].v_int64; + LOG_INFO << "wgt_scale: " << wgt_scale << ", wgt_zp: " << wgt_zp; + + float out_scale = args[6].v_float64; + int out_zp = args[7].v_int64; + LOG_INFO << "out_scale: " << out_scale << ", out_zp: " << out_zp; + + int stride_h = args[8].v_int64; + int stride_w = args[9].v_int64; + LOG_INFO << "stride_h: " << stride_h << ", stride_w: " << stride_w; + + int fixed_final_scale = args[10].v_int64; + int scale_factor = args[11].v_int64; + LOG_INFO << "fixed_final_scale: " << fixed_final_scale << ", scale_factor: " << scale_factor; + + auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false); + ICHECK(device_api != nullptr); + tvm::runtime::String vtcm_scope = "global.vtcm"; + + auto act_vtcm = + conv_utils::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); + + int num_wgt_chunks = conv_utils::calculate_num_weight_chunks( + wgt_flat->shape, /* chunk_height */ wgt_flat->shape[0], + /* chunk_width */ wgt_flat->shape[1], /* chunk_in_channel */ 32, /* chunk_out_channel */ 32); + auto wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + + auto wgt_vtcm = + conv_utils::prepare_hwio_8b(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table, wgt_zp); + + auto out_vtcm = + conv_utils::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); + + auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); + auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); + auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); + + tvm::runtime::hexagon::conv_layer_int8_hvx_whole( + out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, filt_shape, act_scale, act_zp, wgt_scale, + wgt_zp, out_scale, out_zp, fixed_final_scale, scale_factor); + + conv_utils::deblockize_hwc(out_flat->data, out_vtcm.data, out_flat->shape[1], + out_flat->shape[2], out_flat->shape[3]); + + conv_utils::release(device_api, out_vtcm); + conv_utils::release(device_api, wgt_vtcm); + conv_utils::release(device_api, act_vtcm); + + return 0; +} diff --git a/src/runtime/hexagon/ops/conv_utils.cc b/src/runtime/hexagon/ops/conv_utils.cc index b10f7cc315b2..a40e23e463d4 100644 --- a/src/runtime/hexagon/ops/conv_utils.cc +++ b/src/runtime/hexagon/ops/conv_utils.cc @@ -17,96 +17,69 @@ * under the License. */ -#include "tvm/runtime/hexagon/ops/conv2d.h" +#include + +#include "conv2d.h" namespace tvm { namespace runtime { namespace hexagon { +namespace conv_utils { /** - * @brief Function to "blockize" the flat input data - * The term "blockize" is used to mention that the data is stored in non-contiguous blocks + * @brief Convert the layout of weights from flat to "chunked". 
The term chunked is explained below: * - * The input is mapped into the below mentioned layout (notation similar to index map used for - * transform layout): + * Weights are packed into the below mentioned layout (notation similar to index map): + * Since weights cannot be exactly represented into a index map notation, the + * base split up is mentioned below with a few deviations * - * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2 + * lambda h, w, i, o: o//32, i//32, h, w, (i%32)//4, o%32, i%4 * - * where AXIS_SEPARATOR represents split up in the physical layout + * The deviations are: + * - w is actually stored in the right to left order, as in 3,2,1,0 instead of 0,1,2,3 * - * @param out Pre-allocated output memory pointer - * @param inp_flat Flat input data pointer - * @param height - * @param width - * @param depth - */ -void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth) { - auto inp_data = static_cast(inp_flat); - auto out_data = static_cast(out); - const int stride_x = depth; - const int stride_y = stride_x * width; - - for (int cy = 0; cy < height; cy += 8) { - for (int cx = 0; cx < width; cx += 4) { - for (int cc = 0; cc < depth; cc += 32) { - auto block = reinterpret_cast(*out_data++); - int max_y = std::min(8, height - cy); - int max_x = std::min(4, width - cx); - int max_c = std::min(32, depth - cc); - for (int y = 0; y < max_y; ++y) { - for (int x = 0; x < max_x; ++x) { - for (int c = 0; c < max_c; ++c) { - block[xyc_to_sm_16b(y, x, c)] = - inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)]; - } - for (int c = max_c; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; - } - for (int x = max_x; x < 4; ++x) { - for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; - } - } - - for (int y = max_y; y < 8; ++y) - for (int x = 0; x < 4; ++x) - for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; - } // cc - } // cx - } // cy -} - -/** - * @brief Convert back from non-contguous layout to a flat layout - * - * @param out_flat Pre-allocated output memory pointer - * @param inp Blockized input data pointer + * @param out_ptr Base pointer table to be filled with the list of pointers to the first addresses + * of the "chunked" weights + * @param out_ptr_size The number of chunks + * @param out Pointer to pre-allocated output memory + * @param inp Pointer to flat input data * @param height * @param width - * @param depth + * @param idepth + * @param odepth */ -void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth) { - uintptr_t* inp_data = static_cast(inp); - uint16_t* out_data = static_cast(out_flat); - const int stride_x = depth; +void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width, + int idepth, int odepth, int wgt_zp) { + auto inp_data = static_cast(inp); + auto out_data = static_cast(out); + const int stride_i = odepth; + const int stride_x = stride_i * idepth; const int stride_y = stride_x * width; - for (int cy = 0; cy < height; cy += 8) { - for (int cx = 0; cx < width; cx += 4) { - for (int cc = 0; cc < depth; cc += 32) { - auto block = reinterpret_cast(*inp_data); - int max_y = std::min(8, height - cy); - int max_x = std::min(4, width - cx); - int max_c = std::min(32, depth - cc); - for (int y = 0; y < max_y; ++y) { - for (int x = 0; x < max_x; ++x) { - for (int c = 0; c < max_c; ++c) { - out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] = - block[xyc_to_sm_16b(y, x, c)]; + for (int ci = 0; ci < 
idepth; ci += 32) { + for (int co = 0; co < odepth; co += 32) { + int max_i = std::min(32, idepth - ci); + int max_o = std::min(32, odepth - co); + + auto chunk = out_data; + for (int y = 0; y < height; ++y) { + for (int x = width - 1; x >= 0; --x) { + for (int i = 0; i < max_i; ++i) { + for (int o = 0; o < max_o; ++o) { + chunk[hwio_to_sm_8b(width, y, x, i, o)] = + inp_data[y * stride_y + x * stride_x + (ci + i) * stride_i + (co + o)]; } + for (int o = max_o; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp; } + for (int i = max_i; i < 32; ++i) + for (int o = 0; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp; } - - inp_data++; } + + *out_ptr++ = chunk; + out_data += height * width * 32 * 32; + out_ptr_size--; + assert(out_ptr_size >= 0); } } } @@ -184,25 +157,27 @@ void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, i } } -SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat, - bool copy_data) { +std::tuple getHWIO(const DLTensor* hwio_flat) { + int h = hwio_flat->shape[0]; + int w = hwio_flat->shape[1]; + int i = round_up(hwio_flat->shape[2], 32); + int o = round_up(hwio_flat->shape[3], 32); + return std::make_tuple(h, w, i, o); +} + +SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, + int num_chunks, void** ptr_table, int wgt_zp) { tvm::runtime::String vtcm_scope = "global.vtcm"; - // Allocate blocks for activations. We will use the block pointers - // directly from the allocated area. - int n = nhwc_flat->shape[0]; - int h = round_up(nhwc_flat->shape[1], 8); - int w = round_up(nhwc_flat->shape[2], 4); - int c = round_up(nhwc_flat->shape[3], 32); - int64_t shape_2d[2] = {(n * h * w * c) / (8 * 4 * 32), 8 * 4 * 32}; - void* nhwc_vtcm = - device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope); - if (copy_data) { - blockize_hwc_16b(nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], - nhwc_flat->shape[3]); - } + auto [h, w, i, o] = getHWIO(hwio_flat); + int64_t shape_1d[] = {h * w * i * o}; + void* hwio_vtcm = + device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope); - return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm, {n, h / 8, w / 4, c / 32}); + chunkify_hwio_8b(ptr_table, num_chunks, hwio_vtcm, hwio_flat->data, hwio_flat->shape[0], + hwio_flat->shape[1], hwio_flat->shape[2], hwio_flat->shape[3], wgt_zp); + + return SDLTensor<4>(ptr_table, hwio_flat->dtype, hwio_vtcm, {1, 1, i / 32, o / 32}); } SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, @@ -214,10 +189,7 @@ SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* h // height- or width-wise, so filter chunks may have different sizes. // A filter chunk is a block of size HxWx32x32, where H, W are at most // height and width of a block respectively. 
- int h = hwio_flat->shape[0]; - int w = hwio_flat->shape[1]; - int i = round_up(hwio_flat->shape[2], 32); - int o = round_up(hwio_flat->shape[3], 32); + auto [h, w, i, o] = getHWIO(hwio_flat); int64_t shape_1d[] = {h * w * i * o}; void* hwio_vtcm = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope); @@ -229,15 +201,19 @@ SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* h {round_up(h, 8) / 8, round_up(w, 4) / 4, i / 32, o / 32}); } -int calculate_num_weight_chunks(int64_t* shape_hwio) { - int h = round_up(shape_hwio[0], 8); - int w = round_up(shape_hwio[1], 4); - int i = round_up(shape_hwio[2], 32); - int o = round_up(shape_hwio[3], 32); +int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width, + int chunk_in_channel, int chunk_out_channel) { + // Define slower roundup that doesn't assume multiplier 'p' to be power of 2 + auto roundup = [](int v, int p) { return (v + p - 1) - ((v + p - 1) % p); }; + int h = roundup(shape_hwio[0], chunk_height); + int w = roundup(shape_hwio[1], chunk_width); + int i = roundup(shape_hwio[2], chunk_in_channel); + int o = roundup(shape_hwio[3], chunk_out_channel); - return (h * w * i * o) / (8 * 4 * 32 * 32); + return (h * w * i * o) / (chunk_height * chunk_width * chunk_in_channel * chunk_out_channel); } +} // namespace conv_utils } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h new file mode 100644 index 000000000000..07e15966863e --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef TVM_TESTS_CPPRUNTIME_HEXAGON_HEXAGON_CONV_UTILS_H +#define TVM_TESTS_CPPRUNTIME_HEXAGON_HEXAGON_CONV_UTILS_H + +#include +#include + +#include + +#include "conv2d.h" + +using namespace tvm::runtime::hexagon::conv_utils; + +template +class HexagonUtilsTest : public ::testing::Test { + public: + void SetUp() override { + vtcm_scope = "global.vtcm"; + device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false); + float16.code = kDLFloat; + float16.bits = 16; + float16.lanes = 1; + + uint8.code = kDLUInt; + uint8.bits = 8; + uint8.lanes = 1; + + int8.code = kDLInt; + int8.bits = 8; + int8.lanes = 1; + } + + void setupTensor(std::tuple shape, DLDataType dtype) { + auto [s1, s2, s3, s4] = shape; + tensor_shape[0] = s1; + tensor_shape[1] = s2; + tensor_shape[2] = s3; + tensor_shape[3] = s4; + int64_t shape_1d[1] = {s1 * s2 * s3 * s4}; + + flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, dtype, vtcm_scope); + flat_mem_data = static_cast(flat_mem); + fill_vals(flat_mem_data, shape_1d[0]); + + flat_tensor.data = flat_mem; + flat_tensor.device = hexagon_device; + flat_tensor.ndim = 4; + flat_tensor.dtype = dtype; + flat_tensor.shape = tensor_shape; + flat_tensor.strides = nullptr; + flat_tensor.byte_offset = 0; + } + + void TearDownTensor() { + if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem); + } + + static void fill_vals(T* arr, int size) { + // Testing with uint16 instead of float16 as generating random float16 is not easy within c++ + auto max = std::numeric_limits::max(); + srand(std::time(0)); + for (int i = 0; i < size; ++i) { + arr[i] = static_cast(std::rand() % max); + } + } + + static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) { + int h = shape[1]; + int w = shape[2]; + int c = shape[3]; + return cc + c * (ww + w * (hh + h * (nn))); + } + + DLTensor flat_tensor; + void* flat_mem; + T* flat_mem_data; + tvm::runtime::DeviceAPI* device_api; + tvm::runtime::String vtcm_scope; + DLDataType float16; + DLDataType int8, uint8; + int64_t tensor_shape[4]; +}; + +#endif diff --git a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc index 3b922fa6c2a8..5f2ef490d020 100644 --- a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc @@ -26,73 +26,14 @@ #include #include -#include "tvm/runtime/hexagon/ops/conv2d.h" +#include "conv2d.h" +#include "hexagon_conv_utils_test.h" using namespace tvm::runtime::hexagon; -class HexagonUtilsTest : public ::testing::Test { - public: - void SetUp() override { - vtcm_scope = "global.vtcm"; - device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false); - float16.code = kDLFloat; - float16.bits = 16; - float16.lanes = 1; - } - - void setupTensor(std::tuple shape) { - auto [s1, s2, s3, s4] = shape; - tensor_shape[0] = s1; - tensor_shape[1] = s2; - tensor_shape[2] = s3; - tensor_shape[3] = s4; - int64_t shape_1d[1] = {s1 * s2 * s3 * s4}; - - flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope); - flat_mem_data = static_cast(flat_mem); - fill_vals(flat_mem_data, shape_1d[0]); - - flat_tensor.data = flat_mem; - flat_tensor.device = hexagon_device; - flat_tensor.ndim = 4; - flat_tensor.dtype = float16; - flat_tensor.shape = tensor_shape; - flat_tensor.strides = nullptr; - flat_tensor.byte_offset = 0; - } - - void TearDownTensor() { - if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem); - } - - static void 
fill_vals(uint16_t* arr, int size) { - // Testing with uint16 instead of float16 as generating random float16 is not easy within c++ - uint16_t max = UINT16_MAX; - srand(std::time(0)); - for (int i = 0; i < size; ++i) { - arr[i] = static_cast(std::rand() % max); - } - } - - static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) { - int h = shape[1]; - int w = shape[2]; - int c = shape[3]; - return cc + c * (ww + w * (hh + h * (nn))); - } - - DLTensor flat_tensor; - void* flat_mem; - uint16_t* flat_mem_data; - tvm::runtime::DeviceAPI* device_api; - tvm::runtime::String vtcm_scope; - DLDataType float16; - int64_t tensor_shape[4]; -}; - // Parameterized test fixture with 4 params representing n, h, w, c class HexagonUtilsActivationsBlockizeTest - : public HexagonUtilsTest, + : public HexagonUtilsTest, public ::testing::WithParamInterface, std::tuple>> {}; @@ -122,11 +63,12 @@ INSTANTIATE_TEST_SUITE_P( TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) { auto shape = std::make_tuple(1, 14, 7, 60); auto [n, h, w, c] = shape; - setupTensor(shape); + setupTensor(shape, float16); // // copy_data is set to false here as there's a separate test for blockize when copy_data // becomes true - auto blocked_tensor = prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); + auto blocked_tensor = + prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); EXPECT_EQ(blocked_tensor.shape[0], n); EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8); @@ -139,7 +81,7 @@ TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) { TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { auto shape_tuple = std::get<0>(GetParam()); - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); auto [n, h, w, c] = shape_tuple; int64_t shape[] = {n, h, w, c}; @@ -150,7 +92,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope); int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 4, c_rounded / 32}; - blockize_hwc_16b(blocked_mem, flat_mem, h, w, c); + blockize_hwc(blocked_mem, flat_mem, h, w, c); std::function flatten = HexagonUtilsActivationsBlockizeTest::flattened_idx; @@ -159,7 +101,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { auto* blocks = static_cast(blocked_mem); int blockIdx = flatten(nn, hh / 8, ww / 4, cc / 32, blocked_shape); uint16_t* block = reinterpret_cast(blocks[blockIdx]); - return block[xyc_to_sm_16b(hh % 8, ww % 4, cc % 32)]; + return block[yxc_to_sm_16b(hh % 8, ww % 4, cc % 32)]; }; auto [nn, hh, ww, cc] = std::get<1>(GetParam()); @@ -172,7 +114,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { auto shape_tuple = std::get<0>(GetParam()); - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); auto [n, h, w, c] = shape_tuple; int64_t shape[] = {n, h, w, c}; int64_t shape_1d[1] = {n * h * w * c}; @@ -183,11 +125,11 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 4 * 32), 8 * 4 * 32}; void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope); - blockize_hwc_16b(blocked_mem, flat_mem, h, w, c); + blockize_hwc(blocked_mem, flat_mem, h, w, c); void* deblocked_flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope); - deblockize_hwc_16b(deblocked_flat_mem, blocked_mem, h, 
w, c); + deblockize_hwc(deblocked_flat_mem, blocked_mem, h, w, c); auto* deblocked_flat_mem_data = static_cast(deblocked_flat_mem); auto [nn, hh, ww, cc] = std::get<1>(GetParam()); @@ -201,7 +143,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { } class HexagonUtilsWeightsChunkifyTest - : public HexagonUtilsTest, + : public HexagonUtilsTest, public ::testing::WithParamInterface, std::tuple>> {}; @@ -231,7 +173,9 @@ INSTANTIATE_TEST_SUITE_P( TEST_F(HexagonUtilsWeightsChunkifyTest, calculate_num_weight_chunks) { int64_t shape[] = {3, 3, 40, 40}; - int num_wgt_chunks = calculate_num_weight_chunks(shape); + int num_wgt_chunks = + calculate_num_weight_chunks(shape, /* chunk_height */ 8, /* chunk_width */ 4, + /* chunk_in_channel */ 32, /* chunk_out_channel */ 32); EXPECT_EQ(num_wgt_chunks, 4); } @@ -239,11 +183,11 @@ TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) { int64_t shape[] = {3, 3, 40, 40}; auto [h, w, i, o] = shape; auto shape_tuple = std::make_tuple(h, w, i, o); - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); // copy_data is set to false here as there's a separate test for blockize when copy_data becomes // true - auto num_wgt_chunks = calculate_num_weight_chunks(shape); + auto num_wgt_chunks = calculate_num_weight_chunks(shape, 8, 4, 32, 32); auto wgt_ptr_table = reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); @@ -260,10 +204,10 @@ TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) { TEST_P(HexagonUtilsWeightsChunkifyTest, chunkify_hwio_16b) { auto [shape_tuple, indices] = GetParam(); auto [h, w, i, o] = shape_tuple; - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); int64_t shape[] = {h, w, i, o}; - auto num_wgt_chunks = calculate_num_weight_chunks(shape); + auto num_wgt_chunks = calculate_num_weight_chunks(shape, 8, 4, 32, 32); auto wgt_ptr_table = reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); diff --git a/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc new file mode 100644 index 000000000000..449c69736050 --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "conv2d.h" +#include "hexagon_conv_utils_test.h" + +using namespace tvm::runtime::hexagon::conv_utils; + +// Parameterized test fixture with 4 params representing n, h, w, c +class HexagonUtilsQuantActivationsBlockizeTest + : public HexagonUtilsTest, + public ::testing::WithParamInterface, std::tuple>> {}; + +// TODO (quic-sanirudh): See if we can test with random generated indices +INSTANTIATE_TEST_SUITE_P( + BlockizeDeblockizeTestFixtures, HexagonUtilsQuantActivationsBlockizeTest, + ::testing::Combine(::testing::Values(std::make_tuple(1, 14, 7, 60)), + ::testing::Values(std::make_tuple(0, 0, 0, 0), // first element + std::make_tuple(0, 7, 3, 31), // last element + // Remaining are random element tests + std::make_tuple(0, 13, 6, 59), + std::make_tuple(0, 0, 0, 32), std::make_tuple(0, 0, 4, 32), + std::make_tuple(0, 2, 3, 4), std::make_tuple(0, 5, 6, 7), + std::make_tuple(0, 10, 4, 12))), + [](const ::testing::TestParamInfo& info) { + // Can use info.param here to generate the test suffix + auto indices = std::get<1>(info.param); + int h = std::get<1>(indices); + int w = std::get<2>(indices); + int c = std::get<3>(indices); + // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0 + std::string name = + "hwc" + std::to_string(h) + "x" + std::to_string(w) + "x" + std::to_string(c); + return name; + }); + +TEST_F(HexagonUtilsQuantActivationsBlockizeTest, prepare_nhwc) { + auto shape = std::make_tuple(1, 14, 7, 60); + auto [n, h, w, c] = shape; + setupTensor(shape, uint8); + + // // copy_data is set to false here as there's a separate test for blockize when copy_data + // becomes true + auto blocked_tensor = + prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); + + EXPECT_EQ(blocked_tensor.shape[0], n); + EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8); + EXPECT_EQ(blocked_tensor.shape[2], round_up(w, 8) / 8); + EXPECT_EQ(blocked_tensor.shape[3], round_up(c, 32) / 32); + + TearDownTensor(); + release(device_api, blocked_tensor); +} + +TEST_P(HexagonUtilsQuantActivationsBlockizeTest, blockize_hwc_8b) { + auto shape_tuple = std::get<0>(GetParam()); + setupTensor(shape_tuple, uint8); + auto [n, h, w, c] = shape_tuple; + int64_t shape[] = {n, h, w, c}; + + int h_rounded = round_up(h, 8); + int w_rounded = round_up(w, 8); + int c_rounded = round_up(c, 32); + int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 8 * 32), 8 * 8 * 32}; + + void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, uint8, vtcm_scope); + int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 8, c_rounded / 32}; + blockize_hwc(blocked_mem, flat_mem, h, w, c); + + std::function flatten = + HexagonUtilsQuantActivationsBlockizeTest::flattened_idx; + + auto getBlockedElem = [&blocked_shape, blocked_mem, flatten](int nn, int hh, int ww, int cc) { + auto* blocks = static_cast(blocked_mem); + int blockIdx = flatten(nn, hh / 8, ww / 8, cc / 32, blocked_shape); + uint8_t* block = reinterpret_cast(blocks[blockIdx]); + return block[yxc_to_sm_8b(hh % 8, ww % 8, cc % 32)]; + }; + + auto [nn, hh, ww, cc] = std::get<1>(GetParam()); + + EXPECT_EQ(flat_mem_data[flattened_idx(nn, hh, ww, cc, shape)], getBlockedElem(nn, hh, ww, cc)); + + TearDownTensor(); + device_api->FreeDataSpace(hexagon_device, blocked_mem); +} + +TEST_P(HexagonUtilsQuantActivationsBlockizeTest, deblockize_hwc_8b) { + auto shape_tuple = std::get<0>(GetParam()); + setupTensor(shape_tuple, uint8); + auto [n, h, w, c] 
= shape_tuple; + int64_t shape[] = {n, h, w, c}; + int64_t shape_1d[1] = {n * h * w * c}; + + int h_rounded = round_up(h, 8); + int w_rounded = round_up(w, 8); + int c_rounded = round_up(c, 32); + int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 8 * 32), 8 * 8 * 32}; + + void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, uint8, vtcm_scope); + blockize_hwc(blocked_mem, flat_mem, h, w, c); + + void* deblocked_flat_mem = + device_api->AllocDataSpace(hexagon_device, 1, shape_1d, uint8, vtcm_scope); + deblockize_hwc(deblocked_flat_mem, blocked_mem, h, w, c); + auto* deblocked_flat_mem_data = static_cast(deblocked_flat_mem); + + auto [nn, hh, ww, cc] = std::get<1>(GetParam()); + + auto idx = flattened_idx(nn, hh, ww, cc, shape); + EXPECT_EQ(flat_mem_data[idx], deblocked_flat_mem_data[idx]); + + TearDownTensor(); + device_api->FreeDataSpace(hexagon_device, blocked_mem); + device_api->FreeDataSpace(hexagon_device, deblocked_flat_mem); +} + +class HexagonUtilsQuantWeightsChunkifyTest + : public HexagonUtilsTest, + public ::testing::WithParamInterface, std::tuple>> {}; + +INSTANTIATE_TEST_SUITE_P( + ChunkifyDechunkifyTests, HexagonUtilsQuantWeightsChunkifyTest, + ::testing::Combine(::testing::Values(std::make_tuple(3, 3, 40, 40)), + ::testing::Values(std::make_tuple(0, 0, 0, 0), // first element + std::make_tuple(2, 2, 39, 39), // Last element + // Remaining are random element tests + std::make_tuple(1, 1, 28, 33), + std::make_tuple(1, 2, 8, 38), + std::make_tuple(1, 0, 12, 15), + std::make_tuple(2, 1, 9, 22), std::make_tuple(0, 2, 6, 7), + std::make_tuple(1, 2, 3, 4))), + [](const ::testing::TestParamInfo& info) { + // Can use info.param here to generate the test suffix + auto indices = std::get<1>(info.param); + int h = std::get<0>(indices); + int w = std::get<1>(indices); + int i = std::get<2>(indices); + int o = std::get<3>(indices); + // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0 + std::string name = "hwio" + std::to_string(h) + std::to_string(w) + "x" + std::to_string(i) + + "x" + std::to_string(o); + return name; + }); + +TEST_F(HexagonUtilsQuantWeightsChunkifyTest, calculate_num_weight_chunks) { + int64_t shape[] = {3, 3, 40, 40}; + int num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32); + EXPECT_EQ(num_wgt_chunks, 4); +} + +TEST_F(HexagonUtilsQuantWeightsChunkifyTest, prepare_hwio) { + int64_t shape[] = {3, 3, 40, 40}; + auto [h, w, i, o] = shape; + auto shape_tuple = std::make_tuple(h, w, i, o); + setupTensor(shape_tuple, int8); + + // copy_data is set to false here as there's a separate test for blockize when copy_data becomes + // true + auto num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32); + auto wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + auto chunked_tensor = prepare_hwio_8b(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); + + EXPECT_EQ(chunked_tensor.shape[0], 1); + EXPECT_EQ(chunked_tensor.shape[1], 1); + EXPECT_EQ(chunked_tensor.shape[2], round_up(i, 32) / 32); + EXPECT_EQ(chunked_tensor.shape[3], round_up(o, 32) / 32); + + release(device_api, chunked_tensor); + TearDownTensor(); +} + +TEST_P(HexagonUtilsQuantWeightsChunkifyTest, chunkify_hwio_8b) { + auto [shape_tuple, indices] = GetParam(); + auto [h, w, i, o] = shape_tuple; + setupTensor(shape_tuple, int8); + int64_t shape[] = {h, w, i, o}; + + auto num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32); + auto 
wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + auto chunked_tensor = prepare_hwio_8b(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); + + auto getChunkedElem = [width = w, chunked_tensor](int hh, int ww, int ii, int oo) { + auto data = static_cast(chunked_tensor.data); + auto chunk = data[ii / 32 * chunked_tensor.shape[3] + oo / 32]; + auto chunk_int8 = reinterpret_cast(chunk); + return chunk_int8[hwio_to_sm_8b(width, hh, ww, ii % 32, oo % 32)]; + }; + + auto [hh, ww, ii, oo] = indices; + + EXPECT_EQ(flat_mem_data[flattened_idx(hh, ww, ii, oo, shape)], getChunkedElem(hh, ww, ii, oo)); + release(device_api, chunked_tensor); +} diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index c03701f83ccc..5b13513c0fb3 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -320,7 +320,7 @@ def quantize_np(arr_np: numpy.ndarray, dtype: str): scale = (fmax - fmin) / (qmax - qmin) zero_point = numpy.rint((fmax * qmin - fmin * qmax) / (fmax - fmin)).astype("int32") - quant_np = (arr_np / scale + zero_point).astype(dtype) + quant_np = numpy.clip(((arr_np / scale).round() + zero_point), qmin, qmax).astype(dtype) return quant_np, scale, zero_point diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py new file mode 100644 index 000000000000..c26e6142ba5c --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py @@ -0,0 +1,261 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" Test quantized conv2d HVX intrinsic implementation""" + +import numpy as np + +import tvm +import tvm.contrib.hexagon +from tvm.topi.hexagon.utils import get_fixed_point_value +from tvm.topi.testing import conv2d_nhwc_python + +from ..infrastructure import get_hexagon_target, quantize_np + + +def build_conv2d(target): + """Build and return the conv2d IRModule that calls the intrinsic implementation""" + act_n, act_h, act_w, act_c = ( + tvm.te.var("an"), + tvm.te.var("ah"), + tvm.te.var("aw"), + tvm.te.var("ac"), + ) + filt_h, filt_w, filt_o = tvm.te.var("filt_h"), tvm.te.var("filt_w"), tvm.te.var("filt_o") + act_scale, act_zp = tvm.te.var("act_scale", dtype="float32"), tvm.te.var("act_zp") + wgt_scale, wgt_zp = tvm.te.var("wgt_scale", dtype="float32"), tvm.te.var("wgt_zp") + out_scale, out_zp = tvm.te.var("out_scale", dtype="float32"), tvm.te.var("out_zp") + fixed_final_scale, scale_factor = tvm.te.var("fixed_final_scale", dtype="int32"), tvm.te.var( + "scale_factor" + ) + stride_h, stride_w = tvm.te.var("stride_h"), tvm.te.var("stride_w") + + act_flat = tvm.te.placeholder( + shape=(act_n, act_h, act_w, act_c), dtype="uint8", name="act_flat" + ) + wgt_flat = tvm.te.placeholder( + shape=(filt_h, filt_w, act_c, filt_o), dtype="int8", name="wgt_flat" + ) + + out_flat = tvm.te.extern( + shape=(act_n, (act_h - filt_h) // stride_h + 1, (act_w - filt_w) // stride_w + 1, filt_o), + inputs=[act_flat, wgt_flat], + fcompute=lambda ins, outs: tvm.tir.call_cpacked( + "conv2d_packed_quant", # Function from TVM runtime + ins[0], + ins[1], + act_scale, + act_zp, + wgt_scale, + wgt_zp, + out_scale, + out_zp, + stride_h, + stride_w, + fixed_final_scale, + scale_factor, + outs[0], + tvm.runtime.const(0), # resource_handle (unused) + ), + dtype="uint8", + ) + + s = tvm.te.create_schedule(out_flat.op) + + func_name = "conv2d_quant_hvx" + module = tvm.build( + s, + [ + act_flat, + wgt_flat, + act_scale, + act_zp, + wgt_scale, + wgt_zp, + out_scale, + out_zp, + stride_h, + stride_w, + fixed_final_scale, + scale_factor, + out_flat, + ], + target=target, + name=func_name, + ) + + return module + + +def gen_config(params): + """Utility function to generate useful ids for shape_parameters""" + + dims = lambda vals: "x".join(map(str, vals)) + + config = {} + for param in params: + act_shape, wgt_shape, inp_stride = param + name = f"nhwc{dims(act_shape)}-hwio{dims(wgt_shape)}-stride{dims(inp_stride)}" + config[name] = param + + return config + + +class TestQuantConv2dIntrin: + """Test Quantized Conv2d Intrin class""" + + shape_parameters = [ + [ + (1, 5, 5, 33), + (3, 3, 33, 33), + (1, 1), + ], + [ + (1, 9, 8, 64), + (3, 3, 64, 64), + (1, 1), + ], + [ + (1, 11, 16, 64), + (3, 3, 64, 32), + (1, 1), + ], + [ + (1, 24, 8, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 4, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 5, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 6, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 7, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 8, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 9, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 10, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 11, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 4, 5), + (3, 3, 5, 3), + (1, 1), + ], + ] + + config = gen_config(shape_parameters) + act_shape, wgt_shape, inp_stride = tvm.testing.parameters(*config.values(), ids=config.keys()) + inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"]) + + @tvm.testing.requires_hexagon + def test_conv2d_quant(self, act_shape, wgt_shape, inp_stride, 
+    @tvm.testing.requires_hexagon
+    def test_conv2d_quant(self, act_shape, wgt_shape, inp_stride, hexagon_session):
+        """Test quantized conv2d intrinsic implementation"""
+        assert act_shape[3] == wgt_shape[2]
+
+        # Currently, input offset does not affect the output shape
+        def get_out_shape(ash, wsh, inp_stride):
+            assert ash[3] == wsh[2]
+            osh = (
+                ash[0],
+                (ash[1] - wsh[0]) // inp_stride[0] + 1,
+                (ash[2] - wsh[1]) // inp_stride[1] + 1,
+                wsh[3],
+            )
+            assert all(x > 0 for x in osh)
+            return osh
+
+        act_f = np.random.uniform(-1.5, 1.0, size=act_shape).astype("float32")
+        wgt_f = np.random.uniform(-1.5, 1.0, size=wgt_shape).astype("float32")
+
+        # Quantize activations using quantize_np
+        act_q, act_scale, act_zp = quantize_np(act_f, dtype="uint8")
+        act_q = act_q.reshape(act_f.shape)
+
+        # Quantize weights using quantize_np
+        wgt_q, wgt_scale, wgt_zp = quantize_np(wgt_f, dtype="int8")
+        wgt_q = wgt_q.reshape(wgt_f.shape)
+
+        # Generate reference output
+        ref_out = conv2d_nhwc_python(act_f, wgt_f, stride=inp_stride, padding="VALID")
+
+        ref_out_q, out_scale, out_zp = quantize_np(ref_out, dtype="uint8")
+        ref_out_q = ref_out_q.reshape(ref_out.shape)
+
+        final_scale = act_scale * wgt_scale / out_scale
+        fixed_final_scale, scale_factor = get_fixed_point_value(final_scale)
+
+        module = build_conv2d(get_hexagon_target("v69"))
+        mod = hexagon_session.load_module(module)
+
+        output_shape = get_out_shape(act_shape, wgt_shape, inp_stride)
+
+        output = tvm.nd.array(
+            np.zeros(output_shape, dtype="uint8"),
+            device=hexagon_session.device,
+        )
+        mod(
+            tvm.nd.array(act_q, device=hexagon_session.device),
+            tvm.nd.array(wgt_q, device=hexagon_session.device),
+            act_scale,
+            act_zp,
+            wgt_scale,
+            wgt_zp,
+            out_scale,
+            out_zp,
+            inp_stride[0],  # stride_height
+            inp_stride[1],  # stride_width
+            fixed_final_scale,
+            scale_factor,
+            output,
+        )
+
+        out_q = output.numpy()
+
+        tvm.testing.assert_allclose(out_q, ref_out_q, rtol=0, atol=2)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
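The test folds the three quantization scales into final_scale = act_scale * wgt_scale / out_scale and passes it to the kernel as a fixed-point pair from get_fixed_point_value, so the device code can requantize with integer multiplies and shifts instead of floating-point math. The sketch below illustrates that idea only; to_fixed_point is a hypothetical helper, and the real get_fixed_point_value in tvm.topi.hexagon.utils uses its own normalization and integer width, which is not restated here.

FRAC_BITS = 15  # purely illustrative choice of precision

def to_fixed_point(scale):
    # Hypothetical helper: represent a positive float scale as (mantissa, shift)
    # with scale ~= mantissa / 2**shift.
    return int(round(scale * (1 << FRAC_BITS))), FRAC_BITS

final_scale = 0.1234                  # e.g. act_scale * wgt_scale / out_scale
mantissa, shift = to_fixed_point(final_scale)

acc = 1000                            # example int32 accumulator value
requant = (acc * mantissa) >> shift   # integer-only stand-in for acc * final_scale
assert abs(requant - round(acc * final_scale)) <= 1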