9 changes: 9 additions & 0 deletions cmake/modules/Hexagon.cmake
@@ -178,6 +178,15 @@ if(BUILD_FOR_HEXAGON)
"${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc"
)

include_directories(
"${TVMRT_SOURCE_DIR}/hexagon/ops"
)

set_source_files_properties(
"${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_quant_hvx.cc"
PROPERTIES COMPILE_FLAGS "-mhvx"
Contributor

Are we confident that -mhvx is supported by all of the compilers that might build this code?

I'm assuming that typically the clang provided by Hexagon Toolchain will be used. But I'm a little fuzzy about the intended level of support for other compilers, e.g. a user-supplied build of Clang/LLVM.

Contributor

Would it make sense to update src/runtime/hexagon/README.md to clarify the version(s) of LLVM that support flags like -mhvx?

Or alternatively, use CMake's CheckCXXCompilerFlag function to see if -mhvx is supported, and only use that flag if it is?
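For reference, a minimal sketch of that alternative (not part of this PR; the HEXAGON_HAS_MHVX variable name is illustrative):

    # Probe the active C++ compiler for -mhvx and only apply the flag when it is accepted.
    include(CheckCXXCompilerFlag)
    check_cxx_compiler_flag("-mhvx" HEXAGON_HAS_MHVX)
    if(HEXAGON_HAS_MHVX)
      set_source_files_properties(
        "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_quant_hvx.cc"
        PROPERTIES COMPILE_FLAGS "-mhvx")
    else()
      message(WARNING "Compiler does not accept -mhvx; skipping HVX-specific compile flags")
    endif()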

Contributor Author

Thanks for the review @cconvey.

I can add the details in the README or add a CMake check. However, the -mhvx flag was added to clang back in 2017, in the LLVM 6.0 release if not earlier, which predates the entire TVM project, so we can safely assume that -mhvx will be available for practically anyone building TVM now.

If you think it might still be better to add the check or the README change, please let me know which one you think makes more sense and I can make that change. Thanks

Contributor

That makes total sense; I didn't realize -mhvx support went back that far. I agree that there's no need for any additional documentation or checking.

)

set_source_files_properties(
"${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc"
PROPERTIES COMPILE_FLAGS "-mhvx"
src/runtime/hexagon/ops/conv2d.h
@@ -20,6 +20,7 @@
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/device_api.h>

#include <algorithm>
#include <cassert>

#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_
@@ -28,6 +29,7 @@
namespace tvm {
namespace runtime {
namespace hexagon {
namespace conv_utils {
static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0};

// Standalone DLTensor: the standalone-ness means that this object owns the shape
@@ -75,26 +77,49 @@ inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); }

inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); }

constexpr int xyc_to_sm_16b(int y, int x, int c) {
constexpr int yxc_to_sm_16b(int y, int x, int c) {
// Map y,x,c coordinates within a block to the offset (in 16-bit elements)
// from the beginning of the block in spatial-major layout.
// 10-bit spatial mask: yyyxcccccx
assert(y >= 0 && x >= 0 && c >= 0);
assert(y < 8 && x < 4 && c < 32);
return y << 7 | (x & 2) << 5 | c << 1 | (x & 1);
}

constexpr int yxc_to_sm_8b(int y, int x, int c) {
// Map y,x,c coordinates within a block to the offset (in 8-bit elements)
// from the beginning of the block in spatial-major layout.
// 11-bit spatial mask: yyyxxxccccc
Contributor

Add a check to make sure only the bits we expect are set in the inputs: for y and x only the lowest 3 bits, and for c only the lowest 5 bits.

Contributor Author

I consciously avoided the checks here because these functions are used for indexing within the innermost loops and need to be really fast. I actually was planning to remove the check from the above yxc_to_sm_16b function as well.

The thought I had was to add the assert statements and then disable them for release builds with #define NDEBUG. Not sure if there's a better solution.

Contributor

I'm pretty sure we can rely on assert being disabled when CMAKE_BUILD_TYPE=Release. See https://stackoverflow.com/questions/34302265/does-cmake-build-type-release-imply-dndebug.
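As a quick illustration (assuming CMake's stock compiler defaults are not overridden), the Release configuration already defines NDEBUG, which compiles assert() out:

    # Illustrative only, not part of this PR: with GCC/Clang,
    # CMAKE_CXX_FLAGS_RELEASE defaults to "-O3 -DNDEBUG", so assert()
    # becomes a no-op in Release builds without any extra handling.
    message(STATUS "Release C++ flags: ${CMAKE_CXX_FLAGS_RELEASE}")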

Contributor
@janetsc Nov 29, 2022

Another option is to check the loop bounds in the caller to make sure y, x, and c can't get bigger than can be expressed. (And put a comment here to that effect: that it is the caller's responsibility to check on release builds.)

Contributor Author

I've added the asserts directly inside the index functions that would be disabled with Release builds.

I thought about adding it in the outer loops as you suggested, but that is already guaranteed by the current code, since block_height/block_width/block_depth are expected to be the block sizes; for other uses of the index functions, it is the caller's responsibility anyway.

Contributor

Agreed, this is much safer!

assert(y >= 0 && x >= 0 && c >= 0);
assert(y < 8 && x < 8 && c < 32);
return y << 8 | x << 5 | c;
}

constexpr int hwio_to_sm_8b(int width, int y, int x, int i, int o) {
// Map y,x,i,o coordinates within a chunk (assuming the origin at the
// top-left spatial corner) to the offset (in 8-bit elements) from the
// beginning of the chunk in spatial-major layout.
// Spatial mask: p..piiioooooii, where p..p are position bits.
assert(width >= 1);
assert(y >= 0 && x >= 0 && i >= 0 && o >= 0);
assert(i < 32 && o < 32);
int p = y * width + (width - 1 - x);
return p << 10 | (i & 0x1c) << 5 | o << 2 | (i & 3);
Contributor

Suggest similar bounds checking here.

Contributor Author

Same comment as above. I can probably add asserts if we can disable them for release builds later.

}

constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) {
// Map y,x,i,o coordinates within a chunk (assuming the origin at the
// top-left spatial corner) to the offset (in 16-bit elements) from the
// beginning of the chunk in spatial-major layout.
// Spatial mask: p..piiiioooooi, where p..p are position bits.
assert(width >= 1);
assert(y >= 0 && x >= 0 && i >= 0 && o >= 0);
assert(i < 32 && o < 32);
int p = y * width + (width - 1 - x);
return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1);
}

inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; }
constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; }

// Returns the block address at the given index
// Assumptions
@@ -123,6 +148,10 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) {
* The input is mapped into the below mentioned layout (notation similar to index map used for
* transform layout):
*
* For uint8_t type
* lambda n, h, w, c: n, h//8, w//8, c//32, AXIS_SEPARATOR, h%8, w%8, c%32
*
* For uint16_t type
* lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2
*
* where AXIS_SEPARATOR represents split up in the physical layout
@@ -133,7 +162,48 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) {
* @param width
* @param depth
*/
void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth);
template <typename T, int block_height, int block_width, int block_depth>
void blockize_hwc(void* out, void* inp_flat, int height, int width, int depth) {
Contributor

Would it make sense for inp_flat's type to be const void* rather than void*?

This is probably a bit of a stylistic choice; I just figured I'd ask.

Contributor Author

I agree with both; I'll add the asserts and the const void* for the arguments. Thanks.

int (*index_func)(int, int, int);
if constexpr (std::is_same_v<T, uint8_t>)
index_func = yxc_to_sm_8b;
else if constexpr (std::is_same_v<T, uint16_t>)
index_func = yxc_to_sm_16b;
else
LOG_ERROR << "blockize_hwc is only supported for uint8_t and uint16_t types";

auto inp_data = static_cast<T*>(inp_flat);
auto out_data = static_cast<uintptr_t*>(out);
const int stride_x = depth;
const int stride_y = stride_x * width;

for (int cy = 0; cy < height; cy += block_height) {
for (int cx = 0; cx < width; cx += block_width) {
for (int cc = 0; cc < depth; cc += block_depth) {
auto block = reinterpret_cast<T*>(*out_data++);
int max_y = std::min(block_height, height - cy);
int max_x = std::min(block_width, width - cx);
int max_c = std::min(block_depth, depth - cc);
for (int y = 0; y < max_y; ++y) {
for (int x = 0; x < max_x; ++x) {
for (int c = 0; c < max_c; ++c) {
block[index_func(y, x, c)] =
inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)];
}
for (int c = max_c; c < block_depth; ++c) block[index_func(y, x, c)] = 0;
}
for (int x = max_x; x < block_width; ++x) {
for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0;
}
}

for (int y = max_y; y < block_height; ++y)
for (int x = 0; x < block_width; ++x)
for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0;
} // cc
} // cx
} // cy
}

/**
* @brief Convert back from non-contiguous layout to a flat layout
@@ -144,7 +214,42 @@ void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int dept
* @param width
* @param depth
*/
void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth);
template <typename T, int block_height, int block_width, int block_depth>
void deblockize_hwc(void* out_flat, void* inp, int height, int width, int depth) {
Contributor

Would it make sense for the type of inp to be const void*?

Contributor Author

I'll add the const void*; it makes sense. Thanks.

int (*index_func)(int, int, int);
if constexpr (std::is_same_v<T, uint8_t>)
index_func = yxc_to_sm_8b;
else if constexpr (std::is_same_v<T, uint16_t>)
index_func = yxc_to_sm_16b;
else
LOG_ERROR << "deblockize_hwc is only supported for uint8_t and uint16_t types";

uintptr_t* inp_data = static_cast<uintptr_t*>(inp);
T* out_data = static_cast<T*>(out_flat);
const int stride_x = depth;
const int stride_y = stride_x * width;

for (int cy = 0; cy < height; cy += block_height) {
for (int cx = 0; cx < width; cx += block_width) {
for (int cc = 0; cc < depth; cc += block_depth) {
auto block = reinterpret_cast<T*>(*inp_data);
int max_y = std::min(block_height, height - cy);
int max_x = std::min(block_width, width - cx);
int max_c = std::min(block_depth, depth - cc);
for (int y = 0; y < max_y; ++y) {
for (int x = 0; x < max_x; ++x) {
for (int c = 0; c < max_c; ++c) {
out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] =
block[index_func(y, x, c)];
}
}
}

inp_data++;
}
}
}
}

/**
* @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below:
@@ -175,22 +280,50 @@
*/
void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height,
int width, int idepth, int odepth);
void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width,
int idepth, int odepth);

template <typename T, int block_height, int block_width, int block_depth>
SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat,
bool copy_data);
bool copy_data) {
tvm::runtime::String vtcm_scope = "global.vtcm";

// Allocate blocks for activations. We will use the block pointers
// directly from the allocated area.
int n = nhwc_flat->shape[0];
int h = round_up(nhwc_flat->shape[1], block_height);
int w = round_up(nhwc_flat->shape[2], block_width);
int c = round_up(nhwc_flat->shape[3], block_depth);
int64_t shape_2d[2] = {(n * h * w * c) / (block_height * block_width * block_depth),
block_height * block_width * block_depth};
void* nhwc_vtcm =
device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope);
if (copy_data) {
blockize_hwc<T, block_height, block_width, block_depth>(
nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], nhwc_flat->shape[3]);
}

int calculate_num_weight_chunks(int64_t* shape_hwio);
return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm,
{n, h / block_height, w / block_width, c / block_depth});
}

int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width,
int chunk_in_channel, int chunk_out_channel);

SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
int num_chunks, void** ptr_table);

SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
int num_chunks, void** ptr_table, int wgt_zp = 0);

template <size_t N>
void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor<N>& tensor) {
if (auto* data_space = tensor.GetDataSpace()) {
device_api->FreeDataSpace(hexagon_device, data_space);
}
}

} // namespace conv_utils
} // namespace hexagon
} // namespace runtime
} // namespace tvm
57 changes: 31 additions & 26 deletions src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
@@ -27,7 +27,7 @@
#include <cassert>
#include <cinttypes>

#include "tvm/runtime/hexagon/ops/conv2d.h"
#include "conv2d.h"

// Current limitations:
// - N in NHWC must be 1
@@ -68,7 +68,7 @@ namespace hexagon {
*/
static inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi,
int xio, int ci, int xii, const DLTensor& tensor) {
auto block_ptr = nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c);
auto block_ptr = conv_utils::nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c);
auto block_offset = yi * 128 + xio * 64 + ci * 2 + xii;
auto first_element_ptr = reinterpret_cast<uint16_t*>(block_ptr);
return first_element_ptr + block_offset;
@@ -279,10 +279,10 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*)
}
int fx = (fw < wgt_chunk_thin_width) ? fw : ((fw - wgt_chunk_thin_width) % 4);
int fy = fh % 8;
for (int c = 0; c < round_up(filt_idepth, 2); c += 2) {
for (int c = 0; c < conv_utils::round_up(filt_idepth, 2); c += 2) {
int out_act_cc = c / 32;
int ci = c % 32;
auto wgt_chunk = hwio_at(cr_filt, fch, fcw, out_act_cc, out_c);
auto wgt_chunk = conv_utils::hwio_at(cr_filt, fch, fcw, out_act_cc, out_c);

// Find weight chunk offset ptr
int max_x = (fcw == 0) ? wgt_chunk_thin_width : 4;
@@ -306,7 +306,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*)
true_wo, ci, true_wi, cr_act);
HVX_Vector act_vec = getInputVector(act_element_ptr);

auto wgt_chunk_offset = hwio_to_sm_16b(max_x, fy, fx, ci, 0);
auto wgt_chunk_offset = conv_utils::hwio_to_sm_16b(max_x, fy, fx, ci, 0);
auto base_chunk_ptr = reinterpret_cast<uint16_t*>(wgt_chunk);
auto chunk_ptr = base_chunk_ptr + wgt_chunk_offset;

@@ -404,7 +404,7 @@

int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val,
int out_code, void* res_handle) {
namespace hexagonrt = tvm::runtime::hexagon;
namespace conv_utils = tvm::runtime::hexagon::conv_utils;
ICHECK_EQ(num_args, 7) << "Unexpected number of arguments";
ICHECK_EQ(type_codes[0], kTVMDLTensorHandle)
<< "First argument is expected to be the input tensor"; // Input activations
@@ -440,50 +440,55 @@ int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue*
<< wgt_flat->shape[2] << "x" << wgt_flat->shape[3] << ", pad_top=" << pad_top
<< ", pad_left=" << pad_left;

auto* device_api = tvm::runtime::DeviceAPI::Get(hexagonrt::hexagon_device, false);
auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false);
ICHECK(device_api != nullptr);
tvm::runtime::String vtcm_scope = "global.vtcm";

auto act_vtcm = hexagonrt::prepare_nhwc(device_api, act_flat, /*copy_data=*/true);
auto act_vtcm =
conv_utils::prepare_nhwc<uint16_t, 8, 4, 32>(device_api, act_flat, /*copy_data=*/true);

ICHECK_NE(wgt_flat->shape[0], 0) << "Weights height should not be zero";
ICHECK_NE(wgt_flat->shape[1], 0) << "Weights width should not be zero";
ICHECK_NE(wgt_flat->shape[2], 0) << "Weights input channels should not be zero";
ICHECK_NE(wgt_flat->shape[3], 0) << "Weights output channels should not be zero";
int num_wgt_chunks = hexagonrt::calculate_num_weight_chunks(wgt_flat->shape);
int num_wgt_chunks = conv_utils::calculate_num_weight_chunks(
wgt_flat->shape, /* chunk_height */ 8, /* chunk_width */ 4, /* chunk_in_channel */ 32,
/* chunk_out_channel */ 32);

LOG_INFO << "num_wgt_chunks: " << num_wgt_chunks;
auto wgt_ptr_table =
reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
auto wgt_vtcm = hexagonrt::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table);
auto wgt_vtcm = conv_utils::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table);

auto out_vtcm = hexagonrt::prepare_nhwc(device_api, out_flat, /*copy_data=*/false);
auto out_vtcm =
conv_utils::prepare_nhwc<uint16_t, 8, 4, 32>(device_api, out_flat, /*copy_data=*/false);

// Prepare zero_block
int64_t block_nbytes = 2048;
void* zero_block = device_api->AllocDataSpace(hexagonrt::hexagon_device, 1, &block_nbytes,
void* zero_block = device_api->AllocDataSpace(conv_utils::hexagon_device, 1, &block_nbytes,
tvm::runtime::DataType::UInt(8), vtcm_scope);
memset(zero_block, 0, 2048);

// FIXME: Setting bias to zero_block: this works for up to 256 output channels.
auto bias_flat =
hexagonrt::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]);
auto act_shape = hexagonrt::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape);
auto filt_shape = hexagonrt::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape);
auto pad_shape = hexagonrt::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left});
auto out_shape = hexagonrt::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape);
conv_utils::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]);
auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape);
auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape);
auto pad_shape = conv_utils::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left});
auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape);
bool relu = false;

hexagonrt::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, bias_flat,
filt_shape, pad_shape, relu, stride_h, stride_w,
hexagonrt::to_uint(zero_block));
tvm::runtime::hexagon::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape,
bias_flat, filt_shape, pad_shape, relu, stride_h,
stride_w, conv_utils::to_uint(zero_block));

hexagonrt::deblockize_hwc_16b(out_flat->data, out_vtcm.data, out_flat->shape[1],
out_flat->shape[2], out_flat->shape[3]);
conv_utils::deblockize_hwc<uint16_t, 8, 4, 32>(out_flat->data, out_vtcm.data, out_flat->shape[1],
out_flat->shape[2], out_flat->shape[3]);

device_api->FreeDataSpace(hexagonrt::hexagon_device, zero_block);
hexagonrt::release(device_api, out_vtcm);
hexagonrt::release(device_api, wgt_vtcm);
hexagonrt::release(device_api, act_vtcm);
device_api->FreeDataSpace(conv_utils::hexagon_device, zero_block);
conv_utils::release(device_api, out_vtcm);
conv_utils::release(device_api, wgt_vtcm);
conv_utils::release(device_api, act_vtcm);

return 0;
}