diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index 31cece8a19e0..887211893558 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -178,6 +178,15 @@ if(BUILD_FOR_HEXAGON) "${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc" ) + include_directories( + "${TVMRT_SOURCE_DIR}/hexagon/ops" + ) + + set_source_files_properties( + "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_quant_hvx.cc" + PROPERTIES COMPILE_FLAGS "-mhvx" + ) + set_source_files_properties( "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc" PROPERTIES COMPILE_FLAGS "-mhvx" diff --git a/include/tvm/runtime/hexagon/ops/conv2d.h b/src/runtime/hexagon/ops/conv2d.h similarity index 54% rename from include/tvm/runtime/hexagon/ops/conv2d.h rename to src/runtime/hexagon/ops/conv2d.h index d759149727e8..76c6cccff73d 100644 --- a/include/tvm/runtime/hexagon/ops/conv2d.h +++ b/src/runtime/hexagon/ops/conv2d.h @@ -20,6 +20,7 @@ #include #include +#include #include #ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ @@ -28,6 +29,7 @@ namespace tvm { namespace runtime { namespace hexagon { +namespace conv_utils { static constexpr auto hexagon_device = DLDevice{static_cast(kDLHexagon), 0}; // Standalone DLTensor: the standalone-ness means that this object owns the shape @@ -75,14 +77,36 @@ inline void* to_ptr(uintptr_t v) { return reinterpret_cast(v); } inline uintptr_t to_uint(void* ptr) { return reinterpret_cast(ptr); } -constexpr int xyc_to_sm_16b(int y, int x, int c) { +constexpr int yxc_to_sm_16b(int y, int x, int c) { // Map y,x,c coordinates within a block to the offset (in 16-bit elements) // from the beginning of the block in spatial-major layout. // 10-bit spatial mask: yyyxcccccx assert(y >= 0 && x >= 0 && c >= 0); + assert(y < 8 && x < 4 && c < 32); return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); } +constexpr int yxc_to_sm_8b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 8-bit elements) + // from the beginning of the block in spatial-major layout. + // 10-bit spatial mask: yyyxxxccccc + assert(y >= 0 && x >= 0 && c >= 0); + assert(y < 8 && x < 8 && c < 32); + return y << 8 | x << 5 | c; +} + +constexpr int hwio_to_sm_8b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 8-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiioooooii, where p..p are position bits. + assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + assert(i < 32 && o < 32); + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1c) << 5 | o << 2 | (i & 3); +} + constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { // Map y,x,i,o coordinates within a chunk (assuming the origin at the // top-left spatial corner) to the offset (in 16-bit elements) from the @@ -90,11 +114,12 @@ constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { // Spatial mask: p..piiiioooooi, where p..p are position bits. 
assert(width >= 1); assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + assert(i < 32 && o < 32); int p = y * width + (width - 1 - x); return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); } -inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } +constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } // Returns the block address at the given index // Assumptions @@ -123,6 +148,10 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { * The input is mapped into the below mentioned layout (notation similar to index map used for * transform layout): * + * For uint8_t type + * lambda n, h, w, c: n, h//8, w//8, c//32, AXIS_SEPARATOR, h%8, w%8, c%32 + * + * For uint16_t type * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2 * * where AXIS_SEPARATOR represents split up in the physical layout @@ -133,7 +162,48 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { * @param width * @param depth */ -void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth); +template +void blockize_hwc(void* out, void* inp_flat, int height, int width, int depth) { + int (*index_func)(int, int, int); + if constexpr (std::is_same_v) + index_func = yxc_to_sm_8b; + else if constexpr (std::is_same_v) + index_func = yxc_to_sm_16b; + else + LOG_ERROR << "blockize_hwc is only supported for uint8_t and uint16_t types"; + + auto inp_data = static_cast(inp_flat); + auto out_data = static_cast(out); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += block_height) { + for (int cx = 0; cx < width; cx += block_width) { + for (int cc = 0; cc < depth; cc += block_depth) { + auto block = reinterpret_cast(*out_data++); + int max_y = std::min(block_height, height - cy); + int max_x = std::min(block_width, width - cx); + int max_c = std::min(block_depth, depth - cc); + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + block[index_func(y, x, c)] = + inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)]; + } + for (int c = max_c; c < block_depth; ++c) block[index_func(y, x, c)] = 0; + } + for (int x = max_x; x < block_width; ++x) { + for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0; + } + } + + for (int y = max_y; y < block_height; ++y) + for (int x = 0; x < block_width; ++x) + for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0; + } // cc + } // cx + } // cy +} /** * @brief Convert back from non-contguous layout to a flat layout @@ -144,7 +214,42 @@ void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int dept * @param width * @param depth */ -void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth); +template +void deblockize_hwc(void* out_flat, void* inp, int height, int width, int depth) { + int (*index_func)(int, int, int); + if constexpr (std::is_same_v) + index_func = yxc_to_sm_8b; + else if constexpr (std::is_same_v) + index_func = yxc_to_sm_16b; + else + LOG_ERROR << "deblockize_hwc is only supported for uint8_t and uint16_t types"; + + uintptr_t* inp_data = static_cast(inp); + T* out_data = static_cast(out_flat); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += block_height) { + for (int cx = 0; cx < width; cx += block_width) { + for (int cc = 0; cc < depth; cc += block_depth) { + auto block = reinterpret_cast(*inp_data); + int 
max_y = std::min(block_height, height - cy); + int max_x = std::min(block_width, width - cx); + int max_c = std::min(block_depth, depth - cc); + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] = + block[index_func(y, x, c)]; + } + } + } + + inp_data++; + } + } + } +} /** * @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below: @@ -175,15 +280,42 @@ void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int de */ void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width, int idepth, int odepth); +void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width, + int idepth, int odepth); +template SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat, - bool copy_data); + bool copy_data) { + tvm::runtime::String vtcm_scope = "global.vtcm"; + + // Allocate blocks for activations. We will use the block pointers + // directly from the allocated area. + int n = nhwc_flat->shape[0]; + int h = round_up(nhwc_flat->shape[1], block_height); + int w = round_up(nhwc_flat->shape[2], block_width); + int c = round_up(nhwc_flat->shape[3], block_depth); + int64_t shape_2d[2] = {(n * h * w * c) / (block_height * block_width * block_depth), + block_height * block_width * block_depth}; + void* nhwc_vtcm = + device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope); + if (copy_data) { + blockize_hwc( + nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], nhwc_flat->shape[3]); + } -int calculate_num_weight_chunks(int64_t* shape_hwio); + return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm, + {n, h / block_height, w / block_width, c / block_depth}); +} + +int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width, + int chunk_in_channel, int chunk_out_channel); SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, int num_chunks, void** ptr_table); +SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, + int num_chunks, void** ptr_table, int wgt_zp = 0); + template void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor& tensor) { if (auto* data_space = tensor.GetDataSpace()) { @@ -191,6 +323,7 @@ void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor& tensor) { } } +} // namespace conv_utils } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc index a478fbab352d..53ea0868ad2a 100644 --- a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc +++ b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc @@ -27,7 +27,7 @@ #include #include -#include "tvm/runtime/hexagon/ops/conv2d.h" +#include "conv2d.h" // Current limitations: // - N in NHWC must be 1 @@ -68,7 +68,7 @@ namespace hexagon { */ static inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi, int xio, int ci, int xii, const DLTensor& tensor) { - auto block_ptr = nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c); + auto block_ptr = conv_utils::nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c); auto block_offset = yi * 128 + xio * 64 + ci * 2 + xii; auto first_element_ptr = reinterpret_cast(block_ptr); return first_element_ptr + block_offset; @@ -279,10 
+279,10 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) } int fx = (fw < wgt_chunk_thin_width) ? fw : ((fw - wgt_chunk_thin_width) % 4); int fy = fh % 8; - for (int c = 0; c < round_up(filt_idepth, 2); c += 2) { + for (int c = 0; c < conv_utils::round_up(filt_idepth, 2); c += 2) { int out_act_cc = c / 32; int ci = c % 32; - auto wgt_chunk = hwio_at(cr_filt, fch, fcw, out_act_cc, out_c); + auto wgt_chunk = conv_utils::hwio_at(cr_filt, fch, fcw, out_act_cc, out_c); // Find weight chunk offset ptr int max_x = (fcw == 0) ? wgt_chunk_thin_width : 4; @@ -306,7 +306,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) true_wo, ci, true_wi, cr_act); HVX_Vector act_vec = getInputVector(act_element_ptr); - auto wgt_chunk_offset = hwio_to_sm_16b(max_x, fy, fx, ci, 0); + auto wgt_chunk_offset = conv_utils::hwio_to_sm_16b(max_x, fy, fx, ci, 0); auto base_chunk_ptr = reinterpret_cast(wgt_chunk); auto chunk_ptr = base_chunk_ptr + wgt_chunk_offset; @@ -404,7 +404,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, int out_code, void* res_handle) { - namespace hexagonrt = tvm::runtime::hexagon; + namespace conv_utils = tvm::runtime::hexagon::conv_utils; ICHECK_EQ(num_args, 7) << "Unexpected number of arguments"; ICHECK_EQ(type_codes[0], kTVMDLTensorHandle) << "First argument is expected to be the input tensor"; // Input activations @@ -440,50 +440,55 @@ int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* << wgt_flat->shape[2] << "x" << wgt_flat->shape[3] << ", pad_top=" << pad_top << ", pad_left=" << pad_left; - auto* device_api = tvm::runtime::DeviceAPI::Get(hexagonrt::hexagon_device, false); + auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false); ICHECK(device_api != nullptr); tvm::runtime::String vtcm_scope = "global.vtcm"; - auto act_vtcm = hexagonrt::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); + auto act_vtcm = + conv_utils::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); ICHECK_NE(wgt_flat->shape[0], 0) << "Weights height should not be zero"; ICHECK_NE(wgt_flat->shape[1], 0) << "Weights width should not be zero"; ICHECK_NE(wgt_flat->shape[2], 0) << "Weights input channels should not be zero"; ICHECK_NE(wgt_flat->shape[3], 0) << "Weights output channels should not be zero"; - int num_wgt_chunks = hexagonrt::calculate_num_weight_chunks(wgt_flat->shape); + int num_wgt_chunks = conv_utils::calculate_num_weight_chunks( + wgt_flat->shape, /* chunk_height */ 8, /* chunk_width */ 4, /* chunk_in_channel */ 32, + /* chunk_out_channel */ 32); + LOG_INFO << "num_wgt_chunks: " << num_wgt_chunks; auto wgt_ptr_table = reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); - auto wgt_vtcm = hexagonrt::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table); + auto wgt_vtcm = conv_utils::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table); - auto out_vtcm = hexagonrt::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); + auto out_vtcm = + conv_utils::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); // Prepare zero_block int64_t block_nbytes = 2048; - void* zero_block = device_api->AllocDataSpace(hexagonrt::hexagon_device, 1, &block_nbytes, + void* zero_block = device_api->AllocDataSpace(conv_utils::hexagon_device, 1, &block_nbytes, tvm::runtime::DataType::UInt(8), vtcm_scope); memset(zero_block, 0, 
2048); // FIXME: Setting bias to zero_block: this works for up to 256 output channels. auto bias_flat = - hexagonrt::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]); - auto act_shape = hexagonrt::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); - auto filt_shape = hexagonrt::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); - auto pad_shape = hexagonrt::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left}); - auto out_shape = hexagonrt::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); + conv_utils::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]); + auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); + auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); + auto pad_shape = conv_utils::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left}); + auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); bool relu = false; - hexagonrt::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, bias_flat, - filt_shape, pad_shape, relu, stride_h, stride_w, - hexagonrt::to_uint(zero_block)); + tvm::runtime::hexagon::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, + bias_flat, filt_shape, pad_shape, relu, stride_h, + stride_w, conv_utils::to_uint(zero_block)); - hexagonrt::deblockize_hwc_16b(out_flat->data, out_vtcm.data, out_flat->shape[1], - out_flat->shape[2], out_flat->shape[3]); + conv_utils::deblockize_hwc(out_flat->data, out_vtcm.data, out_flat->shape[1], + out_flat->shape[2], out_flat->shape[3]); - device_api->FreeDataSpace(hexagonrt::hexagon_device, zero_block); - hexagonrt::release(device_api, out_vtcm); - hexagonrt::release(device_api, wgt_vtcm); - hexagonrt::release(device_api, act_vtcm); + device_api->FreeDataSpace(conv_utils::hexagon_device, zero_block); + conv_utils::release(device_api, out_vtcm); + conv_utils::release(device_api, wgt_vtcm); + conv_utils::release(device_api, act_vtcm); return 0; } diff --git a/src/runtime/hexagon/ops/conv2d_quant_hvx.cc b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc new file mode 100644 index 000000000000..682eebb137c0 --- /dev/null +++ b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc @@ -0,0 +1,319 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include "conv2d.h" + +extern "C" int conv2d_packed_quant(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle); + +namespace tvm { +namespace runtime { +namespace hexagon { +inline uint8_t* getElementPtr_int8(int block_out_y, int block_out_x, int block_out_c, int yi, + int xi, int ci, const DLTensor& block) { + auto block_ptr = + tvm::runtime::hexagon::conv_utils::nhwc_at(block, 0, block_out_y, block_out_x, block_out_c); + const int width_stride = 32; + const int height_stride = width_stride * 8; + auto block_offset = yi * height_stride + xi * width_stride + ci; + auto first_element_ptr = reinterpret_cast(block_ptr); + return first_element_ptr + block_offset; +} + +inline int8_t* getWgtPtr_int8(int out_i, int out_o, int h, int w, int i, int o, + const DLTensor& wgt_vtcm, int width) { + auto data = static_cast(wgt_vtcm.data); + auto chunk = data[out_i * wgt_vtcm.shape[3] + out_o]; + auto base_chunk_ptr = reinterpret_cast(chunk); + auto wgt_chunk_offset = tvm::runtime::hexagon::conv_utils::hwio_to_sm_8b(width, h, w, i, o); + return base_chunk_ptr + wgt_chunk_offset; +} + +int32_t saturate_uint8(int32_t val) { return std::max(std::min(val, 255), 0); } + +int32_t saturate_int8(int32_t val) { return std::max(std::min(val, 127), -128); } + +/** + * @brief Compute the quantized convolution along with requantize with output quantization params to + * get uint8 outputs + * + * The quantized convolution is represented by the below equation + * out_scale(out_q - out_zp) = Σr,s,c(act_scale(act_q[n,h+r,w+s,c] - act_zp) * + * wgt_scale(wgt_q[r,s,c,o] - wgt_zp)) + * => out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_q[r,s,c,o] - wgt_zp)) + * * (act_scale*wgt_scale/out_scale) + out_zp + * out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_zp_q[r,s,c,o])) * + * (act_scale*wgt_scale/out_scale) + out_zp, where wgt_zp_q = (wgt_q[r,s,c,o] - wgt_zp) + * + * Assumptions/Limitations: + * - Strided convolution is not yet supported so the stride variables are unused + * + * @param cr_out blockized output tensor with zeros already filled in + * @param cr_act blockized activations + * @param cr_filt Chunkified weights as returned from output of prepare_hwio + * @param out_shape Original output shape of the tensor before blockization + * @param act_shape Original input shape + * @param filt_shape Original filter shape + * @param act_scale Quantization scale for activation + * @param act_zp Activations zero point + * @param wgt_scale Quantization scale for weights + * @param wgt_zp Weights zero point + * @param out_scale Quantization scale for output + * @param out_zp Output zero point + * @param fixed_final_scale Fixed point value of final_scale= (act_scale*wgt_scale/out_scale) + * @param scale_factor Scale factor for the fixed_final_scale + */ +void conv_layer_int8_hvx_whole(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) + const DLTensor& cr_filt, const DLTensor& out_shape, + const DLTensor& act_shape, const DLTensor& filt_shape, + float act_scale, int act_zp, float wgt_scale, int wgt_zp, + float out_scale, int out_zp, int fixed_final_scale, + int scale_factor) { + namespace conv_utils = tvm::runtime::hexagon::conv_utils; + int filt_height = filt_shape.shape[0]; + int filt_width = filt_shape.shape[1]; + int filt_idepth = filt_shape.shape[2]; + + int a_depth = cr_act.shape[3]; + + int o_height = cr_out.shape[1]; + int o_width = cr_out.shape[2]; + int o_depth = cr_out.shape[3]; + + int out_height = 
out_shape.shape[1]; + int out_width = out_shape.shape[2]; + + uint8_t act_zp_u8 = static_cast(act_zp); + int8_t wgt_zp_i8 = static_cast(wgt_zp); + + HVX_Vector act_zp_vec = Q6_Vb_vsplat_R(act_zp_u8); + HVX_Vector wgt_zp_vec = Q6_Vb_vsplat_R(wgt_zp_i8); + HVX_VectorPair wgt_zp_vec_pair = Q6_Wh_vsxt_Vb(wgt_zp_vec); + + ICHECK_EQ(a_depth, cr_filt.shape[2]) << "input depth should match weights input channels"; + ICHECK_EQ(o_depth, cr_filt.shape[3]) << "output depth should match the weights output channel"; + + uint32_t scale_u = static_cast(fixed_final_scale); + HVX_Vector scale_vec = Q6_V_vsplat_R(scale_u); + uint32_t new_scale_factor = static_cast(scale_factor - 16); + HVX_Vector out_zp_vec = Q6_V_vsplat_R(out_zp); + + auto computeOutVec = [&cr_act, &cr_filt, &act_zp_vec, &wgt_zp_vec_pair, &out_zp_vec, &scale_vec, + new_scale_factor, filt_height, filt_width, + filt_idepth](int out_h, int out_w, int out_c, int h, int w) -> HVX_Vector { + HVX_Vector out_vec = Q6_V_vzero(); + for (int fh = 0; fh < filt_height; ++fh) { + for (int fw = 0; fw < filt_width; ++fw) { + for (int c = 0; c < conv_utils::round_up(filt_idepth, 4); c += 4) { + int act_h = out_h * 8 + h + fh; + int act_ho = act_h / 8; + int act_hi = act_h % 8; + + int act_w = out_w * 8 + w + fw; + int act_wo = act_w / 8; + int act_wi = act_w % 8; + + int act_co = c / 32; + int act_ci = c % 32; + + uint8_t* act_ptr = + getElementPtr_int8(act_ho, act_wo, act_co, act_hi, act_wi, act_ci, cr_act); + + uint32_t four_act_elems = *reinterpret_cast(act_ptr); + HVX_Vector act_vec = Q6_V_vsplat_R(four_act_elems); + int8_t* wgt_ptr = getWgtPtr_int8(act_co, out_c, fh, fw, act_ci, 0, cr_filt, filt_width); + + HVX_Vector* wgt_vec_ptr = reinterpret_cast(wgt_ptr); + HVX_Vector wgt_vec = *wgt_vec_ptr; + + HVX_VectorPair act_vec_zp_diff = Q6_Wh_vsub_VubVub(act_vec, act_zp_vec); + HVX_VectorPair wgt_i16_vec_nodiff = Q6_Wh_vsxt_Vb(wgt_vec); + HVX_VectorPair wgt_i16_vec = Q6_Wh_vsub_WhWh_sat(wgt_i16_vec_nodiff, wgt_zp_vec_pair); + + out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_lo_W(act_vec_zp_diff), + Q6_V_lo_W(wgt_i16_vec)); + out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_hi_W(act_vec_zp_diff), + Q6_V_hi_W(wgt_i16_vec)); + } + } + } + HVX_Vector mul_vec = Q6_Vw_vmpye_VwVuh(out_vec, scale_vec); + HVX_Vector scaled_vec = Q6_Vw_vasr_VwR(mul_vec, new_scale_factor); + HVX_Vector sum_vec = Q6_Vw_vadd_VwVw(scaled_vec, out_zp_vec); + return sum_vec; + }; + + auto saturateAndStore = [&cr_out, &computeOutVec](int out_h, int out_w, int out_c, int h, int w) { + uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out); + HVX_Vector* out_vec_ptr = reinterpret_cast(out_ptr); + HVX_Vector out_vec1, out_vec2, out_vec3, out_vec4, out_vec; + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); + out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2); + out_vec4 = computeOutVec(out_h, out_w, out_c, h, w + 3); + + HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); + HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(out_vec4, out_vec3); + out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1); + *out_vec_ptr = out_vec; + }; + + for (int out_c = 0; out_c < o_depth; ++out_c) { + for (int out_h = 0; out_h < o_height; ++out_h) { + int max_y = std::min(8, out_height - out_h * 8); + for (int out_w = 0; out_w < o_width; ++out_w) { + int max_x = std::min(8, out_width - out_w * 8); + for (int h = 0; h < max_y; ++h) { + if (max_x == 8) { + for (int w = 0; w < max_x; w += 4) { + 
saturateAndStore(out_h, out_w, out_c, h, w); + } + } else { + int w = 0; + if (max_x >= 4) { + saturateAndStore(out_h, out_w, out_c, h, w); + w = 4; + } + uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out); + HVX_Vector* out_vec_ptr = reinterpret_cast(out_ptr); + HVX_Vector out_vec1, out_vec2, out_vec3, out_vec; + if (max_x % 4 == 1) { + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec1); + out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec); + *out_vec_ptr = out_vec; + } else if (max_x % 4 == 2) { + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); + HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); + out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec); + *out_vec_ptr = out_vec; + } else if (max_x % 4 == 3) { + out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); + out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); + out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2); + HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); + HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec3); + out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1); + *out_vec_ptr = out_vec; + } + } + } + } + } + } +} + +} // namespace hexagon +} // namespace runtime +} // namespace tvm + +int conv2d_packed_quant(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle) { + namespace conv_utils = tvm::runtime::hexagon::conv_utils; + ICHECK_EQ(num_args, 13) << "Unexpected number of arguments"; + ICHECK_EQ(type_codes[0], kTVMDLTensorHandle) + << "First argument is expected to be the input tensor"; // Input activations + ICHECK_EQ(type_codes[1], kTVMDLTensorHandle) + << "Second argument is expected to be the weights tensor"; // Weights + ICHECK_EQ(type_codes[2], kDLFloat) << "Third argument is expected to be the activation scale"; + ICHECK_EQ(type_codes[3], kDLInt) << "Fourth argument is expected to be the activation zero point"; + ICHECK_EQ(type_codes[4], kDLFloat) << "Fifth argument is expected to be the weight scale"; + ICHECK_EQ(type_codes[5], kDLInt) << "Sixth argument is expected to be the weight zero point"; + ICHECK_EQ(type_codes[6], kDLFloat) << "Seventh argument is expected to be the output scale"; + ICHECK_EQ(type_codes[7], kDLInt) << "Eigth argument is expected to be the output zero point"; + ICHECK_EQ(type_codes[8], kDLInt) << "Nineth argument is expected to be the stride_h"; // stride_h + ICHECK_EQ(type_codes[9], kDLInt) << "Tenth argument is expected to be the stride_w"; // stride_w + ICHECK_EQ(type_codes[10], kDLInt) << "Eleventh argument is expected to be fixed final scale"; + ICHECK_EQ(type_codes[11], kDLInt) << "Twelfth argument is expected to be scale factor"; + ICHECK_EQ(type_codes[12], kTVMDLTensorHandle) + << "Thirteenth argument is expected to be the output tensor"; // output + + auto* act_flat = static_cast(args[0].v_handle); + auto* wgt_flat = static_cast(args[1].v_handle); + auto* out_flat = static_cast(args[12].v_handle); + + // Temporary assertion until multiple batches are supported + ICHECK_EQ(act_flat->shape[0], 1) << "Input batch size more than 1 is not supported yet"; + + // Temporary assertion until multiple batches are supported + ICHECK_EQ(out_flat->shape[0], 1) << "Output batch size more than 1 is not supported yet"; + + float act_scale = args[2].v_float64; + int act_zp = args[3].v_int64; + LOG_INFO << "act_scale: " << act_scale << 
", act_zp: " << act_zp; + + float wgt_scale = args[4].v_float64; + int wgt_zp = args[5].v_int64; + LOG_INFO << "wgt_scale: " << wgt_scale << ", wgt_zp: " << wgt_zp; + + float out_scale = args[6].v_float64; + int out_zp = args[7].v_int64; + LOG_INFO << "out_scale: " << out_scale << ", out_zp: " << out_zp; + + int stride_h = args[8].v_int64; + int stride_w = args[9].v_int64; + LOG_INFO << "stride_h: " << stride_h << ", stride_w: " << stride_w; + + int fixed_final_scale = args[10].v_int64; + int scale_factor = args[11].v_int64; + LOG_INFO << "fixed_final_scale: " << fixed_final_scale << ", scale_factor: " << scale_factor; + + auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false); + ICHECK(device_api != nullptr); + tvm::runtime::String vtcm_scope = "global.vtcm"; + + auto act_vtcm = + conv_utils::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); + + int num_wgt_chunks = conv_utils::calculate_num_weight_chunks( + wgt_flat->shape, /* chunk_height */ wgt_flat->shape[0], + /* chunk_width */ wgt_flat->shape[1], /* chunk_in_channel */ 32, /* chunk_out_channel */ 32); + auto wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + + auto wgt_vtcm = + conv_utils::prepare_hwio_8b(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table, wgt_zp); + + auto out_vtcm = + conv_utils::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); + + auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); + auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); + auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); + + tvm::runtime::hexagon::conv_layer_int8_hvx_whole( + out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, filt_shape, act_scale, act_zp, wgt_scale, + wgt_zp, out_scale, out_zp, fixed_final_scale, scale_factor); + + conv_utils::deblockize_hwc(out_flat->data, out_vtcm.data, out_flat->shape[1], + out_flat->shape[2], out_flat->shape[3]); + + conv_utils::release(device_api, out_vtcm); + conv_utils::release(device_api, wgt_vtcm); + conv_utils::release(device_api, act_vtcm); + + return 0; +} diff --git a/src/runtime/hexagon/ops/conv_utils.cc b/src/runtime/hexagon/ops/conv_utils.cc index b10f7cc315b2..a40e23e463d4 100644 --- a/src/runtime/hexagon/ops/conv_utils.cc +++ b/src/runtime/hexagon/ops/conv_utils.cc @@ -17,96 +17,69 @@ * under the License. */ -#include "tvm/runtime/hexagon/ops/conv2d.h" +#include + +#include "conv2d.h" namespace tvm { namespace runtime { namespace hexagon { +namespace conv_utils { /** - * @brief Function to "blockize" the flat input data - * The term "blockize" is used to mention that the data is stored in non-contiguous blocks + * @brief Convert the layout of weights from flat to "chunked". 
The term chunked is explained below: * - * The input is mapped into the below mentioned layout (notation similar to index map used for - * transform layout): + * Weights are packed into the below mentioned layout (notation similar to index map): + * Since weights cannot be exactly represented into a index map notation, the + * base split up is mentioned below with a few deviations * - * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2 + * lambda h, w, i, o: o//32, i//32, h, w, (i%32)//4, o%32, i%4 * - * where AXIS_SEPARATOR represents split up in the physical layout + * The deviations are: + * - w is actually stored in the right to left order, as in 3,2,1,0 instead of 0,1,2,3 * - * @param out Pre-allocated output memory pointer - * @param inp_flat Flat input data pointer - * @param height - * @param width - * @param depth - */ -void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth) { - auto inp_data = static_cast(inp_flat); - auto out_data = static_cast(out); - const int stride_x = depth; - const int stride_y = stride_x * width; - - for (int cy = 0; cy < height; cy += 8) { - for (int cx = 0; cx < width; cx += 4) { - for (int cc = 0; cc < depth; cc += 32) { - auto block = reinterpret_cast(*out_data++); - int max_y = std::min(8, height - cy); - int max_x = std::min(4, width - cx); - int max_c = std::min(32, depth - cc); - for (int y = 0; y < max_y; ++y) { - for (int x = 0; x < max_x; ++x) { - for (int c = 0; c < max_c; ++c) { - block[xyc_to_sm_16b(y, x, c)] = - inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)]; - } - for (int c = max_c; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; - } - for (int x = max_x; x < 4; ++x) { - for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; - } - } - - for (int y = max_y; y < 8; ++y) - for (int x = 0; x < 4; ++x) - for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; - } // cc - } // cx - } // cy -} - -/** - * @brief Convert back from non-contguous layout to a flat layout - * - * @param out_flat Pre-allocated output memory pointer - * @param inp Blockized input data pointer + * @param out_ptr Base pointer table to be filled with the list of pointers to the first addresses + * of the "chunked" weights + * @param out_ptr_size The number of chunks + * @param out Pointer to pre-allocated output memory + * @param inp Pointer to flat input data * @param height * @param width - * @param depth + * @param idepth + * @param odepth */ -void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth) { - uintptr_t* inp_data = static_cast(inp); - uint16_t* out_data = static_cast(out_flat); - const int stride_x = depth; +void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width, + int idepth, int odepth, int wgt_zp) { + auto inp_data = static_cast(inp); + auto out_data = static_cast(out); + const int stride_i = odepth; + const int stride_x = stride_i * idepth; const int stride_y = stride_x * width; - for (int cy = 0; cy < height; cy += 8) { - for (int cx = 0; cx < width; cx += 4) { - for (int cc = 0; cc < depth; cc += 32) { - auto block = reinterpret_cast(*inp_data); - int max_y = std::min(8, height - cy); - int max_x = std::min(4, width - cx); - int max_c = std::min(32, depth - cc); - for (int y = 0; y < max_y; ++y) { - for (int x = 0; x < max_x; ++x) { - for (int c = 0; c < max_c; ++c) { - out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] = - block[xyc_to_sm_16b(y, x, c)]; + for (int ci = 0; ci < 
idepth; ci += 32) { + for (int co = 0; co < odepth; co += 32) { + int max_i = std::min(32, idepth - ci); + int max_o = std::min(32, odepth - co); + + auto chunk = out_data; + for (int y = 0; y < height; ++y) { + for (int x = width - 1; x >= 0; --x) { + for (int i = 0; i < max_i; ++i) { + for (int o = 0; o < max_o; ++o) { + chunk[hwio_to_sm_8b(width, y, x, i, o)] = + inp_data[y * stride_y + x * stride_x + (ci + i) * stride_i + (co + o)]; } + for (int o = max_o; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp; } + for (int i = max_i; i < 32; ++i) + for (int o = 0; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp; } - - inp_data++; } + + *out_ptr++ = chunk; + out_data += height * width * 32 * 32; + out_ptr_size--; + assert(out_ptr_size >= 0); } } } @@ -184,25 +157,27 @@ void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, i } } -SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat, - bool copy_data) { +std::tuple getHWIO(const DLTensor* hwio_flat) { + int h = hwio_flat->shape[0]; + int w = hwio_flat->shape[1]; + int i = round_up(hwio_flat->shape[2], 32); + int o = round_up(hwio_flat->shape[3], 32); + return std::make_tuple(h, w, i, o); +} + +SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, + int num_chunks, void** ptr_table, int wgt_zp) { tvm::runtime::String vtcm_scope = "global.vtcm"; - // Allocate blocks for activations. We will use the block pointers - // directly from the allocated area. - int n = nhwc_flat->shape[0]; - int h = round_up(nhwc_flat->shape[1], 8); - int w = round_up(nhwc_flat->shape[2], 4); - int c = round_up(nhwc_flat->shape[3], 32); - int64_t shape_2d[2] = {(n * h * w * c) / (8 * 4 * 32), 8 * 4 * 32}; - void* nhwc_vtcm = - device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope); - if (copy_data) { - blockize_hwc_16b(nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], - nhwc_flat->shape[3]); - } + auto [h, w, i, o] = getHWIO(hwio_flat); + int64_t shape_1d[] = {h * w * i * o}; + void* hwio_vtcm = + device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope); - return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm, {n, h / 8, w / 4, c / 32}); + chunkify_hwio_8b(ptr_table, num_chunks, hwio_vtcm, hwio_flat->data, hwio_flat->shape[0], + hwio_flat->shape[1], hwio_flat->shape[2], hwio_flat->shape[3], wgt_zp); + + return SDLTensor<4>(ptr_table, hwio_flat->dtype, hwio_vtcm, {1, 1, i / 32, o / 32}); } SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, @@ -214,10 +189,7 @@ SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* h // height- or width-wise, so filter chunks may have different sizes. // A filter chunk is a block of size HxWx32x32, where H, W are at most // height and width of a block respectively. 
- int h = hwio_flat->shape[0]; - int w = hwio_flat->shape[1]; - int i = round_up(hwio_flat->shape[2], 32); - int o = round_up(hwio_flat->shape[3], 32); + auto [h, w, i, o] = getHWIO(hwio_flat); int64_t shape_1d[] = {h * w * i * o}; void* hwio_vtcm = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope); @@ -229,15 +201,19 @@ SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* h {round_up(h, 8) / 8, round_up(w, 4) / 4, i / 32, o / 32}); } -int calculate_num_weight_chunks(int64_t* shape_hwio) { - int h = round_up(shape_hwio[0], 8); - int w = round_up(shape_hwio[1], 4); - int i = round_up(shape_hwio[2], 32); - int o = round_up(shape_hwio[3], 32); +int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width, + int chunk_in_channel, int chunk_out_channel) { + // Define slower roundup that doesn't assume multiplier 'p' to be power of 2 + auto roundup = [](int v, int p) { return (v + p - 1) - ((v + p - 1) % p); }; + int h = roundup(shape_hwio[0], chunk_height); + int w = roundup(shape_hwio[1], chunk_width); + int i = roundup(shape_hwio[2], chunk_in_channel); + int o = roundup(shape_hwio[3], chunk_out_channel); - return (h * w * i * o) / (8 * 4 * 32 * 32); + return (h * w * i * o) / (chunk_height * chunk_width * chunk_in_channel * chunk_out_channel); } +} // namespace conv_utils } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h new file mode 100644 index 000000000000..07e15966863e --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef TVM_TESTS_CPPRUNTIME_HEXAGON_HEXAGON_CONV_UTILS_H +#define TVM_TESTS_CPPRUNTIME_HEXAGON_HEXAGON_CONV_UTILS_H + +#include +#include + +#include + +#include "conv2d.h" + +using namespace tvm::runtime::hexagon::conv_utils; + +template +class HexagonUtilsTest : public ::testing::Test { + public: + void SetUp() override { + vtcm_scope = "global.vtcm"; + device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false); + float16.code = kDLFloat; + float16.bits = 16; + float16.lanes = 1; + + uint8.code = kDLUInt; + uint8.bits = 8; + uint8.lanes = 1; + + int8.code = kDLInt; + int8.bits = 8; + int8.lanes = 1; + } + + void setupTensor(std::tuple shape, DLDataType dtype) { + auto [s1, s2, s3, s4] = shape; + tensor_shape[0] = s1; + tensor_shape[1] = s2; + tensor_shape[2] = s3; + tensor_shape[3] = s4; + int64_t shape_1d[1] = {s1 * s2 * s3 * s4}; + + flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, dtype, vtcm_scope); + flat_mem_data = static_cast(flat_mem); + fill_vals(flat_mem_data, shape_1d[0]); + + flat_tensor.data = flat_mem; + flat_tensor.device = hexagon_device; + flat_tensor.ndim = 4; + flat_tensor.dtype = dtype; + flat_tensor.shape = tensor_shape; + flat_tensor.strides = nullptr; + flat_tensor.byte_offset = 0; + } + + void TearDownTensor() { + if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem); + } + + static void fill_vals(T* arr, int size) { + // Testing with uint16 instead of float16 as generating random float16 is not easy within c++ + auto max = std::numeric_limits::max(); + srand(std::time(0)); + for (int i = 0; i < size; ++i) { + arr[i] = static_cast(std::rand() % max); + } + } + + static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) { + int h = shape[1]; + int w = shape[2]; + int c = shape[3]; + return cc + c * (ww + w * (hh + h * (nn))); + } + + DLTensor flat_tensor; + void* flat_mem; + T* flat_mem_data; + tvm::runtime::DeviceAPI* device_api; + tvm::runtime::String vtcm_scope; + DLDataType float16; + DLDataType int8, uint8; + int64_t tensor_shape[4]; +}; + +#endif diff --git a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc index 3b922fa6c2a8..5f2ef490d020 100644 --- a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc @@ -26,73 +26,14 @@ #include #include -#include "tvm/runtime/hexagon/ops/conv2d.h" +#include "conv2d.h" +#include "hexagon_conv_utils_test.h" using namespace tvm::runtime::hexagon; -class HexagonUtilsTest : public ::testing::Test { - public: - void SetUp() override { - vtcm_scope = "global.vtcm"; - device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false); - float16.code = kDLFloat; - float16.bits = 16; - float16.lanes = 1; - } - - void setupTensor(std::tuple shape) { - auto [s1, s2, s3, s4] = shape; - tensor_shape[0] = s1; - tensor_shape[1] = s2; - tensor_shape[2] = s3; - tensor_shape[3] = s4; - int64_t shape_1d[1] = {s1 * s2 * s3 * s4}; - - flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope); - flat_mem_data = static_cast(flat_mem); - fill_vals(flat_mem_data, shape_1d[0]); - - flat_tensor.data = flat_mem; - flat_tensor.device = hexagon_device; - flat_tensor.ndim = 4; - flat_tensor.dtype = float16; - flat_tensor.shape = tensor_shape; - flat_tensor.strides = nullptr; - flat_tensor.byte_offset = 0; - } - - void TearDownTensor() { - if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem); - } - - static void 
fill_vals(uint16_t* arr, int size) { - // Testing with uint16 instead of float16 as generating random float16 is not easy within c++ - uint16_t max = UINT16_MAX; - srand(std::time(0)); - for (int i = 0; i < size; ++i) { - arr[i] = static_cast(std::rand() % max); - } - } - - static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) { - int h = shape[1]; - int w = shape[2]; - int c = shape[3]; - return cc + c * (ww + w * (hh + h * (nn))); - } - - DLTensor flat_tensor; - void* flat_mem; - uint16_t* flat_mem_data; - tvm::runtime::DeviceAPI* device_api; - tvm::runtime::String vtcm_scope; - DLDataType float16; - int64_t tensor_shape[4]; -}; - // Parameterized test fixture with 4 params representing n, h, w, c class HexagonUtilsActivationsBlockizeTest - : public HexagonUtilsTest, + : public HexagonUtilsTest, public ::testing::WithParamInterface, std::tuple>> {}; @@ -122,11 +63,12 @@ INSTANTIATE_TEST_SUITE_P( TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) { auto shape = std::make_tuple(1, 14, 7, 60); auto [n, h, w, c] = shape; - setupTensor(shape); + setupTensor(shape, float16); // // copy_data is set to false here as there's a separate test for blockize when copy_data // becomes true - auto blocked_tensor = prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); + auto blocked_tensor = + prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); EXPECT_EQ(blocked_tensor.shape[0], n); EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8); @@ -139,7 +81,7 @@ TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) { TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { auto shape_tuple = std::get<0>(GetParam()); - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); auto [n, h, w, c] = shape_tuple; int64_t shape[] = {n, h, w, c}; @@ -150,7 +92,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope); int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 4, c_rounded / 32}; - blockize_hwc_16b(blocked_mem, flat_mem, h, w, c); + blockize_hwc(blocked_mem, flat_mem, h, w, c); std::function flatten = HexagonUtilsActivationsBlockizeTest::flattened_idx; @@ -159,7 +101,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { auto* blocks = static_cast(blocked_mem); int blockIdx = flatten(nn, hh / 8, ww / 4, cc / 32, blocked_shape); uint16_t* block = reinterpret_cast(blocks[blockIdx]); - return block[xyc_to_sm_16b(hh % 8, ww % 4, cc % 32)]; + return block[yxc_to_sm_16b(hh % 8, ww % 4, cc % 32)]; }; auto [nn, hh, ww, cc] = std::get<1>(GetParam()); @@ -172,7 +114,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { auto shape_tuple = std::get<0>(GetParam()); - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); auto [n, h, w, c] = shape_tuple; int64_t shape[] = {n, h, w, c}; int64_t shape_1d[1] = {n * h * w * c}; @@ -183,11 +125,11 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 4 * 32), 8 * 4 * 32}; void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope); - blockize_hwc_16b(blocked_mem, flat_mem, h, w, c); + blockize_hwc(blocked_mem, flat_mem, h, w, c); void* deblocked_flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope); - deblockize_hwc_16b(deblocked_flat_mem, blocked_mem, h, 
w, c); + deblockize_hwc(deblocked_flat_mem, blocked_mem, h, w, c); auto* deblocked_flat_mem_data = static_cast(deblocked_flat_mem); auto [nn, hh, ww, cc] = std::get<1>(GetParam()); @@ -201,7 +143,7 @@ TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { } class HexagonUtilsWeightsChunkifyTest - : public HexagonUtilsTest, + : public HexagonUtilsTest, public ::testing::WithParamInterface, std::tuple>> {}; @@ -231,7 +173,9 @@ INSTANTIATE_TEST_SUITE_P( TEST_F(HexagonUtilsWeightsChunkifyTest, calculate_num_weight_chunks) { int64_t shape[] = {3, 3, 40, 40}; - int num_wgt_chunks = calculate_num_weight_chunks(shape); + int num_wgt_chunks = + calculate_num_weight_chunks(shape, /* chunk_height */ 8, /* chunk_width */ 4, + /* chunk_in_channel */ 32, /* chunk_out_channel */ 32); EXPECT_EQ(num_wgt_chunks, 4); } @@ -239,11 +183,11 @@ TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) { int64_t shape[] = {3, 3, 40, 40}; auto [h, w, i, o] = shape; auto shape_tuple = std::make_tuple(h, w, i, o); - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); // copy_data is set to false here as there's a separate test for blockize when copy_data becomes // true - auto num_wgt_chunks = calculate_num_weight_chunks(shape); + auto num_wgt_chunks = calculate_num_weight_chunks(shape, 8, 4, 32, 32); auto wgt_ptr_table = reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); @@ -260,10 +204,10 @@ TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) { TEST_P(HexagonUtilsWeightsChunkifyTest, chunkify_hwio_16b) { auto [shape_tuple, indices] = GetParam(); auto [h, w, i, o] = shape_tuple; - setupTensor(shape_tuple); + setupTensor(shape_tuple, float16); int64_t shape[] = {h, w, i, o}; - auto num_wgt_chunks = calculate_num_weight_chunks(shape); + auto num_wgt_chunks = calculate_num_weight_chunks(shape, 8, 4, 32, 32); auto wgt_ptr_table = reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); diff --git a/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc new file mode 100644 index 000000000000..449c69736050 --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_quant_utils_tests.cc @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "conv2d.h" +#include "hexagon_conv_utils_test.h" + +using namespace tvm::runtime::hexagon::conv_utils; + +// Parameterized test fixture with 4 params representing n, h, w, c +class HexagonUtilsQuantActivationsBlockizeTest + : public HexagonUtilsTest, + public ::testing::WithParamInterface, std::tuple>> {}; + +// TODO (quic-sanirudh): See if we can test with random generated indices +INSTANTIATE_TEST_SUITE_P( + BlockizeDeblockizeTestFixtures, HexagonUtilsQuantActivationsBlockizeTest, + ::testing::Combine(::testing::Values(std::make_tuple(1, 14, 7, 60)), + ::testing::Values(std::make_tuple(0, 0, 0, 0), // first element + std::make_tuple(0, 7, 3, 31), // last element + // Remaining are random element tests + std::make_tuple(0, 13, 6, 59), + std::make_tuple(0, 0, 0, 32), std::make_tuple(0, 0, 4, 32), + std::make_tuple(0, 2, 3, 4), std::make_tuple(0, 5, 6, 7), + std::make_tuple(0, 10, 4, 12))), + [](const ::testing::TestParamInfo& info) { + // Can use info.param here to generate the test suffix + auto indices = std::get<1>(info.param); + int h = std::get<1>(indices); + int w = std::get<2>(indices); + int c = std::get<3>(indices); + // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0 + std::string name = + "hwc" + std::to_string(h) + "x" + std::to_string(w) + "x" + std::to_string(c); + return name; + }); + +TEST_F(HexagonUtilsQuantActivationsBlockizeTest, prepare_nhwc) { + auto shape = std::make_tuple(1, 14, 7, 60); + auto [n, h, w, c] = shape; + setupTensor(shape, uint8); + + // // copy_data is set to false here as there's a separate test for blockize when copy_data + // becomes true + auto blocked_tensor = + prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); + + EXPECT_EQ(blocked_tensor.shape[0], n); + EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8); + EXPECT_EQ(blocked_tensor.shape[2], round_up(w, 8) / 8); + EXPECT_EQ(blocked_tensor.shape[3], round_up(c, 32) / 32); + + TearDownTensor(); + release(device_api, blocked_tensor); +} + +TEST_P(HexagonUtilsQuantActivationsBlockizeTest, blockize_hwc_8b) { + auto shape_tuple = std::get<0>(GetParam()); + setupTensor(shape_tuple, uint8); + auto [n, h, w, c] = shape_tuple; + int64_t shape[] = {n, h, w, c}; + + int h_rounded = round_up(h, 8); + int w_rounded = round_up(w, 8); + int c_rounded = round_up(c, 32); + int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 8 * 32), 8 * 8 * 32}; + + void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, uint8, vtcm_scope); + int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 8, c_rounded / 32}; + blockize_hwc(blocked_mem, flat_mem, h, w, c); + + std::function flatten = + HexagonUtilsQuantActivationsBlockizeTest::flattened_idx; + + auto getBlockedElem = [&blocked_shape, blocked_mem, flatten](int nn, int hh, int ww, int cc) { + auto* blocks = static_cast(blocked_mem); + int blockIdx = flatten(nn, hh / 8, ww / 8, cc / 32, blocked_shape); + uint8_t* block = reinterpret_cast(blocks[blockIdx]); + return block[yxc_to_sm_8b(hh % 8, ww % 8, cc % 32)]; + }; + + auto [nn, hh, ww, cc] = std::get<1>(GetParam()); + + EXPECT_EQ(flat_mem_data[flattened_idx(nn, hh, ww, cc, shape)], getBlockedElem(nn, hh, ww, cc)); + + TearDownTensor(); + device_api->FreeDataSpace(hexagon_device, blocked_mem); +} + +TEST_P(HexagonUtilsQuantActivationsBlockizeTest, deblockize_hwc_8b) { + auto shape_tuple = std::get<0>(GetParam()); + setupTensor(shape_tuple, uint8); + auto [n, h, w, c] 
= shape_tuple; + int64_t shape[] = {n, h, w, c}; + int64_t shape_1d[1] = {n * h * w * c}; + + int h_rounded = round_up(h, 8); + int w_rounded = round_up(w, 8); + int c_rounded = round_up(c, 32); + int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 8 * 32), 8 * 8 * 32}; + + void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, uint8, vtcm_scope); + blockize_hwc(blocked_mem, flat_mem, h, w, c); + + void* deblocked_flat_mem = + device_api->AllocDataSpace(hexagon_device, 1, shape_1d, uint8, vtcm_scope); + deblockize_hwc(deblocked_flat_mem, blocked_mem, h, w, c); + auto* deblocked_flat_mem_data = static_cast(deblocked_flat_mem); + + auto [nn, hh, ww, cc] = std::get<1>(GetParam()); + + auto idx = flattened_idx(nn, hh, ww, cc, shape); + EXPECT_EQ(flat_mem_data[idx], deblocked_flat_mem_data[idx]); + + TearDownTensor(); + device_api->FreeDataSpace(hexagon_device, blocked_mem); + device_api->FreeDataSpace(hexagon_device, deblocked_flat_mem); +} + +class HexagonUtilsQuantWeightsChunkifyTest + : public HexagonUtilsTest, + public ::testing::WithParamInterface, std::tuple>> {}; + +INSTANTIATE_TEST_SUITE_P( + ChunkifyDechunkifyTests, HexagonUtilsQuantWeightsChunkifyTest, + ::testing::Combine(::testing::Values(std::make_tuple(3, 3, 40, 40)), + ::testing::Values(std::make_tuple(0, 0, 0, 0), // first element + std::make_tuple(2, 2, 39, 39), // Last element + // Remaining are random element tests + std::make_tuple(1, 1, 28, 33), + std::make_tuple(1, 2, 8, 38), + std::make_tuple(1, 0, 12, 15), + std::make_tuple(2, 1, 9, 22), std::make_tuple(0, 2, 6, 7), + std::make_tuple(1, 2, 3, 4))), + [](const ::testing::TestParamInfo& info) { + // Can use info.param here to generate the test suffix + auto indices = std::get<1>(info.param); + int h = std::get<0>(indices); + int w = std::get<1>(indices); + int i = std::get<2>(indices); + int o = std::get<3>(indices); + // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0 + std::string name = "hwio" + std::to_string(h) + std::to_string(w) + "x" + std::to_string(i) + + "x" + std::to_string(o); + return name; + }); + +TEST_F(HexagonUtilsQuantWeightsChunkifyTest, calculate_num_weight_chunks) { + int64_t shape[] = {3, 3, 40, 40}; + int num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32); + EXPECT_EQ(num_wgt_chunks, 4); +} + +TEST_F(HexagonUtilsQuantWeightsChunkifyTest, prepare_hwio) { + int64_t shape[] = {3, 3, 40, 40}; + auto [h, w, i, o] = shape; + auto shape_tuple = std::make_tuple(h, w, i, o); + setupTensor(shape_tuple, int8); + + // copy_data is set to false here as there's a separate test for blockize when copy_data becomes + // true + auto num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32); + auto wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + auto chunked_tensor = prepare_hwio_8b(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); + + EXPECT_EQ(chunked_tensor.shape[0], 1); + EXPECT_EQ(chunked_tensor.shape[1], 1); + EXPECT_EQ(chunked_tensor.shape[2], round_up(i, 32) / 32); + EXPECT_EQ(chunked_tensor.shape[3], round_up(o, 32) / 32); + + release(device_api, chunked_tensor); + TearDownTensor(); +} + +TEST_P(HexagonUtilsQuantWeightsChunkifyTest, chunkify_hwio_8b) { + auto [shape_tuple, indices] = GetParam(); + auto [h, w, i, o] = shape_tuple; + setupTensor(shape_tuple, int8); + int64_t shape[] = {h, w, i, o}; + + auto num_wgt_chunks = calculate_num_weight_chunks(shape, shape[0], shape[1], 32, 32); + auto 
wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + auto chunked_tensor = prepare_hwio_8b(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); + + auto getChunkedElem = [width = w, chunked_tensor](int hh, int ww, int ii, int oo) { + auto data = static_cast(chunked_tensor.data); + auto chunk = data[ii / 32 * chunked_tensor.shape[3] + oo / 32]; + auto chunk_int8 = reinterpret_cast(chunk); + return chunk_int8[hwio_to_sm_8b(width, hh, ww, ii % 32, oo % 32)]; + }; + + auto [hh, ww, ii, oo] = indices; + + EXPECT_EQ(flat_mem_data[flattened_idx(hh, ww, ii, oo, shape)], getChunkedElem(hh, ww, ii, oo)); + release(device_api, chunked_tensor); +} diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index c03701f83ccc..5b13513c0fb3 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -320,7 +320,7 @@ def quantize_np(arr_np: numpy.ndarray, dtype: str): scale = (fmax - fmin) / (qmax - qmin) zero_point = numpy.rint((fmax * qmin - fmin * qmax) / (fmax - fmin)).astype("int32") - quant_np = (arr_np / scale + zero_point).astype(dtype) + quant_np = numpy.clip(((arr_np / scale).round() + zero_point), qmin, qmax).astype(dtype) return quant_np, scale, zero_point diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py new file mode 100644 index 000000000000..c26e6142ba5c --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_quant_intrin.py @@ -0,0 +1,261 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" Test quantized conv2d HVX intrinsic implementation""" + +import numpy as np + +import tvm +import tvm.contrib.hexagon +from tvm.topi.hexagon.utils import get_fixed_point_value +from tvm.topi.testing import conv2d_nhwc_python + +from ..infrastructure import get_hexagon_target, quantize_np + + +def build_conv2d(target): + """Build and return the conv2d IRModule that calls the intrinsic implementation""" + act_n, act_h, act_w, act_c = ( + tvm.te.var("an"), + tvm.te.var("ah"), + tvm.te.var("aw"), + tvm.te.var("ac"), + ) + filt_h, filt_w, filt_o = tvm.te.var("filt_h"), tvm.te.var("filt_w"), tvm.te.var("filt_o") + act_scale, act_zp = tvm.te.var("act_scale", dtype="float32"), tvm.te.var("act_zp") + wgt_scale, wgt_zp = tvm.te.var("wgt_scale", dtype="float32"), tvm.te.var("wgt_zp") + out_scale, out_zp = tvm.te.var("out_scale", dtype="float32"), tvm.te.var("out_zp") + fixed_final_scale, scale_factor = tvm.te.var("fixed_final_scale", dtype="int32"), tvm.te.var( + "scale_factor" + ) + stride_h, stride_w = tvm.te.var("stride_h"), tvm.te.var("stride_w") + + act_flat = tvm.te.placeholder( + shape=(act_n, act_h, act_w, act_c), dtype="uint8", name="act_flat" + ) + wgt_flat = tvm.te.placeholder( + shape=(filt_h, filt_w, act_c, filt_o), dtype="int8", name="wgt_flat" + ) + + out_flat = tvm.te.extern( + shape=(act_n, (act_h - filt_h) // stride_h + 1, (act_w - filt_w) // stride_w + 1, filt_o), + inputs=[act_flat, wgt_flat], + fcompute=lambda ins, outs: tvm.tir.call_cpacked( + "conv2d_packed_quant", # Function from TVM runtime + ins[0], + ins[1], + act_scale, + act_zp, + wgt_scale, + wgt_zp, + out_scale, + out_zp, + stride_h, + stride_w, + fixed_final_scale, + scale_factor, + outs[0], + tvm.runtime.const(0), # resource_handle (unused) + ), + dtype="uint8", + ) + + s = tvm.te.create_schedule(out_flat.op) + + func_name = "conv2d_quant_hvx" + module = tvm.build( + s, + [ + act_flat, + wgt_flat, + act_scale, + act_zp, + wgt_scale, + wgt_zp, + out_scale, + out_zp, + stride_h, + stride_w, + fixed_final_scale, + scale_factor, + out_flat, + ], + target=target, + name=func_name, + ) + + return module + + +def gen_config(params): + """Utility function to generate useful ids for shape_parameters""" + + dims = lambda vals: "x".join(map(str, vals)) + + config = {} + for param in params: + act_shape, wgt_shape, inp_stride = param + name = f"nhwc{dims(act_shape)}-hwio{dims(wgt_shape)}-stride{dims(inp_stride)}" + config[name] = param + + return config + + +class TestQuantConv2dIntrin: + """Test Quantized Conv2d Intrin class""" + + shape_parameters = [ + [ + (1, 5, 5, 33), + (3, 3, 33, 33), + (1, 1), + ], + [ + (1, 9, 8, 64), + (3, 3, 64, 64), + (1, 1), + ], + [ + (1, 11, 16, 64), + (3, 3, 64, 32), + (1, 1), + ], + [ + (1, 24, 8, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 4, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 5, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 6, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 7, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 8, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 9, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 10, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 11, 3), + (3, 3, 3, 3), + (1, 1), + ], + [ + (1, 4, 4, 5), + (3, 3, 5, 3), + (1, 1), + ], + ] + + config = gen_config(shape_parameters) + act_shape, wgt_shape, inp_stride = tvm.testing.parameters(*config.values(), ids=config.keys()) + inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"]) + + @tvm.testing.requires_hexagon + def test_conv2d_quant(self, act_shape, wgt_shape, inp_stride, 
+    @tvm.testing.requires_hexagon
+    def test_conv2d_quant(self, act_shape, wgt_shape, inp_stride, hexagon_session):
+        """Test quantized conv2d intrinsic implementation"""
+        assert act_shape[3] == wgt_shape[2]
+
+        # Currently, input offset does not affect the output shape
+        def get_out_shape(ash, wsh, inp_stride):
+            assert ash[3] == wsh[2]
+            osh = (
+                ash[0],
+                (ash[1] - wsh[0]) // inp_stride[0] + 1,
+                (ash[2] - wsh[1]) // inp_stride[1] + 1,
+                wsh[3],
+            )
+            assert all(x > 0 for x in osh)
+            return osh
+
+        act_f = np.random.uniform(-1.5, 1.0, size=act_shape).astype("float32")
+        wgt_f = np.random.uniform(-1.5, 1.0, size=wgt_shape).astype("float32")
+
+        # Quantize activations using quantize_np
+        act_q, act_scale, act_zp = quantize_np(act_f, dtype="uint8")
+        act_q = act_q.reshape(act_f.shape)
+
+        # Quantize weights using quantize_np
+        wgt_q, wgt_scale, wgt_zp = quantize_np(wgt_f, dtype="int8")
+        wgt_q = wgt_q.reshape(wgt_f.shape)
+
+        # Generate reference output
+        ref_out = conv2d_nhwc_python(act_f, wgt_f, stride=inp_stride, padding="VALID")
+
+        ref_out_q, out_scale, out_zp = quantize_np(ref_out, dtype="uint8")
+        ref_out_q = ref_out_q.reshape(ref_out.shape)
+
+        final_scale = act_scale * wgt_scale / out_scale
+        fixed_final_scale, scale_factor = get_fixed_point_value(final_scale)
+
+        module = build_conv2d(get_hexagon_target("v69"))
+        mod = hexagon_session.load_module(module)
+
+        output_shape = get_out_shape(act_shape, wgt_shape, inp_stride)
+
+        output = tvm.nd.array(
+            np.zeros(output_shape, dtype="uint8"),
+            device=hexagon_session.device,
+        )
+        mod(
+            tvm.nd.array(act_q, device=hexagon_session.device),
+            tvm.nd.array(wgt_q, device=hexagon_session.device),
+            act_scale,
+            act_zp,
+            wgt_scale,
+            wgt_zp,
+            out_scale,
+            out_zp,
+            inp_stride[0],  # stride_height
+            inp_stride[1],  # stride_width
+            fixed_final_scale,
+            scale_factor,
+            output,
+        )
+
+        out_q = output.numpy()
+
+        tvm.testing.assert_allclose(out_q, ref_out_q, rtol=0, atol=2)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
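The test folds the three quantization scales into final_scale = act_scale * wgt_scale / out_scale and passes it to the kernel as a fixed-point pair from get_fixed_point_value, so the device code can requantize with integer multiplies and shifts instead of floating-point math. The sketch below illustrates that idea only; to_fixed_point is a hypothetical helper, and the real get_fixed_point_value in tvm.topi.hexagon.utils uses its own normalization and integer width, which is not restated here.

FRAC_BITS = 15  # purely illustrative choice of precision

def to_fixed_point(scale):
    # Hypothetical helper: represent a positive float scale as (mantissa, shift)
    # with scale ~= mantissa / 2**shift.
    return int(round(scale * (1 << FRAC_BITS))), FRAC_BITS

final_scale = 0.1234                  # e.g. act_scale * wgt_scale / out_scale
mantissa, shift = to_fixed_point(final_scale)

acc = 1000                            # example int32 accumulator value
requant = (acc * mantissa) >> shift   # integer-only stand-in for acc * final_scale
assert abs(requant - round(acc * final_scale)) <= 1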