-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Description
I have a model that I tried to deploy using the AOT runtime. The model's final output has type int8, and based on that I allocated a placeholder for the output like this:
int8_t output_data0[] ={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, };
However, when I look at the C code generated for AOT runtime library, here's what has been generated:
// TVM-generated AOT entry point. It unpacks the packed-call ABI (an array of
// TVMValue plus a parallel array of type codes) into input/output DLTensor
// handles, then invokes each fused operator of the graph in execution order
// through the same packed-call convention.
// NOTE(review): this excerpt is truncated by the reporter (the "..." below and
// the missing final closing brace are elisions in the issue, not real code).
TVM_DLL int32_t tvm__run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void* out_ret_tcode, void* resource_handle) {
// Scratch arrays reused as the argument/type-code stacks for every
// packed call made below.
TVMValue stack[5];
void* stack_tcode = stack;
TVMValue stack1[9];
void* stack_value = stack1;
// Unpack caller-supplied arguments: arg0 = model input, arg1 = model output.
void* arg0 = (((TVMValue*)args)[0].v_handle);
int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
void* arg1 = (((TVMValue*)args)[1].v_handle);
int32_t arg1_code = ((int32_t*)arg_type_ids)[(1)];
void* input_0 = arg0;
// output_0 is the user-provided output buffer (an int8_t array per the issue
// report above). Note below that it is also handed to intermediate kernels,
// which is the subject of this bug report.
void* output_0 = arg1;
...
// Wrap the intermediate buffer sid_59 in a stack-allocated DLTensor-sized
// scratch area (only the .data field is populated before each call).
TVMValue stack245[6];
void* sid_59_value1 = stack245;
(((DLTensor*)sid_59_value1)[0].data) = sid_59;
// Look up linked (compiled-in) parameter #60 and bind it to a DLTensor.
TVMValue stack246[1];
void* param_60_array = stack246;
TVMValue stack247[1];
void* ret_value110 = stack247;
TVMValue stack248[1];
void* ret_value111 = stack248;
TVMValue stack249[1];
void* param_60_value = stack249;
(((TVMValue*)param_60_value)[0].v_int64) = 60;
(void)_lookup_linked_param(param_60_value, 0, 0, ret_value111, ret_value110, 0);
(((DLTensor*)param_60_array)[0].data) = (((TVMValue*)ret_value111)[0].v_handle);
// Same lookup for linked parameter #61.
TVMValue stack250[1];
void* param_61_array = stack250;
TVMValue stack251[1];
void* ret_value112 = stack251;
TVMValue stack252[1];
void* ret_value113 = stack252;
TVMValue stack253[1];
void* param_61_value = stack253;
(((TVMValue*)param_61_value)[0].v_int64) = 61;
(void)_lookup_linked_param(param_61_value, 0, 0, ret_value113, ret_value112, 0);
(((DLTensor*)param_61_array)[0].data) = (((TVMValue*)ret_value113)[0].v_handle);
// Call the fused dense kernel. Its 4th argument (index 3) is output_0 — the
// caller's int8 buffer — even though the kernel (see its definition below)
// writes float values through that tensor. This is the reported defect.
// The type code 3 is presumably TVM's array/opaque-handle code — confirm
// against TVMArgTypeCode in tvm/runtime/c_runtime_api.h.
(((TVMValue*)stack_value)[0].v_handle) = sid_59_value1;
((int32_t*)stack_tcode)[(0)] = 3;
(((TVMValue*)stack_value)[1].v_handle) = param_60_array;
((int32_t*)stack_tcode)[(1)] = 3;
(((TVMValue*)stack_value)[2].v_handle) = param_61_array;
((int32_t*)stack_tcode)[(2)] = 3;
(((TVMValue*)stack_value)[3].v_handle) = output_0;
((int32_t*)stack_tcode)[(3)] = 3;
TVMValue ret_val12;
int ret_type_code12;
if (fused_nn_contrib_dense_pack_add_fixed_point_multiply_add_clip_cast_cast_subtract_14669711146056581479_( (TVMValue*) stack_value , (int*) stack_tcode, 4, &ret_val12, &ret_type_code12, NULL) != 0){
return -1;
}
// Softmax stage: reads from output_0 (the previous stage's result) and
// writes into the sid_59 intermediate buffer.
TVMValue stack254[6];
void* sid_59_value2 = stack254;
(((DLTensor*)sid_59_value2)[0].data) = sid_59;
(((TVMValue*)stack_value)[0].v_handle) = output_0;
((int32_t*)stack_tcode)[(0)] = 3;
(((TVMValue*)stack_value)[1].v_handle) = sid_59_value2;
((int32_t*)stack_tcode)[(1)] = 3;
TVMValue ret_val13;
int ret_type_code13;
if (fused_nn_softmax( (TVMValue*) stack_value , (int*) stack_tcode, 2, &ret_val13, &ret_type_code13, NULL) != 0){
return -1;
}
// Final requantize stage: reads sid_59 and writes the real int8 result
// into output_0.
TVMValue stack255[6];
void* sid_59_value3 = stack255;
(((DLTensor*)sid_59_value3)[0].data) = sid_59;
(((TVMValue*)stack_value)[0].v_handle) = sid_59_value3;
((int32_t*)stack_tcode)[(0)] = 3;
(((TVMValue*)stack_value)[1].v_handle) = output_0;
((int32_t*)stack_tcode)[(1)] = 3;
TVMValue ret_val14;
int ret_type_code14;
if (fused_divide_add_round_cast_clip_cast( (TVMValue*) stack_value , (int*) stack_tcode, 2, &ret_val14, &ret_type_code14, NULL) != 0){
return -1;
}
// (Excerpt ends here; the function's closing brace is omitted in the report.)
output_0 is the placeholder for the final output (output_data0) that we passed to the function tvm__run_func, and it has int8 type. However, output_0 is also used by other intermediate functions and interpreted as other types, such as float. For example, the fused_nn_contrib_dense_pack_add_fixed_point_multiply_add_clip_cast_cast_subtract_14669711146056581479_ function is defined here:
// TVM-generated fused kernel: packed dense (matmul) + bias add + fixed-point
// requantize + clip + dequantize. Shapes implied by the loops below:
// a 64-element int16 input vector, a 64x12 int16 packed weight matrix, a
// 12-element int32 bias, and a 12-element float output.
// BUG (as reported): the caller binds arg3 to output_0, the user's 12-byte
// int8_t buffer, but this kernel stores 12 floats (48 bytes) through it,
// overrunning the buffer and clobbering adjacent memory.
TVM_DLL int32_t fused_nn_contrib_dense_pack_add_fixed_point_multiply_add_clip_cast_cast_subtract_14669711146056581479_(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void* out_ret_tcode, void* resource_handle) {
// Unpack the packed-call arguments into DLTensor handles.
void* arg0 = (((TVMValue*)args)[0].v_handle);
int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
void* arg1 = (((TVMValue*)args)[1].v_handle);
int32_t arg1_code = ((int32_t*)arg_type_ids)[(1)];
void* arg2 = (((TVMValue*)args)[2].v_handle);
int32_t arg2_code = ((int32_t*)arg_type_ids)[(2)];
void* arg3 = (((TVMValue*)args)[3].v_handle);
int32_t arg3_code = ((int32_t*)arg_type_ids)[(3)];
// arg0 = input activations (read as int16 below).
void* placeholder = (((DLTensor*)arg0)[0].data);
void* arg0_shape = (((DLTensor*)arg0)[0].shape);
void* arg0_strides = (((DLTensor*)arg0)[0].strides);
int32_t dev_id = (((DLTensor*)arg0)[0].device.device_id);
// arg1 = packed weights (read as int16 below).
void* placeholder1 = (((DLTensor*)arg1)[0].data);
void* arg1_shape = (((DLTensor*)arg1)[0].shape);
void* arg1_strides = (((DLTensor*)arg1)[0].strides);
// arg2 = bias (read as int32 below).
void* placeholder2 = (((DLTensor*)arg2)[0].data);
void* arg2_shape = (((DLTensor*)arg2)[0].shape);
void* arg2_strides = (((DLTensor*)arg2)[0].strides);
// arg3 = output tensor; written as float* below even though the caller
// supplies the int8 model-output buffer here — the reported overflow.
void* T_multiply = (((DLTensor*)arg3)[0].data);
void* arg3_shape = (((DLTensor*)arg3)[0].shape);
void* arg3_strides = (((DLTensor*)arg3)[0].strides);
// Compact-tensor checks emitted with their assert bodies compiled out.
if (!(arg0_strides == NULL)) {
}
if (!(arg1_strides == NULL)) {
}
if (!(arg2_strides == NULL)) {
}
if (!(arg3_strides == NULL)) {
}
// 48-byte workspace = 12 int32 accumulators for the matmul.
void* compute_global = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)48, 0, 32);
if (compute_global == NULL) {
return -1;
}
for (int32_t x_c_init = 0; x_c_init < 12; ++x_c_init) {
((int32_t*)compute_global)[(x_c_init)] = 0;
}
// int16 x int16 -> int32 dot product over the 64-element reduction axis.
for (int32_t k_outer = 0; k_outer < 64; ++k_outer) {
for (int32_t x_c = 0; x_c < 12; ++x_c) {
((int32_t*)compute_global)[(x_c)] = (((int32_t*)compute_global)[(x_c)] + (((int32_t)((int16_t*)placeholder)[(k_outer)]) * ((int32_t)((int16_t*)placeholder1)[(((k_outer * 12) + x_c))])));
}
}
for (int32_t ax1_inner_inner = 0; ax1_inner_inner < 12; ++ax1_inner_inner) {
// Requantize: add bias, fixed-point multiply by 1278221461 with a
// rounding right-shift of (7 + 31) bits, then add the zero point 14.
int32_t _1 = ((int32_t)(((((0 != 0) ? (((int64_t)(((int32_t*)compute_global)[(ax1_inner_inner)] + ((int32_t*)placeholder2)[(ax1_inner_inner)])) << ((int64_t)0)) : ((int64_t)(((int32_t*)compute_global)[(ax1_inner_inner)] + ((int32_t*)placeholder2)[(ax1_inner_inner)]))) * (int64_t)1278221461) + ((int64_t)1 << ((int64_t)((7 + 31) - 1)))) >> ((int64_t)(7 + 31)))) + 14;
// Clip to the int8 range [-128, 127]...
int32_t _2 = (_1) < (127) ? (_1) : (127);
// ...then immediately dequantize (subtract zero point 14, scale by
// ~0.1447) and store as FLOAT through T_multiply. This 4-byte-per-element
// store into the caller's 1-byte-per-element output buffer is the bug.
((float*)T_multiply)[(ax1_inner_inner)] = (((float)(((int32_t)((int8_t)((_2) > (-128) ? (_2) : (-128)))) - 14)) * 1.446925e-01f);
}
if (TVMBackendFreeWorkspace(1, dev_id, compute_global) != 0) {
return -1;
}
return 0;
}
Here, T_multiply is output_0, which is interpreted as a float type; its 4-byte-per-element stores overrun the 1-byte-per-element int8 buffer and cause the memory of other variables to be overwritten.
One quick workaround is to allocate the final output buffer using the largest element size used anywhere in the graph (float32/float64) to avoid this problem, but we need a better way to fix it.