diff --git a/apps/bundle_deploy/bundle.c b/apps/bundle_deploy/bundle.c index 6018d40dd300..e411c929f936 100644 --- a/apps/bundle_deploy/bundle.c +++ b/apps/bundle_deploy/bundle.c @@ -32,8 +32,10 @@ #define CRT_MEMORY_NUM_PAGES 16384 #define CRT_MEMORY_PAGE_SIZE_LOG2 10 +#define CRT_MEMORY_PAGE_SIZE_BYTES (1 << CRT_MEMORY_PAGE_SIZE_LOG2) -static uint8_t g_crt_memory[CRT_MEMORY_NUM_PAGES * (1 << CRT_MEMORY_PAGE_SIZE_LOG2)]; +static uint8_t g_crt_memory[CRT_MEMORY_NUM_PAGES * CRT_MEMORY_PAGE_SIZE_BYTES] + __attribute__((aligned(CRT_MEMORY_PAGE_SIZE_BYTES))); static MemoryManagerInterface* g_memory_manager; /*! \brief macro to do C API call */ diff --git a/apps/bundle_deploy/bundle_static.c b/apps/bundle_deploy/bundle_static.c index 18a7b2bbb0ff..846e21b36883 100644 --- a/apps/bundle_deploy/bundle_static.c +++ b/apps/bundle_deploy/bundle_static.c @@ -33,8 +33,10 @@ #define CRT_MEMORY_NUM_PAGES 16384 #define CRT_MEMORY_PAGE_SIZE_LOG2 10 +#define CRT_MEMORY_PAGE_SIZE_BYTES (1 << CRT_MEMORY_PAGE_SIZE_LOG2) -static uint8_t g_crt_memory[CRT_MEMORY_NUM_PAGES * (1 << CRT_MEMORY_PAGE_SIZE_LOG2)]; +static uint8_t g_crt_memory[CRT_MEMORY_NUM_PAGES * CRT_MEMORY_PAGE_SIZE_BYTES] + __attribute__((aligned(CRT_MEMORY_PAGE_SIZE_BYTES))); static MemoryManagerInterface* g_memory_manager; /*! \brief macro to do C API call */ diff --git a/apps/bundle_deploy/test.cc b/apps/bundle_deploy/test.cc index 25056f4d17a6..7194427426d2 100644 --- a/apps/bundle_deploy/test.cc +++ b/apps/bundle_deploy/test.cc @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -52,7 +53,7 @@ char* read_all_or_die(const char* name, const char* file_path, size_t* out_size) *out_size = st.st_size; } - char* data = (char*)malloc(st.st_size); + char* data = (char*)std::aligned_alloc(64, st.st_size); FILE* fp = fopen(file_path, "rb"); size_t bytes_to_read = st.st_size; size_t bytes_read = 0; @@ -129,7 +130,7 @@ int main(int argc, char** argv) { ftvm_runtime_run(handle); gettimeofday(&t3, 0); - float output_storage[10 * 5]; + float* output_storage = static_cast(std::aligned_alloc(64, 10 * 5 * sizeof(float))); std::vector output_shape = {10, 5}; DLTensor output; output.data = output_storage; @@ -162,6 +163,9 @@ int main(int argc, char** argv) { (t4.tv_sec - t3.tv_sec) * 1000.0f + (t4.tv_usec - t3.tv_usec) / 1000.f, (t5.tv_sec - t4.tv_sec) * 1000.0f + (t5.tv_usec - t4.tv_usec) / 1000.f); + free(output_storage); + free(result_storage); + free(input_storage); free(json_data); free(params_data); dlclose(bundle); diff --git a/apps/bundle_deploy/test_static.c b/apps/bundle_deploy/test_static.c index b9c980843ea1..4258b3a4e0f9 100644 --- a/apps/bundle_deploy/test_static.c +++ b/apps/bundle_deploy/test_static.c @@ -54,12 +54,12 @@ int main(int argc, char** argv) { void* handle = tvm_runtime_create(json_data, params_data, params_size, argv[0]); gettimeofday(&t1, 0); - float input_storage[10 * 5]; + float* input_storage = aligned_alloc(64, 10 * 5 * sizeof(float)); fp = fopen(argv[1], "rb"); fread(input_storage, 10 * 5, 4, fp); fclose(fp); - float result_storage[10 * 5]; + float* result_storage = aligned_alloc(64, 10 * 5 * sizeof(float)); fp = fopen(argv[2], "rb"); fread(result_storage, 10 * 5, 4, fp); fclose(fp); @@ -82,7 +82,7 @@ int main(int argc, char** argv) { tvm_runtime_run(handle); gettimeofday(&t3, 0); - float output_storage[10 * 5]; + float* output_storage = aligned_alloc(64, 10 * 5 * sizeof(float)); DLTensor output; output.data = output_storage; DLDevice out_dev = {kDLCPU, 0}; @@ -117,6 +117,9 @@ int main(int argc, char** 
argv) { (t4.tv_sec - t3.tv_sec) * 1000 + (t4.tv_usec - t3.tv_usec) / 1000.f, (t5.tv_sec - t4.tv_sec) * 1000 + (t5.tv_usec - t4.tv_usec) / 1000.f); + free(output_storage); + free(result_storage); + free(input_storage); free(json_data); free(params_data); diff --git a/cmake/utils/CRTConfig.cmake b/cmake/utils/CRTConfig.cmake index 42c523b08786..7f6b058bfd0a 100644 --- a/cmake/utils/CRTConfig.cmake +++ b/cmake/utils/CRTConfig.cmake @@ -25,6 +25,7 @@ function(generate_crt_config platform output_path) set(TVM_CRT_MAX_STRLEN_DLTYPE 10) set(TVM_CRT_MAX_STRLEN_FUNCTION_NAME 120) set(TVM_CRT_MAX_STRLEN_PARAM_NAME 80) + set(TVM_CRT_ALLOC_ALIGNMENT 64) if("${platform}" STREQUAL "zephyr") set(TVM_CRT_MAX_PACKET_SIZE_BYTES 512) diff --git a/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py b/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py index d6e1922efa85..52b1dec2c9fe 100755 --- a/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py +++ b/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py @@ -23,19 +23,25 @@ import tvm from tvm.relay.backend import Runtime -from tvm import te +from tvm.script import tir as T def main(): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = tvm.te.create_schedule(C.op) - s[C].parallel(s[C].op.axis[0]) + @T.prim_func + def func(var_A: T.handle, var_B: T.handle, var_C: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)}) + n = T.int32() + A = T.match_buffer(var_A, (n,), align=1) + B = T.match_buffer(var_B, (n,), align=1) + C = T.match_buffer(var_C, (n,), align=1) + for i in T.parallel(n): + with T.block("C"): + vi = T.axis.spatial(n, i) + C[vi] = A[vi] + B[vi] + runtime = Runtime("cpp", {"system-lib": True}) - print(tvm.lower(s, [A, B, C], simple_mode=True)) - tvm.build(s, [A, B, C], "llvm", runtime=runtime).save(osp.join(sys.argv[1], "test.o")) + print(tvm.lower(func, simple_mode=True)) + tvm.build(func, target="llvm", runtime=runtime).save(osp.join(sys.argv[1], "test.o")) if __name__ == "__main__": diff --git a/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py b/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py index 4b270fa17cbc..415832a614f0 100755 --- a/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py +++ b/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py @@ -22,20 +22,26 @@ import sys import tvm -from tvm import te from tvm.contrib import cc +from tvm.script import tir as T def main(): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = tvm.te.create_schedule(C.op) - s[C].parallel(s[C].op.axis[0]) - print(tvm.lower(s, [A, B, C], simple_mode=True)) + @T.prim_func + def func(var_A: T.handle, var_B: T.handle, var_C: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)}) + n = T.int32() + A = T.match_buffer(var_A, (n,), align=1) + B = T.match_buffer(var_B, (n,), align=1) + C = T.match_buffer(var_C, (n,), align=1) + for i in T.parallel(n): + with T.block("C"): + vi = T.axis.spatial(n, i) + C[vi] = A[vi] + B[vi] + + print(tvm.lower(func, simple_mode=True)) obj_file = osp.join(sys.argv[1], "test.o") - tvm.build(s, [A, B, C], "llvm").save(obj_file) + tvm.build(func, "llvm").save(obj_file) cc.create_shared(osp.join(sys.argv[1], "test.so"), [obj_file]) diff --git a/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py 
b/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py index 2bf327a31b1b..6d744b28c67e 100755 --- a/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py +++ b/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py @@ -22,20 +22,26 @@ import sys import tvm -from tvm import te from tvm.relay.backend import Runtime +from tvm.script import tir as T def main(): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = tvm.te.create_schedule(C.op) - s[C].parallel(s[C].op.axis[0]) - print(tvm.lower(s, [A, B, C], simple_mode=True)) + @T.prim_func + def func(var_A: T.handle, var_B: T.handle, var_C: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)}) + n = T.int32() + A = T.match_buffer(var_A, (n,), align=1) + B = T.match_buffer(var_B, (n,), align=1) + C = T.match_buffer(var_C, (n,), align=1) + for i in T.parallel(n): + with T.block("C"): + vi = T.axis.spatial(n, i) + C[vi] = A[vi] + B[vi] + + print(tvm.lower(func, simple_mode=True)) runtime = Runtime("cpp", {"system-lib": True}) - tvm.build(s, [A, B, C], "llvm -mtriple=wasm32-unknown-unknown", runtime=runtime).save( + tvm.build(func, target="llvm -mtriple=wasm32-unknown-unknown", runtime=runtime).save( osp.join(sys.argv[1], "test.o") ) diff --git a/rust/tvm/tests/basics/src/tvm_add.py b/rust/tvm/tests/basics/src/tvm_add.py index fc5c4213bd08..807e27db7794 100755 --- a/rust/tvm/tests/basics/src/tvm_add.py +++ b/rust/tvm/tests/basics/src/tvm_add.py @@ -20,23 +20,34 @@ import sys import tvm -from tvm import te from tvm.contrib import cc +from tvm.script import tir as T def main(target, out_dir): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = te.create_schedule(C.op) + @T.prim_func + def func(var_A: T.handle, var_B: T.handle, var_C: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)}) + n = T.int32() + A = T.match_buffer(var_A, (n,), align=1) + B = T.match_buffer(var_B, (n,), align=1) + C = T.match_buffer(var_C, (n,), align=1) + # with T.block("root"): + for i in range(n): + with T.block("C"): + v_i = T.axis.spatial(n, i) + T.reads(A[v_i], B[v_i]) + T.writes(C[v_i]) + C[v_i] = A[v_i] + B[v_i] if target == "cuda": - bx, tx = s[C].split(C.op.axis[0], factor=64) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) + sch = tvm.tir.Schedule(func) + i, j = sch.split(sch.get_loops("C")[0], [None, 64]) + sch.bind(i, "blockIdx.x") + sch.bind(j, "threadIdx.x") + func = sch.mod["main"] - fadd = tvm.build(s, [A, B, C], tvm.target.Target(target, host="llvm"), name="myadd") + fadd = tvm.build(func, target=tvm.target.Target(target, host="llvm"), name="myadd") fadd.save(osp.join(out_dir, "test_add.o")) if target == "cuda": fadd.imported_modules[0].save(osp.join(out_dir, "test_add.ptx")) diff --git a/src/runtime/crt/aot_executor_module/aot_executor_module.c b/src/runtime/crt/aot_executor_module/aot_executor_module.c index a5c8105144f7..155f45bff5ce 100644 --- a/src/runtime/crt/aot_executor_module/aot_executor_module.c +++ b/src/runtime/crt/aot_executor_module/aot_executor_module.c @@ -154,7 +154,7 @@ int32_t TVMAotExecutorModule_GetInputName(TVMValue* args, int* tcodes, int nargs return kTvmErrorFunctionCallNumArguments; } - char* name; + const char* name; int ret = TVMAotExecutor_GetInputName(aot_executor.executor, args[0].v_int64, &name); if (ret < 0) {
return kTvmErrorExecutorModuleNoSuchInput; diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index 99b3201b95b0..4e0e0d4fc240 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -90,10 +91,26 @@ int TVMArrayFree(TVMArrayHandle handle) { int TVMDeviceAllocDataSpace(DLDevice dev, size_t nbytes, size_t alignment, DLDataType type_hint, void** out_data) { - if (alignment != 1) { - nbytes = (nbytes + alignment - 1) / alignment * alignment; - } - return TVMPlatformMemoryAllocate(nbytes, dev, out_data); + // The TVMPlatformMemoryAllocate function does not guarantee the + // alignment of the allocation. Therefore, deliberately + // overallocate by (alignment-1) and return an aligned region from + // it. + size_t total_bytes = nbytes + sizeof(void*) + (alignment - 1); + void* allocated_buf; + int err = TVMPlatformMemoryAllocate(total_bytes, dev, &allocated_buf); + if (err) return err; + + void* first_allowed_data_ptr = ((uint8_t*)allocated_buf) + sizeof(void*); + uintptr_t offset = (alignment - ((uintptr_t)first_allowed_data_ptr) % alignment); + void* data_ptr = first_allowed_data_ptr + offset; + + // Must keep a pointer to the original allocation, so that it can be + // passed to TVMPlatformMemoryFree. + ((void**)data_ptr)[-1] = allocated_buf; + + *out_data = data_ptr; + + return err; } int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shape, DLDataType dtype, @@ -110,7 +127,10 @@ int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shap return TVMDeviceAllocDataSpace(dev, nbytes, align, dtype, out_data); } -int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr) { return TVMPlatformMemoryFree(ptr, dev); } +int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr) { + void* allocated_buf = ((void**)ptr)[-1]; + return TVMPlatformMemoryFree(allocated_buf, dev); +} TVM_ATTRIBUTE_UNUSED static bool IsContiguous(const DLTensor* arr) { if (arr->strides == NULL) return true; diff --git a/src/runtime/crt/common/ndarray.c b/src/runtime/crt/common/ndarray.c index b0e869766bde..1c1e3c58f47a 100644 --- a/src/runtime/crt/common/ndarray.c +++ b/src/runtime/crt/common/ndarray.c @@ -63,8 +63,8 @@ int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, D return status; } int total_elem_bytes = TVMNDArray_DataSizeBytes(array); - array->dl_tensor.data = - TVMBackendAllocWorkspace(kDLCPU, 0, total_elem_bytes, dtype.code, dtype.bits); + TVMDeviceAllocDataSpace(dev, total_elem_bytes, TVM_CRT_ALLOC_ALIGNMENT, dtype, + &array->dl_tensor.data); memset(array->dl_tensor.data, 0, total_elem_bytes); return 0; } @@ -167,7 +167,7 @@ int TVMNDArray_Release(TVMNDArray* arr) { return 0; } - err = TVMPlatformMemoryFree(arr->dl_tensor.data, dev); + err = TVMDeviceFreeDataSpace(dev, arr->dl_tensor.data); if (err != kTvmErrorNoError) { return err; } diff --git a/src/runtime/crt/crt_config.h.template b/src/runtime/crt/crt_config.h.template index 1d32253282e8..70ac85b91fec 100644 --- a/src/runtime/crt/crt_config.h.template +++ b/src/runtime/crt/crt_config.h.template @@ -54,6 +54,9 @@ /*! Maximum supported string length in parameter names */ #define TVM_CRT_MAX_STRLEN_PARAM_NAME ${TVM_CRT_MAX_STRLEN_PARAM_NAME} +/*! Alignment (in bytes) for data buffer allocation */ +#define TVM_CRT_ALLOC_ALIGNMENT ${TVM_CRT_ALLOC_ALIGNMENT} + /*! Enable checks to enforce the stack allocator with a FIFO ordering. 
Off by default */ // #define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK diff --git a/src/runtime/crt/graph_executor/graph_executor.c b/src/runtime/crt/graph_executor/graph_executor.c index 395a343ccb41..46704a332a31 100644 --- a/src/runtime/crt/graph_executor/graph_executor.c +++ b/src/runtime/crt/graph_executor/graph_executor.c @@ -841,15 +841,10 @@ int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_bl status = -1; } - if (executor->data_entry[eid].dl_tensor.shape) { - err = TVMPlatformMemoryFree(executor->data_entry[eid].dl_tensor.shape, dev); - if (err != kTvmErrorNoError) { - status = -1; - } - executor->data_entry[eid].dl_tensor.shape = 0; - } + // The memory in the executor->data_entry[eid].dl_tensor.shape is + // owned by attrs->shape, and should not be freed here. if (executor->data_entry[eid].dl_tensor.data) { - err = TVMPlatformMemoryFree(executor->data_entry[eid].dl_tensor.data, dev); + err = TVMDeviceFreeDataSpace(dev, executor->data_entry[eid].dl_tensor.data); if (err != kTvmErrorNoError) { status = -1; } diff --git a/src/runtime/crt/host/CMakeLists.txt.template b/src/runtime/crt/host/CMakeLists.txt.template index be0bce85513b..8059ea316524 100644 --- a/src/runtime/crt/host/CMakeLists.txt.template +++ b/src/runtime/crt/host/CMakeLists.txt.template @@ -34,6 +34,9 @@ set(CRT_LIBS microtvm_rpc_server memory ) + +add_library(tvm_model) + # Build CRT libraries foreach(crt_lib_name ${CRT_LIBS}) add_library(${crt_lib_name}) @@ -41,11 +44,16 @@ foreach(crt_lib_name ${CRT_LIBS}) target_sources(${crt_lib_name} PRIVATE ${crt_lib_srcs}) target_include_directories(${crt_lib_name} PRIVATE crt_config crt/include) target_compile_definitions(${crt_lib_name} PRIVATE -DTVM_HOST_USE_GRAPH_EXECUTOR_MODULE) - target_link_libraries(main PRIVATE ${crt_lib_name}) + # Circular dependencies result in the static libraries being listed + # twice in the link command, resolving circular dependencies between + # the libraries and the model. 
+ # + # See https://cmake.org/cmake/help/latest/command/target_link_libraries.html#cyclic-dependencies-of-static-libraries + target_link_libraries(tvm_model PRIVATE ${crt_lib_name}) + target_link_libraries(${crt_lib_name} PRIVATE tvm_model) endforeach(crt_lib_name ${CRT_LIBS}) # Build model files -add_library(tvm_model) file(GLOB_RECURSE tvm_model_srcs model/codegen/host/src/*.c model/codegen/host/lib/*.o) target_sources(tvm_model PRIVATE ${tvm_model_srcs}) target_include_directories(tvm_model PRIVATE ${CMAKE_SOURCE_DIR}/include crt_config crt/include) diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index c2efa79c0c83..e055187bb467 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -182,30 +182,7 @@ struct NDArray::Internal { NDArray NDArray::CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset) { ICHECK(data_ != nullptr); - const DLTensor& orig = get_mutable()->dl_tensor; - CHECK(IsContiguous()) << [&orig]() { - std::stringstream ss; - ss << "Can only create view for compact tensor, but found strides "; - - ss << "["; - for (int i = 0; i < orig.ndim; i++) { - if (i) ss << ", "; - ss << orig.strides[i]; - } - ss << "]"; - - ss << ", for shape "; - ss << "["; - for (int i = 0; i < orig.ndim; i++) { - if (i) ss << ", "; - ss << orig.shape[i]; - } - ss << "]"; - return ss.str(); - }(); - const auto& curr_dl_tensor = get_mutable()->dl_tensor; - NDArray ret = Internal::Create(shape, dtype, curr_dl_tensor.device); size_t curr_size = GetDataSize(this->get_mutable()->dl_tensor); @@ -273,9 +250,6 @@ NDArray NDArray::FromDLPack(DLManagedTensor* tensor) { data->SetDeleter(Internal::DLPackDeleter); // fill up content. data->manager_ctx = tensor; - ICHECK(::tvm::runtime::IsContiguous(tensor->dl_tensor)) << "DLManagedTensor must be contiguous."; - ICHECK(IsAligned(tensor->dl_tensor)) - << "Data in DLManagedTensor is not aligned as required by NDArray"; data->dl_tensor = tensor->dl_tensor; // update shape_ std::vector shape; diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 6fc083d17ccf..e378de6a9228 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -1439,7 +1439,12 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { return ret_dummy; } else if (op->op.same_as(builtin::reinterpret())) { llvm::Type* target = DTypeToLLVMType(op->dtype); - return builder_->CreateBitCast(MakeValue(op->args[0]), target); + llvm::Value* value = MakeValue(op->args[0]); + if (op->args[0].dtype().is_handle() && (op->dtype.is_int() || op->dtype.is_uint())) { + return builder_->CreatePtrToInt(value, target); + } else { + return builder_->CreateBitCast(value, target); + } } else if (op->op.same_as(builtin::isnan())) { // TODO(hgt312): set fast math flag llvm::Value* a = MakeValue(op->args[0]); diff --git a/src/tir/transforms/arg_binder.cc b/src/tir/transforms/arg_binder.cc index 5b9e005b7ea3..a4e39984ffd5 100644 --- a/src/tir/transforms/arg_binder.cc +++ b/src/tir/transforms/arg_binder.cc @@ -302,25 +302,47 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, const PrimExpr& device_type, if (Bind_(buffer->data, TVMArrayGet(DataType::Handle(), handle, builtin::kArrData), arg_name + ".data", true)) { Var vptr(buffer->data); + def_handle_dtype_.Set(vptr, tir::TypeAnnotation(buffer->dtype)); + + // Mark alignment of external bufs. 
+ AssertStmt satisfies_tvm_alignment = [&]() { + PrimExpr interpretable_as_pointer = + (device_type == kDLCPU) || (device_type == kDLCUDA) || (device_type == kDLCUDAHost) || + (device_type == kDLCUDAManaged) || (device_type == kDLHexagon); + PrimExpr is_aligned = + tvm::floormod(tvm::reinterpret(DataType::UInt(vptr->dtype.bits()), vptr), + buffer->data_alignment) == 0; + + std::ostringstream alignment_err_msg; + alignment_err_msg << arg_name << ".data is expected to be aligned to " + << buffer->data_alignment << " bytes"; + + return AssertStmt(!interpretable_as_pointer || is_aligned, StringImm(alignment_err_msg.str()), + nop); + }(); + asserts_.push_back(satisfies_tvm_alignment); // Check if the data pointer is NULL. This check is skipped for // size-0 arrays, since CUDA provides a NULL pointer for size-zero // allocations. - auto alloc_size = [&]() -> PrimExpr { - PrimExpr product = IntImm(buffer->DefaultIndexType(), 1); - for (const auto& dim : buffer->shape) { - product *= dim; - } - return product; + AssertStmt valid_data_ptr_for_non_empty_array = [&]() { + auto alloc_size = [&]() -> PrimExpr { + PrimExpr product = IntImm(buffer->DefaultIndexType(), 1); + for (const auto& dim : buffer->shape) { + product *= dim; + } + return product; + }(); + return AssertStmt( + alloc_size == 0 || !Call(DataType::Bool(), builtin::isnullptr(), {vptr}), + tvm::tir::StringImm(arg_name + " is expected to have non-NULL data pointer"), nop); }(); - asserts_.emplace_back(AssertStmt( - alloc_size == 0 || !Call(DataType::Bool(), builtin::isnullptr(), {vptr}), - tvm::tir::StringImm(arg_name + " is expected to have non-NULL data pointer"), nop)); + asserts_.push_back(valid_data_ptr_for_non_empty_array); - def_handle_dtype_.Set(vptr, tir::TypeAnnotation(buffer->dtype)); - // mark alignment of external bufs - init_nest_.emplace_back(AttrStmt(vptr, tir::attr::storage_alignment, - IntImm(DataType::Int(32), buffer->data_alignment), nop)); + // AttrStmt provides a @llvm.assume that can be placed in the + // generated LLVM kernel + init_nest_.push_back(AttrStmt(vptr, tir::attr::storage_alignment, + IntImm(DataType::Int(32), buffer->data_alignment), nop)); } } diff --git a/src/tir/usmp/analysis/extract_buffer_info.cc b/src/tir/usmp/analysis/extract_buffer_info.cc index 5abfe24f434d..f447ddaa200f 100644 --- a/src/tir/usmp/analysis/extract_buffer_info.cc +++ b/src/tir/usmp/analysis/extract_buffer_info.cc @@ -244,13 +244,25 @@ void BufferInfoExtractor::RecordAllocateNodeInfo(const AllocateNode* op) { "user-given arguments for memory pools, the default behaviour is a single size " "un-restricted pool is assigned"; PrimFunc func = scope_stack_.top().func; - Optional executor_config = - module_->GetAttr(tvm::attr::kExecutor); - Integer workspace_alignment = 16; - if (executor_config) { - workspace_alignment = - executor_config.value()->GetAttr("workspace-byte-alignment").value_or(16); - } + auto workspace_alignment = [&]() -> Integer { + if (const auto* decl_buffer = op->body.as()) { + ICHECK(decl_buffer->buffer->data.same_as(op->buffer_var)) + << "DeclBuffer of Buffer " << decl_buffer->buffer << " has data ptr " + << decl_buffer->buffer->data + << ", which is mismatched from the parent Allocate's buffer_var of " + << op->buffer_var; + return decl_buffer->buffer->data_alignment; + } + + if (auto executor_config = module_->GetAttr(tvm::attr::kExecutor)) { + if (auto config_alignment = + executor_config.value()->GetAttr("workspace-byte-alignment")) { + return config_alignment.value(); + } + } + + return 
tvm::runtime::kAllocAlignment; + }(); BufferInfoKind bi_kind = BufferInfoKind::kIntermediate; String buffer_info_name = op->buffer_var->name_hint; diff --git a/tests/python/codegen/test_target_codegen_llvm.py b/tests/python/codegen/test_target_codegen_llvm.py index f1316ae3cee0..f81bb8a3b744 100644 --- a/tests/python/codegen/test_target_codegen_llvm.py +++ b/tests/python/codegen/test_target_codegen_llvm.py @@ -382,7 +382,7 @@ def test_alignment(): s = te.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], factor=8) s[B].vectorize(tx) - f = tvm.build(s, [A, B], "llvm", name="test_alignment") + f = tvm.build(s, [A, B], "llvm -opt-level=0", name="test_alignment") lines = f.get_source().split("\n") diff --git a/tests/python/tir-transform/test_tir_argument_alignment.py b/tests/python/tir-transform/test_tir_argument_alignment.py new file mode 100644 index 000000000000..081dae64254f --- /dev/null +++ b/tests/python/tir-transform/test_tir_argument_alignment.py @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-function-docstring,missing-module-docstring + +import pytest + +import tvm +import tvm.testing +from tvm.script import tir as T + + +alignment_required = tvm.testing.parameter(1, 32, 64) +offset = tvm.testing.parameter(0, 3, 32, 64) + + +def test_aligned_dltensor(alignment_required, offset): + """Alignment of buffer arguments checked during DLTensor unpacking + + TVM allocates buffers that are aligned according to the value of + `tvm::runtime::kAllocAlignment`. However, buffers may be + non-aligned, either when provided by an external source, or when + the TVM runtime used for compilation and for execution have a + different value of `tvm::runtime::kAllocAlignment` through the + `TVM_KALLOC_ALIGNMENT` macro definition. In addition, while + `tvm::runtime::kAllocAlignment` is the default alignment for TIR + buffers, it can be overridden on a per-buffer basis. + + This test varies the alignment required by a buffer argument and + the alignment provided by an externally-owned array, validating + that non-aligned buffers may always be converted to TVM, and must + have their alignment validated when calling a PrimFunc. 
+ """ + torch = pytest.importorskip("torch") + + @T.prim_func + def func(a: T.handle): + A = T.match_buffer(a, 16, dtype="int8", align=alignment_required) + T.evaluate(0) + + built = tvm.build(func) + + torch_tensor = torch.arange(128, dtype=torch.int8) + torch_view = torch_tensor[offset : offset + 16] + tvm_array = tvm.nd.from_dlpack(torch_view) + + satisfies_alignment = offset % alignment_required == 0 + if satisfies_alignment: + built(tvm_array) + else: + with pytest.raises(tvm.TVMError): + built(tvm_array) + + +contiguity_test_case = tvm.testing.parameter( + by_dict={ + "entire_first_row": ([4, 16], [0, 0]), + "entire_second_row": ([4, 16], [1, 0]), + "left_half_of_first_row": ([4, 32], [0, 0]), + "right_half_of_first_row": ([4, 32], [0, 16]), + } +) + + +def test_contiguous_dltensor(contiguity_test_case): + """Validate argument buffer is compact when strides are unspecified.""" + torch = pytest.importorskip("torch") + + @T.prim_func + def func(a: T.handle): + A = T.match_buffer(a, [1, 16], dtype="int8", align=1) + T.evaluate(0) + + built = tvm.build(func) + + view_backing_shape, view_offset = contiguity_test_case + torch_tensor = torch.zeros(*view_backing_shape, dtype=torch.int8) + torch_view = torch_tensor[ + view_offset[0] : view_offset[0] + 1, + view_offset[1] : view_offset[1] + 16, + ] + tvm_array = tvm.nd.from_dlpack(torch_view) + + built(tvm_array) + + +strided_test_case = tvm.testing.parameter( + by_dict={ + "entire_buffer": (8, 16), + "split_in_slowest_changing_dim": (32, 16), + "split_in_fastest_changing_dim": (8, 64), + } +) + + +def test_dynamic_striding_on_external_dltensor(strided_test_case): + """External buffers may be strided. + + Validity is checked by the TIR unpacking of the DLTensor, based on + the requirements of the TIR buffer. + """ + torch = pytest.importorskip("torch") + + @T.prim_func + def func(a: T.handle): + stride_i = T.var("int32") + stride_j = T.var("int32") + A = T.match_buffer(a, [8, 16], strides=[stride_i, stride_j], dtype="int8", align=1) + T.evaluate(0) + + built = tvm.build(func) + + torch_tensor = torch.zeros(*strided_test_case, dtype=torch.int8) + torch_view = torch_tensor[:8, :16] + tvm_array = tvm.nd.from_dlpack(torch_view) + + built(tvm_array) + + +def test_static_striding_on_external_dltensor(strided_test_case): + """External buffers may be strided. + + Import of strided arrays from external sources is legal. The + validity for any given PrimFunc is checked by the TIR unpacking of + the DLTensor, based on the requirements of the TIR buffer. 
+ """ + torch = pytest.importorskip("torch") + + @T.prim_func + def func(a: T.handle): + A = T.match_buffer(a, [8, 16], dtype="int8", align=1) + T.evaluate(0) + + built = tvm.build(func) + + torch_tensor = torch.zeros(*strided_test_case, dtype=torch.int8) + torch_view = torch_tensor[:8, :16] + tvm_array = tvm.nd.from_dlpack(torch_view) + + has_correct_striding = strided_test_case[1] == 16 + if has_correct_striding: + built(tvm_array) + else: + with pytest.raises(tvm.TVMError): + built(tvm_array) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/tir-usmp/test_tir_usmp_algo.py b/tests/python/tir-usmp/test_tir_usmp_algo.py index b9cfde485633..886b10c3b2fc 100644 --- a/tests/python/tir-usmp/test_tir_usmp_algo.py +++ b/tests/python/tir-usmp/test_tir_usmp_algo.py @@ -365,8 +365,8 @@ def run_model(input: T.handle, output: T.handle) -> None: @pytest.mark.parametrize( ["algorithm", "fast_memory_size", "slow_memory_size"], [ - ("greedy_by_size", 200704, 1418528), - ("greedy_by_conflicts", 200704, 1418528), + ("greedy_by_size", 200704, 1418560), + ("greedy_by_conflicts", 200704, 1418560), ("hill_climb", 200704, 1117462), ], ) diff --git a/tests/python/tir-usmp/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/tir-usmp/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index 9e9fea7c8152..5de8c83f288e 100644 --- a/tests/python/tir-usmp/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/tir-usmp/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -106,7 +106,7 @@ class LinearStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150560], dtype="uint8", elem_offset=0, align=64, offset_factor=1) placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body @@ -118,7 +118,7 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150560], dtype="int16", elem_offset=0, align=64, offset_factor=1) placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) @@ -178,11 +178,11 @@ class LinearStructurePlanned: @T.prim_func def __tvm_main__(input: T.handle, fast_memory_0_var: T.handle("uint8"), slow_memory_1_var: T.handle("uint8"), output: T.handle) -> None: fast_memory_0_buffer_var = T.match_buffer(fast_memory_0_var, [200704], 
dtype="uint8", strides=[1], elem_offset=0, align=16) - slow_memory_1_buffer_var = T.match_buffer(slow_memory_1_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) + slow_memory_1_buffer_var = T.match_buffer(slow_memory_1_var, [1418560], dtype="uint8", strides=[1], elem_offset=0, align=16) # body T.attr("default", "device_id", 0) T.attr("default", "device_type", 1) - sid_9_let: T.handle("int8") = T.address_of(slow_memory_1_buffer_var[1117472], dtype="handle") + sid_9_let: T.handle("int8") = T.address_of(slow_memory_1_buffer_var[1117504], dtype="handle") sid_8_let: T.handle("int8") = T.address_of(slow_memory_1_buffer_var[0], dtype="handle") T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9_let, fast_memory_0_buffer_var.data, slow_memory_1_buffer_var.data, dtype="int32")) T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9_let, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8_let, fast_memory_0_buffer_var.data, slow_memory_1_buffer_var.data, dtype="int32")) @@ -193,7 +193,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8") T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16") fast_memory_6_buffer_var = T.match_buffer(fast_memory_6_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16) - slow_memory_7_buffer_var = T.match_buffer(slow_memory_7_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) + slow_memory_7_buffer_var = T.match_buffer(slow_memory_7_var, [1418560], dtype="uint8", strides=[1], elem_offset=0, align=16) # body tensor_2_let = T.Buffer([200704], dtype="uint8") with T.LetStmt(T.address_of(fast_memory_6_buffer_var[0], dtype="handle"), var=tensor_2_let.data): @@ -207,23 +207,23 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: @T.prim_func def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle, fast_memory_2_var: T.handle("uint8"), slow_memory_3_var: T.handle("uint8")) -> None: - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8") + placeholder_4 = T.match_buffer(placeholder_2, [150560], dtype="uint8") placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16") T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16") fast_memory_2_buffer_var = T.match_buffer(fast_memory_2_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16) - slow_memory_3_buffer_var = T.match_buffer(slow_memory_3_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) + slow_memory_3_buffer_var = T.match_buffer(slow_memory_3_var, [1418560], dtype="uint8", strides=[1], elem_offset=0, align=16) # body for ax0_ax1_fused_1, ax2_1, ax3_inner_1 in T.grid(224, 224, 3): T_subtract_1[ax0_ax1_fused_1 * 672 + ax2_1 * 3 + ax3_inner_1] = T.cast(placeholder_4[ax0_ax1_fused_1 * 672 + ax2_1 * 3 + ax3_inner_1], "int16") - placeholder_5[0] @T.prim_func def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle, fast_memory_4_var: T.handle("uint8"), slow_memory_5_var: T.handle("uint8")) -> None: - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16") + placeholder_65 = T.match_buffer(placeholder_62, [150560], dtype="int16") placeholder_66 = 
T.match_buffer(placeholder_63, [9408], dtype="int16") placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32") T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8") fast_memory_4_buffer_var = T.match_buffer(fast_memory_4_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16) - slow_memory_5_buffer_var = T.match_buffer(slow_memory_5_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) + slow_memory_5_buffer_var = T.match_buffer(slow_memory_5_var, [1418560], dtype="uint8", strides=[1], elem_offset=0, align=16) # body PaddedInput_7_let = T.Buffer([157323], "int16") with T.LetStmt(T.address_of(slow_memory_5_buffer_var[802816], dtype="handle"), var=PaddedInput_7_let.data):
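
The bundle_deploy tests above replace stack buffers with 64-byte `aligned_alloc` allocations, matching the new `TVM_CRT_ALLOC_ALIGNMENT` default of 64. For a quick host-side spot check of the same property, a small helper is sketched below (illustrative only, not part of the patch; NumPy's allocator makes no 64-byte guarantee, so the check may legitimately report either result):

```python
import numpy as np

def is_aligned(array: np.ndarray, alignment: int = 64) -> bool:
    """Return True if the array's data pointer is aligned to `alignment` bytes."""
    return array.ctypes.data % alignment == 0

x = np.zeros(10 * 5, dtype="float32")
print(is_aligned(x, 64), is_aligned(x, 16))
```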
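The three Rust test-library builders now define the vector add directly in TVMScript. The sketch below shows one way to sanity-check such a PrimFunc from Python before the saved object file is handed to the Rust runtime; it reuses the function from the patch and otherwise assumes only the standard `tvm.build` / `tvm.nd` APIs:

```python
import numpy as np
import tvm
from tvm.script import tir as T

@T.prim_func
def vec_add(var_A: T.handle, var_B: T.handle, var_C: T.handle):
    T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)})
    n = T.int32()
    A = T.match_buffer(var_A, (n,), align=1)
    B = T.match_buffer(var_B, (n,), align=1)
    C = T.match_buffer(var_C, (n,), align=1)
    for i in T.parallel(n):
        with T.block("C"):
            vi = T.axis.spatial(n, i)
            C[vi] = A[vi] + B[vi]

mod = tvm.build(vec_add, target="llvm")
a = tvm.nd.array(np.arange(8, dtype="float32"))
b = tvm.nd.array(np.ones(8, dtype="float32"))
c = tvm.nd.empty((8,), dtype="float32")
mod["main"](a, b, c)
np.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
```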
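`TVMDeviceAllocDataSpace` in crt_runtime_api.c now over-allocates, returns an aligned pointer inside the raw buffer, and stashes the original pointer just before the returned region so that `TVMDeviceFreeDataSpace` can recover it. A minimal Python model of that bookkeeping, assuming an 8-byte pointer and the usual round-up formula (a sketch of the idea, not the literal C implementation):

```python
POINTER_SIZE = 8  # assumed 64-bit target; the C code uses sizeof(void*)

def data_offset(base_addr: int, alignment: int) -> int:
    """Offset from the raw allocation to the aligned, caller-visible data region."""
    first_allowed = base_addr + POINTER_SIZE              # leave room for the back-pointer
    pad = (alignment - first_allowed % alignment) % alignment
    return POINTER_SIZE + pad

# The raw allocation therefore needs nbytes + POINTER_SIZE + (alignment - 1) bytes.
for base in (0x1000, 0x1003, 0x10FF):
    off = data_offset(base, 64)
    assert (base + off) % 64 == 0                         # returned pointer is aligned
    assert POINTER_SIZE <= off < POINTER_SIZE + 64        # fits inside the over-allocation
# On free, the original pointer is read back from the slot at (data_ptr - POINTER_SIZE).
```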
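With the contiguity and alignment ICHECKs removed from `NDArray::FromDLPack`, importing an offset or strided view now succeeds, and the requirements are enforced only when the array reaches a PrimFunc, as the new tests show. A minimal illustration (torch is used only as a convenient DLPack producer, mirroring the tests):

```python
import tvm
import torch  # any DLPack-exporting framework would do

backing = torch.arange(128, dtype=torch.int8)
view = backing[3:19]              # 16 elements at a 3-byte offset into the storage
arr = tvm.nd.from_dlpack(view)    # import no longer rejects the misaligned view

# Whether `arr` is usable depends on the PrimFunc: a buffer declared with
# align=1 accepts it, while align=64 raises tvm.TVMError at call time.
```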
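The assert that `ArgBinder::BindDLTensor` now emits boils down to `floormod(reinterpret(uintN, data_ptr), data_alignment) == 0` for device types whose `data` field is a plain pointer. Worked arithmetic for the two outcomes (the addresses are made up for illustration):

```python
data_alignment = 64
aligned_ptr   = 0x7F3A0040   # low 6 bits are zero, i.e. a multiple of 64
unaligned_ptr = 0x7F3A0043

assert aligned_ptr % data_alignment == 0     # assert passes, kernel runs
assert unaligned_ptr % data_alignment != 0   # assert fires:
# "<arg>.data is expected to be aligned to 64 bytes"
```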
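`RecordAllocateNodeInfo` now selects the workspace alignment with a fixed precedence: an enclosing DeclBuffer's `data_alignment`, then the executor's `workspace-byte-alignment` attribute, then `kAllocAlignment` (64 by default). The same precedence written out as a small illustrative helper (the names are not TVM APIs):

```python
K_ALLOC_ALIGNMENT = 64  # tvm::runtime::kAllocAlignment

def workspace_alignment(decl_buffer_alignment=None, executor_alignment=None):
    """Alignment recorded for an Allocate node during USMP buffer-info extraction."""
    if decl_buffer_alignment is not None:   # explicit DeclBuffer under the Allocate
        return decl_buffer_alignment
    if executor_alignment is not None:      # executor's workspace-byte-alignment attr
        return executor_alignment
    return K_ALLOC_ALIGNMENT                # runtime default

assert workspace_alignment() == 64
assert workspace_alignment(executor_alignment=16) == 16
assert workspace_alignment(decl_buffer_alignment=128, executor_alignment=16) == 128
```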