From f813889c81efa9dc649edf178d2007a6064d1d94 Mon Sep 17 00:00:00 2001
From: Denis Khalikov
Date: Fri, 28 Dec 2018 11:48:07 +0300
Subject: [PATCH] [RUNTIME][TRACE] Support trace primitive.

Add a trace call expression that allows tracing Tensor data at
runtime. By default a handler that prints the traced data to stdout
is used; alternatively, the user can register a callback as a global
function (via @tvm.register_func) and pass its name as the trace
action.

Related discussion:
https://discuss.tvm.ai/t/idea-trace-expression/945
---
 include/tvm/ir.h                              |  30 +++
 python/tvm/intrin.py                          |  36 ++++
 src/codegen/llvm/codegen_cpu.cc               |  99 +++++++---
 src/codegen/llvm/codegen_cpu.h                |   7 +
 src/pass/lower_tvm_builtin.cc                 |  53 +++++-
 .../unittest/test_runtime_packed_func.py      | 179 ++++++++++++++++++
 6 files changed, 374 insertions(+), 30 deletions(-)

diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index adaffa77dae6..3ef955e834d0 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -393,6 +393,17 @@ constexpr const char* tvm_stack_make_array = "tvm_stack_make_array";
  * }
  */
 constexpr const char* tvm_call_packed = "tvm_call_packed";
+/*!
+ * \brief See pseudo code
+ *
+ *  int tvm_call_trace_packed(name, TVMValue* args) {
+ *     ModuleNode* env = GetCurrentEnv();
+ *     const PackedFunc* f = env->GetFuncFromEnv(name);
+ *     (*f)(args, type_code_of(args), len(args));
+ *     return 0;
+ *  }
+ */
+constexpr const char *tvm_call_trace_packed = "tvm_call_trace_packed";
 /*!
  * \brief See pesudo code
  * Mark the content as thread local context, can get optimized
@@ -422,6 +433,25 @@ constexpr const char* tvm_thread_context = "tvm_thread_context";
  * }
  */
 constexpr const char* tvm_call_packed_lowered = "tvm_call_packed_lowered";
+/*!
+ * \brief Lowered version of the trace intrinsic; the space for values
+ *  and type codes is explicitly allocated. The return value is the
+ *  (end - 1) value on the stack.
+ *
+ *  int tvm_call_trace_packed_lowered(name,
+ *                                    TVMValue* value_stack,
+ *                                    int* tcode_stack,
+ *                                    int begin,
+ *                                    int end) {
+ *     ModuleNode* env = GetCurrentEnv();
+ *     const PackedFunc* f = env->GetFuncFromEnv(name);
+ *     f->CallPacked(TVMArgs(value_stack[begin:end],
+ *                           tcode_stack[begin:end]),
+ *                   TVMRetValue(value_stack + end, tcode_stack + end));
+ *  }
+ */
+constexpr const char *tvm_call_trace_packed_lowered =
+    "tvm_call_trace_packed_lowered";
 /*!
  * \brief See pseudo code
  *
diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py
index 3207b6112b1d..ea8d2eafc686 100644
--- a/python/tvm/intrin.py
+++ b/python/tvm/intrin.py
@@ -488,6 +488,42 @@ def _rule_float_direct(op):
         return call_pure_extern(op.dtype, op.name, *op.args)
     return None
 
+@_register_func("tvm.default_trace_action")
+def _tvm_default_trace_action(*args):
+    print(list(args))
+
+def trace(args, trace_action="tvm.default_trace_action"):
+    """Trace tensor data at runtime.
+
+    The trace function allows tracing specific tensor values at
+    runtime. The value to be traced must be the last argument.
+    A trace action can be specified; by default
+    tvm.default_trace_action is used.
+
+    Parameters
+    ----------
+    args : list of Expr or Buffer
+        Positional arguments.
+
+    trace_action : str
+        The name of the trace action.
+
+    Returns
+    -------
+    call : Expr
+        The call expression.
+
+    See Also
+    --------
+    tvm.call_packed : Creates a call to a packed function.
+ """ + if not isinstance(args, list): + raise Exception("tvm.trace consumes the args as list type") + call_args = [_pack_buffer(x) if isinstance(x, _Buffer) else x for x in args] + call_args.insert(0, trace_action) + return _make.Call( + args[-1].dtype, "tvm_call_trace_packed", call_args, _Call.Intrinsic, None, 0) + # opencl pattern for exp register_intrin_rule("opencl", "exp", _rule_float_direct, override=True) # default pattern for exp diff --git a/src/codegen/llvm/codegen_cpu.cc b/src/codegen/llvm/codegen_cpu.cc index 4e005346624b..fcad0f7b2903 100644 --- a/src/codegen/llvm/codegen_cpu.cc +++ b/src/codegen/llvm/codegen_cpu.cc @@ -526,42 +526,81 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) { return phi; } -llvm::Value* CodeGenCPU::CreateCallPacked(const Call* op) { - CHECK_EQ(op->args.size(), 5U); - std::string func_name = op->args[0].as()->value; - llvm::Value* handle = GetPackedFuncHandle(func_name); +llvm::BasicBlock * +CodeGenCPU::MakeCallPacked(const Array &args, llvm::Value **rvalue, + llvm::Value **ret_tcode, const Type &r_type, + const int64_t begin, const int64_t end) { + using llvm::BasicBlock; + std::string func_name = args[0].as()->value; + llvm::Value *handle = GetPackedFuncHandle(func_name); // call the function - int64_t begin = op->args[3].as()->value; - int64_t end = op->args[4].as()->value; int64_t nargs = end - begin; CHECK_GE(nargs, 0); - llvm::Value* stack_value = MakeValue(op->args[1]); - llvm::Value* stack_tcode = MakeValue(op->args[2]); - llvm::Value* arg_value = builder_->CreateInBoundsGEP( - builder_->CreatePointerCast( - stack_value, t_tvm_value_->getPointerTo()), ConstInt32(begin)); - llvm::Value* arg_tcode = CreateBufferPtr( - Int(32), stack_tcode, ConstInt32(begin)); - llvm::Value* ret_value = builder_->CreateInBoundsGEP( - builder_->CreatePointerCast( - stack_value, t_tvm_value_->getPointerTo()), ConstInt32(end)); - llvm::Value* ret_tcode = CreateBufferPtr( - Int(32), stack_tcode, ConstInt32(end)); - CheckCallSuccess( - builder_->CreateCall( - RuntimeTVMFuncCall(), - {handle, arg_value, arg_tcode, ConstInt32(nargs), - ret_value, ret_tcode})); - Type r_type = op->type; + llvm::Value *stack_value = MakeValue(args[1]); + llvm::Value *stack_tcode = MakeValue(args[2]); + llvm::Value *arg_value = builder_->CreateInBoundsGEP( + builder_->CreatePointerCast(stack_value, t_tvm_value_->getPointerTo()), + ConstInt32(begin)); + llvm::Value *arg_tcode = + CreateBufferPtr(Int(32), stack_tcode, ConstInt32(begin)); + llvm::Value *ret_value = builder_->CreateInBoundsGEP( + builder_->CreatePointerCast(stack_value, t_tvm_value_->getPointerTo()), + ConstInt32(end)); + *ret_tcode = CreateBufferPtr(Int(32), stack_tcode, ConstInt32(end)); + BasicBlock *end_block = CheckCallSuccess(builder_->CreateCall( + RuntimeTVMFuncCall(), {handle, arg_value, arg_tcode, ConstInt32(nargs), + ret_value, *ret_tcode})); Type r_api_type = ir::APIType(r_type); - llvm::Value* rvalue = - builder_->CreateAlignedLoad( - builder_->CreatePointerCast( - ret_value, LLVMType(r_api_type)->getPointerTo()), 8); - rvalue = CreateCast(r_api_type, r_type, rvalue); + *rvalue = builder_->CreateAlignedLoad( + builder_->CreatePointerCast(ret_value, + LLVMType(r_api_type)->getPointerTo()), + 8); + *rvalue = CreateCast(r_api_type, r_type, *rvalue); + return end_block; +} + +llvm::Value *CodeGenCPU::CreateCallPacked(const Call *op) { + CHECK_EQ(op->args.size(), 5U); + llvm::Value *rvalue = nullptr; + llvm::Value *ret_tcode = nullptr; + MakeCallPacked(op->args, &rvalue, &ret_tcode, op->type, + 
+                 op->args[3].as<IntImm>()->value,
+                 op->args[4].as<IntImm>()->value);
   return rvalue;
 }
 
+llvm::Value *CodeGenCPU::CreateCallTracePacked(const Call *op) {
+  using llvm::BasicBlock;
+  CHECK_EQ(op->args.size(), 6U);
+  llvm::Value *rvalue = nullptr;
+  llvm::Value *ret_tcode = nullptr;
+  BasicBlock *end_block = MakeCallPacked(
+      op->args, &rvalue, &ret_tcode, op->type, op->args[3].as<IntImm>()->value,
+      op->args[4].as<IntImm>()->value);
+  // Get traced value.
+  llvm::Value *traced_value = MakeValue(op->args[5]);
+  // update_block handles the case when we need to update the return value.
+  BasicBlock *update_block =
+      BasicBlock::Create(*ctx_, "update_block", function_);
+  // continue_block handles the case when we return the original
+  // traced value.
+  BasicBlock *continue_block =
+      BasicBlock::Create(*ctx_, "continue_block", function_);
+  llvm::Value *ret_tcode_value = builder_->CreateAlignedLoad(ret_tcode, 8);
+  // Check the return type code and create the cmp instruction.
+  llvm::Value *cmp = builder_->CreateICmpNE(
+      ret_tcode_value, llvm::ConstantInt::get(t_int_, kNull));
+  builder_->CreateCondBr(cmp, update_block, continue_block);
+  builder_->SetInsertPoint(update_block);
+  builder_->CreateBr(continue_block);
+  builder_->SetInsertPoint(continue_block);
+  // The return value depends on which basic block we came from.
+  llvm::PHINode *phi_rvalue = builder_->CreatePHI(traced_value->getType(), 2);
+  phi_rvalue->addIncoming(rvalue, update_block);
+  phi_rvalue->addIncoming(traced_value, end_block);
+  return phi_rvalue;
+}
+
 llvm::Value* CodeGenCPU::RuntimeTVMFuncCall() {
   if (f_tvm_func_call_ != nullptr) return f_tvm_func_call_;
   return GetContextPtr(gv_tvm_func_call_);
@@ -608,6 +647,8 @@ void CodeGenCPU::AddStartupFunction() {
 llvm::Value* CodeGenCPU::CreateIntrinsic(const Call* op) {
   if (op->is_intrinsic(intrinsic::tvm_call_packed_lowered)) {
     return CreateCallPacked(op);
+  } else if (op->is_intrinsic(intrinsic::tvm_call_trace_packed_lowered)) {
+    return CreateCallTracePacked(op);
   } else if (op->is_intrinsic(intrinsic::tvm_static_handle)) {
     return CreateStaticHandle();
   } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) {
diff --git a/src/codegen/llvm/codegen_cpu.h b/src/codegen/llvm/codegen_cpu.h
index b7a95a835d89..464b06042b15 100644
--- a/src/codegen/llvm/codegen_cpu.h
+++ b/src/codegen/llvm/codegen_cpu.h
@@ -79,8 +79,15 @@ class CodeGenCPU : public CodeGenLLVM {
   void UnpackClosureData(llvm::Value*cdata,
                          const Array<Var>& fields,
                          std::unordered_map<const Variable*, llvm::Value*>* vmap);
+  // Make packed call.
+  llvm::BasicBlock *MakeCallPacked(const Array<Expr> &args,
+                                   llvm::Value **rvalue,
+                                   llvm::Value **ret_tcode, const Type &r_type,
+                                   const int64_t begin, const int64_t end);
   // create call into tvm packed function.
   llvm::Value* CreateCallPacked(const Call* op);
+  // Create trace call into tvm packed function.
+  llvm::Value* CreateCallTracePacked(const Call *op);
   // Create static initialization
   void CreateStaticInit(const std::string& init_fname, const Stmt& body);
   // Create parallel launch
diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc
index cf3d9f7eeeb1..7820967c4b15 100644
--- a/src/pass/lower_tvm_builtin.cc
+++ b/src/pass/lower_tvm_builtin.cc
@@ -54,7 +54,6 @@ class BuiltinLower : public IRMutator {
     stmt = IRMutator::Mutate(stmt);
     CHECK_EQ(run_shape_stack_, 0);
     CHECK_EQ(run_array_stack_, 0);
-    CHECK_EQ(run_arg_stack_, 0);
     while (prep_seq_.size() != 0) {
       stmt = Block::make(prep_seq_.back(), stmt);
       prep_seq_.pop_back();
@@ -140,6 +139,8 @@ class BuiltinLower : public IRMutator {
   Expr Mutate_(const Call* op, const Expr &e) final {
     if (op->is_intrinsic(intrinsic::tvm_call_packed)) {
       return MakeCallPacked(op, e);
+    } else if (op->is_intrinsic(intrinsic::tvm_call_trace_packed)) {
+      return MakeCallTracePacked(op, e);
     } else if (op->is_intrinsic(intrinsic::tvm_stack_make_shape)) {
       return MakeShape(op, e);
     } else if (op->is_intrinsic(intrinsic::tvm_stack_make_array)) {
@@ -256,6 +257,56 @@ class BuiltinLower : public IRMutator {
         packed_args, Call::Intrinsic);
   }
 
+  Expr MakeCallTracePacked(const Call *op, const Expr &e) {
+    size_t restore_shape_stack = run_shape_stack_;
+    size_t restore_array_stack = run_array_stack_;
+    size_t arg_stack_begin = run_arg_stack_;
+    run_arg_stack_ += op->args.size();
+    size_t args_size = op->args.size();
+    CHECK_GT(args_size, 0);
+    Expr expr = IRMutator::Mutate_(op, e);
+    op = expr.as<Call>();
+    for (size_t i = 1; i < op->args.size(); ++i) {
+      Expr stack_index = ConstInt32(arg_stack_begin + i - 1);
+      Expr arg = op->args[i];
+      Type t = arg.type();
+      Type api_type = APIType(t);
+      if (t != api_type) {
+        arg = Cast::make(api_type, arg);
+      }
+      prep_seq_.emplace_back(TVMStructSet(
+          stack_value_, static_cast<int>(arg_stack_begin + i - 1),
+          intrinsic::kTVMValueContent, arg));
+      int arg_tcode = api_type.code();
+      CHECK(!IsArrayHandle(arg)) << "Trace does not support Buffers";
+      prep_seq_.emplace_back(
+          Store::make(stack_tcode_,
+                      ConstInt32(arg_tcode),
+                      stack_index, const_true(1)));
+    }
+    // Update the maximum stack sizes.
+    max_arg_stack_ = std::max(run_arg_stack_, max_arg_stack_);
+    max_shape_stack_ = std::max(run_shape_stack_, max_shape_stack_);
+    max_array_stack_ = std::max(run_array_stack_, max_array_stack_);
+    run_shape_stack_ = restore_shape_stack;
+    run_array_stack_ = restore_array_stack;
+    // Update the top of the stack, so that the arguments of more than
+    // one packed function can share the same stack.
+    run_arg_stack_ = arg_stack_begin + args_size - 1;
+    Array<Expr> packed_args = {
+      op->args[0],
+      stack_value_,
+      stack_tcode_,
+      ConstInt32(arg_stack_begin),
+      ConstInt32(arg_stack_begin + op->args.size() - 1),
+      // Pass traced value.
+      op->args[args_size - 1]
+    };
+    return Call::make(
+        op->type, intrinsic::tvm_call_trace_packed_lowered,
+        packed_args, Call::Intrinsic);
+  }
+
  private:
  bool IsArrayHandle(const Expr& arg) {
    // specially set array handle.
diff --git a/tests/python/unittest/test_runtime_packed_func.py b/tests/python/unittest/test_runtime_packed_func.py index 2d7d0197640b..734deac22487 100644 --- a/tests/python/unittest/test_runtime_packed_func.py +++ b/tests/python/unittest/test_runtime_packed_func.py @@ -80,6 +80,177 @@ def test_ctx_func(ctx): x = tvm._api_internal._context_test(x, x.device_type, x.device_id) assert x == tvm.opencl(10) +def test_trace_default_action(): + n = 2 + x = tvm.placeholder((n,n,n), name="X", dtype="float32") + y = tvm.compute(x.shape, lambda i, j, k: tvm.trace([i, j, k, x[i][j][k]])) + s = tvm.create_schedule(y.op) + f = tvm.build(s, [x, y], target="llvm") + xnd = tvm.nd.array(np.ones((n,n,n), dtype=x.dtype)) + ynd = tvm.nd.array(np.zeros((n,n,n), dtype=y.dtype)) + f(xnd, ynd) + +def test_trace_expr_assign(): + @tvm.register_func("tvm.trace_callback2") + def trace_buffer(x): + return + + def check_assign(dtype): + n = 4 + x = tvm.placeholder((n,n,n), name="X", dtype=dtype) + y = tvm.compute(x.shape, lambda i, j, k: tvm.trace([x[i][j][k]], "tvm.trace_callback2")) + z = tvm.compute(x.shape, lambda i, j, k: tvm.trace([y[i][j][k]], "tvm.trace_callback2")) + s = tvm.create_schedule(z.op) + f = tvm.build(s, [x, y, z], "llvm") + + xnd = tvm.nd.array(np.ones((n,n,n), dtype=x.dtype)) + ynd = tvm.nd.array(np.zeros((n,n,n), dtype=y.dtype)) + znd = tvm.nd.array(np.zeros((n,n,n), dtype=z.dtype)) + f(xnd, ynd, znd) + + assert(np.array_equal(xnd.asnumpy(), np.ones((n,n,n)))) + assert(np.array_equal(ynd.asnumpy(), np.ones((n,n,n)))) + assert(np.array_equal(znd.asnumpy(), np.ones((n,n,n)))) + + for t in ["float64", "float32", "int64", "int32"]: + check_assign(t) + +def test_trace_expr_sum_generated(): + @tvm.register_func("tvm.trace_callback3") + def trace_buffer(x): + return + + def check_expr_sum(dtype): + n = 4 + a = tvm.placeholder((n,n,n), name="a", dtype=dtype) + b = tvm.placeholder((n,n,n), name="b", dtype=dtype) + c = tvm.compute(a.shape, lambda i, j, k: tvm.trace([a[i][j][k]],"tvm.trace_callback3") + + tvm.trace([b[i][j][k]],"tvm.trace_callback3")) + s = tvm.create_schedule(c.op) + f = tvm.build(s, [a, b, c]) + xnd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=a.dtype))) + ynd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=b.dtype))) + znd = tvm.nd.array(np.zeros((n,n,n), dtype=c.dtype)) + f(xnd, ynd, znd) + assert(np.array_equal(znd.asnumpy(), xnd.asnumpy() + ynd.asnumpy())) + + for t in ["float64", "float32", "int64", "int32"]: + check_expr_sum(t) + +def test_trace_expr_sum_args(): + @tvm.register_func("tvm.trace_silent") + def silent(*args): + return + + def check_expr_sum(dtype): + n = 4 + a = tvm.placeholder((n,n,n), name="a", dtype=dtype) + b = tvm.placeholder((n,n,n), name="b", dtype=dtype) + e = tvm.placeholder((n,n,n), name="e", dtype=dtype) + d = tvm.placeholder((n,n,n), name="d", dtype=dtype) + + c = tvm.compute(a.shape, lambda i, j, k: tvm.trace([i, j, k, a[i][j][k]], "tvm.trace_silent") + + tvm.trace([i, j, k, b[i][j][k]], "tvm.trace_silent") + + tvm.trace([i, j, k, d[i][j][k]], "tvm.trace_silent") + + tvm.trace([i, j, k, e[i][j][k]], "tvm.trace_silent")) + s = tvm.create_schedule(c.op) + f = tvm.build(s, [a, b, d, e, c]) + a_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=a.dtype))) + b_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=b.dtype))) + d_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=d.dtype))) + e_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=e.dtype))) + c_nd = tvm.nd.array(np.zeros((n,n,n), dtype=c.dtype)) + f(a_nd, b_nd, d_nd, e_nd, c_nd) + 
assert(np.array_equal(c_nd.asnumpy(), a_nd.asnumpy() + + b_nd.asnumpy() + + d_nd.asnumpy() + + e_nd.asnumpy())) + + for t in ["float64", "float32", "int64", "int32"]: + check_expr_sum(t) + +def test_trace_expr_sum_custom(): + @tvm.register_func("tvm.trace_callback4") + def trace_buffer(x): + return + + def check_expr_sum_custom(dtype): + n = 4 + a = tvm.placeholder((n,n), name="a", dtype=dtype) + b = tvm.placeholder((n,n), name="b", dtype=dtype) + c = tvm.compute(a.shape, lambda i,j: tvm.trace([a[i][j]], "tvm.trace_callback4") + + tvm.trace([b[i][j]], "tvm.trace_callback4")) + s = tvm.create_schedule(c.op) + f = tvm.build(s, [a, b, c]) + npa = np.array([[1,0,0,0], [0,1,0,0],[0,0,1,0],[0,0,0,1]], dtype=a.dtype) + npb = np.array([[1,0,0,0], [0,1,0,0],[0,0,1,0],[0,0,0,1]], dtype=a.dtype) + xnd = tvm.nd.array(npa) + ynd = tvm.nd.array(npb) + znd = tvm.nd.array(np.zeros((n,n), dtype=c.dtype)) + f(xnd, ynd, znd) + assert(np.array_equal(znd.asnumpy(), npa + npb)) + + for t in ["float64", "float32", "int64", "int32"]: + check_expr_sum_custom(t) + +def test_trace_can_change_traced_value_int(): + @tvm.register_func("tvm.trace_change_int_first") + def trace_buffer(x): + return 13 + + @tvm.register_func("tvm.trace_change_int_second") + def trace_buffer(x): + return 14 + + def check_assign(dtype): + n = 4 + x = tvm.placeholder((n,), name="X", dtype=dtype) + y = tvm.compute(x.shape, lambda i: tvm.trace([x[i]], "tvm.trace_change_int_first")) + z = tvm.compute(x.shape, lambda i: tvm.trace([y[i]], "tvm.trace_change_int_second")) + s = tvm.create_schedule(z.op) + f = tvm.build(s, [x, y, z], "llvm") + + xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype)) + ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype)) + znd = tvm.nd.array(np.zeros((n,), dtype=z.dtype)) + f(xnd, ynd, znd) + check_array_first = np.array([13, 13, 13, 13]) + check_array_second = np.array([14, 14, 14, 14]) + assert(np.array_equal(ynd.asnumpy(), check_array_first)) + assert(np.array_equal(znd.asnumpy(), check_array_second)) + + for t in ["int64", "int32"]: + check_assign(t) + +def test_trace_can_change_traced_value_float(): + @tvm.register_func("tvm.trace_change_float_first") + def trace_buffer(x): + return 13.0 + + @tvm.register_func("tvm.trace_change_float_second") + def trace_buffer(x): + return 14.0 + + def check_assign(dtype): + n = 4 + x = tvm.placeholder((n,), name="X", dtype=dtype) + y = tvm.compute(x.shape, lambda i: tvm.trace([x[i]], "tvm.trace_change_float_first")) + z = tvm.compute(x.shape, lambda i: tvm.trace([y[i]], "tvm.trace_change_float_second")) + s = tvm.create_schedule(z.op) + f = tvm.build(s, [x, y, z], "llvm") + + xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype)) + ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype)) + znd = tvm.nd.array(np.zeros((n,), dtype=z.dtype)) + f(xnd, ynd, znd) + check_array_first = np.array([13.0, 13.0, 13.0, 13.0]) + check_array_second = np.array([14.0, 14.0, 14.0, 14.0]) + assert(np.array_equal(ynd.asnumpy(), check_array_first)) + assert(np.array_equal(znd.asnumpy(), check_array_second)) + + for t in ["float64", "float32"]: + check_assign(t) + if __name__ == "__main__": test_empty_array() test_get_global() @@ -88,3 +259,11 @@ def test_ctx_func(ctx): test_return_func() test_byte_array() test_ctx() + test_trace_expr_assign() + test_trace_expr_sum_generated() + test_trace_expr_sum_custom() + test_trace_expr_sum_args() + test_trace_default_action() + test_trace_can_change_traced_value_int() + test_trace_can_change_traced_value_float() +
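
Usage example (illustration only, not part of the diff above): a minimal
sketch that mirrors test_trace_expr_assign, assuming the Python API added in
python/tvm/intrin.py. The callback name "tvm.my_trace" is invented for this
example; any name registered via @tvm.register_func works.

    import numpy as np
    import tvm

    # Custom trace action: called at runtime with the traced value.
    # Returning a non-None value replaces the traced element (see
    # test_trace_can_change_traced_value_*); returning nothing keeps it.
    @tvm.register_func("tvm.my_trace")
    def my_trace(value):
        print("traced:", value)

    n = 4
    x = tvm.placeholder((n,), name="x", dtype="float32")
    # Trace each element of x as it is read; the traced value is the last
    # element of the args list passed to tvm.trace.
    y = tvm.compute(x.shape, lambda i: tvm.trace([x[i]], "tvm.my_trace"))
    s = tvm.create_schedule(y.op)
    f = tvm.build(s, [x, y], target="llvm")

    xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype))
    ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype))
    f(xnd, ynd)  # prints one line per traced element via tvm.my_trace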