Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions src/relay/backend/contrib/codegen_c/codegen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,7 @@ class CodegenC : public MemoizedExprTranslator<std::vector<Output>>, public Code
for (size_t i = 0; i < out_shape.size(); ++i) {
out_size *= out_shape[i];
}
buf_stream << dtype << "* " << out << " = (" << dtype << "*)std::malloc(4 * " << out_size
<< ");";
buf_stream << dtype << "* " << out << " = (" << dtype << "*)malloc(4 * " << out_size << ");";
buf_decl_.push_back(buf_stream.str());

decl_stream << ", " << out << ");";
Expand Down Expand Up @@ -229,25 +228,33 @@ class CSourceCodegen : public CSourceModuleCodegenBase {
String func_name = std::get<1>(res);

// Create headers
code_stream_ << "#include <cstring>\n";
code_stream_ << "#include <vector>\n";
code_stream_ << "#include <stdio.h>\n";
code_stream_ << "#include <stdlib.h>\n";
code_stream_ << "#include <string.h>\n";
code_stream_ << "#include <tvm/runtime/c_runtime_api.h>\n";
code_stream_ << "#include <tvm/runtime/container.h>\n";
code_stream_ << "#include <tvm/runtime/packed_func.h>\n";
code_stream_ << "#include <dlpack/dlpack.h>\n";
code_stream_ << "using namespace tvm::runtime;\n";
code_stream_ << "#include <tvm/runtime/c_backend_api.h>\n";
if (!variables.empty()) {
// This segment would be generated in C++ because of the usage
// of tvm::runtime::Array. This is not ideal, but this to demonstrate
// constant copying process used packed imports in other external
// codegen. Moreover, in uTVM we don't expect this part to be generated.
code_stream_ << "#ifdef __cplusplus\n";
code_stream_ << "#include <tvm/runtime/ndarray.h>\n";
code_stream_ << "#include <tvm/runtime/packed_func.h>\n";
code_stream_ << "#endif\n";
}

// Append some common macro for operator definition.
const char* operator_macro = R"op_macro(
#define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_, p_DTYPE) \
extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \
void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \
for (int64_t i = 0; i < p_DIM1_; ++i) { \
out[i] = a[i] p_OP_ b[i]; \
} \
}

#define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_, p_DTYPE) \
extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \
void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \
for (int64_t i = 0; i < p_DIM1_; ++i) { \
for (int64_t j = 0; j < p_DIM2_; ++j) { \
int64_t k = i * p_DIM2_ + j; \
Expand Down
104 changes: 85 additions & 19 deletions src/relay/backend/contrib/codegen_c/codegen_c.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,40 @@ class CodegenCBase {
indent_ -= 2;
}

/*!
 * \brief Emits the opening of a runtime C function following the
 *        TVM backend packed-call convention (args/type_code/num_args/
 *        out_value/out_type_code), wrapped in an extern "C" guard so the
 *        symbol is unmangled when the generated code is compiled as C++.
 * \param func_name The name of the generated function.
 *
 * \note The emitted code leaves the function body open ("{"); the caller is
 *       responsible for emitting the body and the closing brace.
 */
void PrintRuntimeFunctionHeader(const std::string& func_name) {
  // Guard so the generated source builds as either C or C++.
  code_stream_ << "#ifdef __cplusplus\n"
               << "extern \"C\" {\n"
               << "#endif\n";
  // Signature matches TVMBackendPackedCFunc.
  code_stream_ << "TVM_DLL int32_t " << func_name << "(" << "TVMValue* args, "
               << "int* type_code, " << "int num_args, " << "TVMValue* out_value, "
               << "int* out_type_code) {\n";
}

/*!
 * \brief Emits one statement binding input argument \p idx of the packed
 *        TVMValue array to a local DLTensor* named "arg<idx>".
 * \param idx The position of the argument in the packed args array.
 */
void PrintArgToData(int idx) {
  PrintIndents();
  code_stream_ << "DLTensor* arg" << idx << " = "
               << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n";
}

/*!
 * \brief Emits one statement binding return value \p idx to a local
 *        DLTensor* named "ret<idx>".
 * \param idx The position of the return value in the packed args array.
 *
 * \note Return tensors are passed through the same `args` array as inputs
 *       (callers pass idx offset by the number of inputs), so this reads
 *       from `args`, not from a separate rets array.
 */
void PrintRetToData(int idx) {
  PrintIndents();
  code_stream_ << "DLTensor* ret" << idx << " = "
               << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n";
}

/*!
* \brief Generate C code for the external function.
*
Expand All @@ -100,12 +134,12 @@ class CodegenCBase {
* Array<NDArray> foo_consts;
*
* // An example code for the generated C function.
* extern "C" int foo_wrapper_(DLTensor* arg0,
* int foo_wrapper_(DLTensor* arg0,
* DLTensor* arg1,
* DLTensor* out) {
* foo_(static_cast<float*>(arg0->data),
* static_cast<float*>(arg1->data),
* static_cast<float*>(out->data));
* foo_((float*)(arg0->data),
* (float*)(arg1->data),
* (float*)(out->data));
* return 0;
* }
*
Expand All @@ -124,7 +158,8 @@ class CodegenCBase {
const std::string& const_arr_name, const std::vector<Output>& outs) {
// Print signature
code_stream_ << "\n";
code_stream_ << "extern \"C\" int " << func_name << "_wrapper_(";

code_stream_ << "int " << func_name << "_wrapper_(";
for (size_t i = 0; i < args.size(); i++) {
code_stream_ << "DLTensor* arg" << i << ",\n";
code_stream_ << "\t";
Expand All @@ -142,26 +177,54 @@ class CodegenCBase {
code_stream_ << func_name << "_(";
for (size_t i = 0; i < args.size(); i++) {
const auto& dtype_str = GetDtypeString(args[i]);
code_stream_ << "static_cast<" << dtype_str << "*>(arg" << i << "->data),\n";
code_stream_ << "(" << dtype_str << "*)(arg" << i << "->data),\n";
PrintIndents();
}
for (size_t i = 0; i < outs.size() - 1; i++) {
code_stream_ << "static_cast<" << outs[i].dtype << "*>(out" << i << "->data),\n";
code_stream_ << "(" << outs[i].dtype << "*)(out" << i << "->data),\n";
PrintIndents();
}
code_stream_ << "static_cast<" << outs.back().dtype << "*>(out" << outs.size() - 1
<< "->data));\n";
code_stream_ << "(" << outs.back().dtype << "*)(out" << outs.size() - 1 << "->data));\n";
PrintIndents();
code_stream_ << "return 0;\n";
ExitScope();
code_stream_ << "}\n\n";

// Generate the macro
code_stream_ << "TVM_DLL_EXPORT_TYPED_FUNC(" << func_name << ", " << func_name
<< "_wrapper_);\n\n";
// Create the external function
PrintRuntimeFunctionHeader(func_name);
EnterScope();
for (size_t i = 0; i < args.size(); i++) {
PrintArgToData(i);
}
for (size_t i = 0; i < outs.size(); i++) {
PrintRetToData(args.size() + i);
}
PrintIndents();
code_stream_ << func_name << "_wrapper_(";
for (size_t i = 0; i < args.size(); i++) {
code_stream_ << "arg" << i << ",";
}
for (size_t i = 0; i < outs.size() - 1; i++) {
code_stream_ << "ret" << args.size() + i << ",";
}
code_stream_ << "ret" << args.size() + outs.size() - 1 << ");\n";
PrintIndents();
code_stream_ << "return 0;\n";
ExitScope();
code_stream_ << "}\n";
code_stream_ << "#ifdef __cplusplus\n";
code_stream_ << "}\n";
code_stream_ << "#endif\n";

if (!const_arr_name.empty()) {
code_stream_ << "int " << func_name << "_init_wrapper_(Array<NDArray> arr) {\n";
// If there are constants, insert the __init_ and the wrapper
// This segment would be generated in C++ because of the usage
// of tvm::runtime::Array. This is not ideal, but this to demonstrate
// constant copying process used packed imports in other external
// codegen. Moreover, in uTVM we dont expect this part to be generated.
code_stream_ << "#ifdef __cplusplus\n";
code_stream_ << "int " << func_name
<< "_init_wrapper_(tvm::runtime::Array<tvm::runtime::NDArray> arr) {\n";
EnterScope();
PrintIndents();
code_stream_ << func_name << "_consts = arr;\n";
Expand All @@ -170,6 +233,7 @@ class CodegenCBase {
code_stream_ << "}\n\n";
code_stream_ << "TVM_DLL_EXPORT_TYPED_FUNC(__init_" << func_name << ", " << func_name
<< "_init_wrapper_);\n\n";
code_stream_ << "#endif\n";
}
}

Expand Down Expand Up @@ -202,11 +266,13 @@ class CodegenCBase {
const std::vector<Output>& outs) {
// Create a declaration for global ndarrays that contain constant data.
if (!const_arr_name.empty()) {
code_stream_ << "#ifdef __cplusplus\n";
code_stream_ << const_arr_name << "\n\n";
code_stream_ << "#endif\n";
}
// Create the signature. For example, it could be:
// extern "C" void dnnl_0_(float* in0, float* in1, float* out0, float* out1) {}
code_stream_ << "extern \"C\" void " << ext_func_id << "_(";
// void dnnl_0_(float* in0, float* in1, float* out0, float* out1) {}
code_stream_ << "void " << ext_func_id << "_(";

for (const auto& arg : args) {
const auto& dtype_str = GetDtypeString(arg);
Expand Down Expand Up @@ -235,14 +301,14 @@ class CodegenCBase {
continue;
}
this->PrintIndents();
code_stream_ << "std::memcpy(out" << i << ", " << outs[i].name << ", 4 * " << outs[i].size
code_stream_ << "memcpy(out" << i << ", " << outs[i].name << ", 4 * " << outs[i].size
<< ");\n";
}

// Free buffers
for (size_t i = 0; i < buf_decl.size(); i++) {
this->PrintIndents();
code_stream_ << "std::free(buf_" << i << ");\n";
code_stream_ << "free(buf_" << i << ");\n";
}

this->ExitScope();
Expand Down Expand Up @@ -310,7 +376,7 @@ class CodegenCBase {
* \return The created declaration
*/
std::string CreateNDArrayPool(const std::string& symbol) const {
return "Array<NDArray> " + symbol + "_consts;";
return "tvm::runtime::Array<tvm::runtime::NDArray> " + symbol + "_consts;";
}

/*!
Expand All @@ -322,7 +388,7 @@ class CodegenCBase {
* \return The created reference
*/
std::string CreateDataReference(const std::string& symbol, int const_id) const {
return "static_cast<float*>(" + symbol + "_consts[" + std::to_string(const_id) + "]->data)";
return "(float*)(" + symbol + "_consts[" + std::to_string(const_id) + "]->data)";
}

/*!
Expand Down
140 changes: 140 additions & 0 deletions tests/micro/qemu/test_zephyr.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@

from tvm.micro.contrib import zephyr
from tvm.contrib import utils
from tvm.relay.expr_functor import ExprMutator
from tvm.relay.op.annotation import compiler_begin, compiler_end

BUILD = True
DEBUG = False
Expand Down Expand Up @@ -198,5 +200,143 @@ def test_relay(platform):
tvm.testing.assert_allclose(result, x_in * x_in + 1)


class CcompilerAnnotator(ExprMutator):
    """
    This is used to create external functions for ccompiler.
    A simple annotator that creates the following program:
    |
    -- begin --
    |
    add
    |
    subtract
    |
    multiply
    |
    -- end --
    |
    """

    def __init__(self):
        super(CcompilerAnnotator, self).__init__()
        # State machine for the traversal (the outermost call of the
        # multiply <- subtract <- add chain is visited first):
        #   0 - outside an offloadable region
        #   1 - inside a multiply's operand chain; begins not yet placed
        #   2 - begins were placed at the add; the multiply should place the end
        self.in_compiler = 0

    def visit_call(self, call):
        if call.op.name == "add":  # Annotate begin at args
            if self.in_compiler == 1:
                lhs = compiler_begin(super().visit(call.args[0]), "ccompiler")
                rhs = compiler_begin(super().visit(call.args[1]), "ccompiler")
                op = relay.add(lhs, rhs)
                # Signal the enclosing multiply that begins were emitted.
                self.in_compiler = 2
                return op
        elif call.op.name == "subtract":
            if self.in_compiler == 1:
                lhs = super().visit(call.args[0])
                rhs = super().visit(call.args[1])
                # Free variables entering the region also need a begin marker.
                if isinstance(lhs, relay.expr.Var):
                    lhs = compiler_begin(lhs, "ccompiler")
                if isinstance(rhs, relay.expr.Var):
                    rhs = compiler_begin(rhs, "ccompiler")
                return relay.subtract(lhs, rhs)
        elif call.op.name == "multiply":  # Annotate end at output
            self.in_compiler = 1
            lhs = super().visit(call.args[0])
            rhs = super().visit(call.args[1])
            if isinstance(lhs, relay.expr.Var):
                lhs = compiler_begin(lhs, "ccompiler")
            if isinstance(rhs, relay.expr.Var):
                rhs = compiler_begin(rhs, "ccompiler")
            op = relay.multiply(lhs, rhs)
            # Close the region only if the matching begins were placed.
            if self.in_compiler == 2:
                op = compiler_end(op, "ccompiler")
            self.in_compiler = 0
            return op
        return super().visit_call(call)


def check_result(relay_mod, model, zephyr_board, map_inputs, out_shape, result):
    """Build a Relay module for micro, run it on a Zephyr board, and verify outputs.

    Parameters
    ----------
    relay_mod : tvm.IRModule
        The (possibly partitioned) module to build and run.
    model : str
        Model name passed to tvm.target.target.micro.
    zephyr_board : str
        The Zephyr board identifier to run on.
    map_inputs : dict
        Mapping of input names to numpy arrays.
    out_shape : tuple or list of tuple
        Expected output shape(s).
    result : numpy.ndarray or list of numpy.ndarray
        Expected output value(s), compared with rtol/atol of 1e-5.
    """
    TOL = 1e-5
    target = tvm.target.target.micro(model)
    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        graph, mod, params = tvm.relay.build(relay_mod, target=target)

    with _make_session(model, target, zephyr_board, mod) as session:
        rt_mod = tvm.micro.create_local_graph_runtime(
            graph, session.get_system_lib(), session.context
        )
        # Bind parameters and inputs exactly once (the original called
        # set_input(**params) twice, redundantly).
        rt_mod.set_input(**params)
        for name, data in map_inputs.items():
            rt_mod.set_input(name, data)
        rt_mod.run()

        # Normalize single-output vs multi-output cases to lists.
        out_shapes = out_shape if isinstance(out_shape, list) else [out_shape]
        results = result if isinstance(result, list) else [result]

        for idx, shape in enumerate(out_shapes):
            out = tvm.nd.empty(shape, ctx=session.context)
            out = rt_mod.get_output(idx, out)
            tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=TOL, atol=TOL)


def test_byoc_utvm(platform):
    """This is a simple test case to check BYOC capabilities of uTVM"""
    model, zephyr_board = PLATFORMS[platform]

    # One shared input plus eight weight tensors, all (10, 10).
    x = relay.var("x", shape=(10, 10))
    weights = [relay.var("w{}".format(i), shape=(10, 10)) for i in range(8)]

    # Two add -> subtract -> multiply chains, offloaded to the C compiler.
    q0 = relay.multiply(relay.subtract(relay.add(x, weights[0]), weights[1]), weights[2])
    q1 = relay.multiply(relay.subtract(relay.add(x, weights[3]), weights[4]), weights[5])

    # An add -> subtract chain that stays on TVM.
    q2 = relay.subtract(relay.add(x, weights[6]), weights[7])

    r = relay.concatenate((q0, q1, q2), axis=0)
    f = relay.Function([x] + weights, r)

    # Annotate the ccompiler regions, then partition and type-infer.
    mod = tvm.IRModule()
    mod["main"] = CcompilerAnnotator().visit(f)
    mod = tvm.relay.transform.PartitionGraph()(mod)
    mod = tvm.relay.transform.InferType()(mod)

    x_data = np.random.rand(10, 10).astype("float32")
    w_data = [np.random.rand(10, 10).astype("float32") for _ in range(8)]

    map_inputs = {"w{}".format(i): w_data[i] for i in range(8)}
    map_inputs["x"] = x_data

    # Reference result computed directly with numpy.
    expected = np.concatenate(
        (
            ((x_data + w_data[0]) - w_data[1]) * w_data[2],
            ((x_data + w_data[3]) - w_data[4]) * w_data[5],
            x_data + w_data[6] - w_data[7],
        ),
        axis=0,
    )
    check_result(
        relay_mod=mod,
        map_inputs=map_inputs,
        out_shape=(30, 10),
        result=expected,
        model=model,
        zephyr_board=zephyr_board,
    )


# Allow running this test file directly as a script; forwards any extra
# CLI arguments to pytest and exits with pytest's return code.
if __name__ == "__main__":
    sys.exit(pytest.main([os.path.dirname(__file__)] + sys.argv[1:]))