src/codegen/codegen_cuda.cc (15 additions, 5 deletions)
@@ -34,6 +34,10 @@ std::string CodeGenCUDA::Finish() {
     decl_stream << "#include <cuda_fp16.h>\n";
   }
 
+  if (enable_int8_) {
+    decl_stream << "#include <sm_61_intrinsics.h>\n";
+  }
+
   return CodeGenC::Finish();
 }

@@ -81,13 +85,19 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*)
         os << "unsigned ";
       }
     }
-    if (t.bits() == 8 && t.lanes() == 4) {
-      // directly 4 8 bit int in integer.
-      os << "int"; return;
-    }
     switch (t.bits()) {
       case 8: {
-        if (!t.is_uint() && t.lanes() == 1) {
+        if (t.lanes() == 4) {
+          // directly 4 8 bit int in integer.
+          enable_int8_ = true;
+          os << "char4"; return;
+        } else if (t.lanes() == 8) {
+          enable_int8_ = true;
+          os << "int2"; return;
+        } else if (t.lanes() == 16) {
+          enable_int8_ = true;
+          os << "int4"; return;
+        } else if (!t.is_uint() && t.lanes() == 1) {
          os << "signed char"; break;
        } else {
          os << "char"; break;
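Note on the type mapping above: an int8x4 value is printed as CUDA's native char4, while int8x8 and int8x16 are carried as int2 and int4 (8 and 16 bytes), so vectorized int8 loads and stores map onto single aligned vector accesses; each of these paths also sets enable_int8_ so that Finish() emits the sm_61_intrinsics.h include. The kernel below is only a hand-written sketch of what code using the char4 carrier can look like; the kernel and buffer names are illustrative, not TVM's generated output.

// Illustrative sketch only (not TVM output): element-wise add-one over an
// int8x4 buffer carried as char4, one 4-byte vector load and store per thread.
extern "C" __global__ void add_one_int8x4(const char4* A, char4* B, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    char4 v = A[i];
    v.x += 1; v.y += 1; v.z += 1; v.w += 1;
    B[i] = v;
  }
}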
src/codegen/codegen_cuda.h (3 additions, 1 deletion)
@@ -20,7 +20,7 @@ class CodeGenCUDA final : public CodeGenC {
   void Init(bool output_ssa);
   void AddFunction(LoweredFunc f);
   std::string Finish();
-  bool need_include_path() { return enable_fp16_; }
+  bool need_include_path() { return (enable_fp16_ || enable_int8_); }
   // override behavior
   void VisitStmt_(const ir::For* op) final;
   void PrintStorageSync(const Call* op) final;
@@ -49,6 +49,8 @@ class CodeGenCUDA final : public CodeGenC {
   std::string vid_global_barrier_expect_;
   // whether enable fp16
   bool enable_fp16_{false};
+  // whether enable int8
+  bool enable_int8_{false};
 };
 
 }  // namespace codegen
src/codegen/opt/build_cuda_on.cc (0 additions, 3 deletions)
@@ -64,7 +64,6 @@ std::string FindCUDAIncludePath() {
 std::string NVRTCCompile(const std::string& code, bool include_path = false) {
   std::vector<std::string> compile_params;
   std::vector<const char*> param_cstrings{};
-  int num_options = 0;
   nvrtcProgram prog;
   cudaDeviceProp device_prop;
   std::string cc = "30";
@@ -78,13 +77,11 @@ std::string NVRTCCompile(const std::string& code, bool include_path = false) {
   }
 
   compile_params.push_back("-arch=compute_" + cc);
-  num_options++;
 
   if (include_path) {
     std::string include_option = "--include-path=" + FindCUDAIncludePath();
 
     compile_params.push_back(include_option);
-    num_options++;
   }
 
   for (const auto& string : compile_params) {
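The deleted num_options counter was redundant: the option strings already live in compile_params, so the count NVRTC needs can come straight from the container at the call site. A minimal sketch of that pattern, assuming a hypothetical helper around nvrtcCompileProgram (this is not the full NVRTCCompile body):

#include <nvrtc.h>
#include <string>
#include <vector>

// Sketch only: take the option count from the vector instead of maintaining
// a separate counter by hand.
nvrtcResult CompileWithOptions(nvrtcProgram prog,
                               const std::vector<std::string>& compile_params) {
  std::vector<const char*> param_cstrings;
  for (const auto& option : compile_params) {
    param_cstrings.push_back(option.c_str());
  }
  return nvrtcCompileProgram(prog, static_cast<int>(param_cstrings.size()),
                             param_cstrings.data());
}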
tests/python/unittest/test_codegen_cuda.py (63 additions, 1 deletion)
@@ -1,6 +1,7 @@
 import tvm
 import numpy as np
-from tvm.contrib.nvcc import have_fp16
+from tvm.contrib.nvcc import have_fp16, have_int8
+from tvm.contrib import nvcc
 
 def test_cuda_vectorize_add():
     num_thread = 8
@@ -11,6 +12,9 @@ def check_cuda(dtype, n, lanes):
         if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
             print("skip because gpu does not support fp16")
             return
+        if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
+            print("skip because gpu does not support int8")
+            return
         A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
         B = tvm.compute((n,), lambda i: A[i]+tvm.const(1, A.dtype), name='B')
         s = tvm.create_schedule(B.op)
@@ -27,6 +31,64 @@ def check_cuda(dtype, n, lanes):

     check_cuda("float32", 64, 2)
     check_cuda("float16", 64, 2)
+    check_cuda("int8", 64, 4)
+
+
+def test_cuda_multiply_add():
+    num_thread = 8
+    def check_cuda(dtype, n, lanes):
+        if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+        if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
+            print("skip because gpu does not support int8")
+            return
+        A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
+        B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
+        C = tvm.placeholder((n,), name='C', dtype="int32")
+        D = tvm.compute((n,),
+                        lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D')
+        s = tvm.create_schedule(D.op)
+        xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
+        s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
+        s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
+        fun = tvm.build(s, [A, B, C, D], "cuda")
+        np_a = np.random.randint(low=-128, high=127, size=(n,lanes))
+        np_b = np.random.randint(low=-128, high=127, size=(n,lanes))
+        np_c = np.random.randint(low=0, high=127, size=(n,))
+        np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
+        ctx = tvm.gpu(0)
+        a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
+        b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
+        c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
+        d = tvm.nd.empty((n,), D.dtype, ctx)
+        fun(a, b, c, d)
+        np.testing.assert_allclose(d.asnumpy(), np_d)
+    check_cuda("int8", 64, 4)
+
+def test_cuda_vectorize_load():
+    num_thread = 8
+    def check_cuda(dtype, n, lanes):
+        if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+        ctx = tvm.gpu(0)
+        A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
+        B = tvm.compute((n,), lambda i: A[i], name='B')
+        s = tvm.create_schedule(B.op)
+        bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
+        s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
+        s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
+        fun = tvm.build(s, [A, B], "cuda", name="vector_load")
+        np_a = np.random.randint(low=-128, high=127, size=(n,lanes))
+        a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
+        b = tvm.nd.empty((n,), B.dtype, ctx)
+        fun(a,b)
+        np.testing.assert_allclose(a.asnumpy(), b.asnumpy())
+    check_cuda("int8", 64, 8)
+    check_cuda("int8", 64, 16)
 
 if __name__ == "__main__":
     test_cuda_vectorize_add()
+    test_cuda_multiply_add()
+    test_cuda_vectorize_load()
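For reference, __dp4a (declared in sm_61_intrinsics.h and available on compute capability 6.1 and newer) treats each 32-bit operand as four signed 8-bit lanes, computes their dot product, and adds the 32-bit accumulator, which is what test_cuda_multiply_add checks against its NumPy reference; test_cuda_vectorize_load similarly exercises the int2/int4 carriers so each thread moves 8 or 16 int8 values with a single vector load and store. The kernel below is a hand-written sketch of the __dp4a semantics with hypothetical names, not the code TVM emits.

// Sketch only: one __dp4a per element, i.e. D[i] = dot(A8[i], B8[i]) + C[i],
// where A and B pack four signed 8-bit values per 32-bit word.
extern "C" __global__ void dp4a_sketch(const int* A, const int* B,
                                       const int* C, int* D, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    D[i] = __dp4a(A[i], B[i], C[i]);
  }
}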