diff --git a/src/runtime/contrib/cutlass/fp16_group_gemm.cuh b/src/runtime/contrib/cutlass/fp16_group_gemm.cuh index a09051a86e79..cb26a0796d53 100644 --- a/src/runtime/contrib/cutlass/fp16_group_gemm.cuh +++ b/src/runtime/contrib/cutlass/fp16_group_gemm.cuh @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -36,7 +37,8 @@ void tvm_cutlass_group_gemm_impl(NDArray x, NDArray weight, NDArray indptr, NDAr NDArray out) { // Workspace is used for storing device-side group gemm arguments and cutlass internal workspace. // Recommened size is 4MB. - cudaStream_t stream = static_cast(TVMFFIEnvGetCurrentStream(kDLCUDA, x->device.device_id)); + cudaStream_t stream = + static_cast(TVMFFIEnvGetCurrentStream(kDLCUDA, x->device.device_id)); CHECK_EQ(x->ndim, 2); CHECK_EQ(weight->ndim, 3); CHECK_EQ(indptr->ndim, 1); @@ -47,7 +49,6 @@ void tvm_cutlass_group_gemm_impl(NDArray x, NDArray weight, NDArray indptr, NDAr int k = weight->shape[2]; float alpha = 1.0f; float beta = 0.0f; - cudaStream_t stream = static_cast(func().cast()); if (DataType(x->dtype) == DataType::Float(16)) { CHECK(DataType(weight->dtype) == DataType::Float(16)); diff --git a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu index 2745c0b1fc03..b9be378a9aff 100644 --- a/src/runtime/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu +++ b/src/runtime/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include #include