From 77274c7de85560dffec318a0aabc647ccd275eee Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Wed, 28 Jan 2026 17:25:40 +0000 Subject: [PATCH 1/6] version change Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 6 +++--- .../common/gemm/cublaslt_grouped_gemm.cu | 16 ++++++++-------- .../common/include/transformer_engine/gemm.h | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 35c4375cbe..a694052b15 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -102,8 +102,8 @@ std::vector> make_shapes(ShapeCase scase) { } void run_grouped_gemm_case(const TestParams& params) { -#if CUBLAS_VERSION < 130100 - GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.1+, but compile-time cuBLAS version is " +#if CUBLAS_VERSION < 130200 + GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.2+, but compile-time cuBLAS version is " << CUBLAS_VERSION << "."; #else if (getDeviceComputeCapability() < blackwellComputeCapability) { @@ -267,7 +267,7 @@ void run_grouped_gemm_case(const TestParams& params) { atol, rtol); } -#endif // CUBLAS_VERSION >= 130100 +#endif // CUBLAS_VERSION >= 130200 } class GroupedGemmTest : public ::testing::TestWithParam {}; diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index a1206474ea..773a9f9080 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -26,7 +26,7 @@ inline void CreateCublasHandle(cublasLtHandle_t *handle) { } // namespace -#if CUBLAS_VERSION >= 130100 +#if CUBLAS_VERSION >= 130200 namespace { @@ -543,12 +543,12 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT NVTE_API_CALL(nvte_grouped_gemm); using namespace transformer_engine; - // Grouped GEMM requires Blackwell (SM100) or 
newer and cuBLAS 13.1+ + // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.2+ const int current_device = cuda::current_device(); NVTE_CHECK(cuda::sm_arch(current_device) >= 100, "nvte_grouped_gemm requires Blackwell (SM100) or newer architecture."); - NVTE_CHECK(cuda::cublas_version() >= 130100, - "nvte_grouped_gemm requires cuBLAS 13.1+, but run-time cuBLAS version is ", + NVTE_CHECK(cuda::cublas_version() >= 130200, + "nvte_grouped_gemm requires cuBLAS 13.2+, but run-time cuBLAS version is ", cuda::cublas_version()); // Convert to internal types @@ -631,15 +631,15 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT kGroupedGemmCublasWorkspaceSize, stream)); } -#else // CUBLAS_VERSION < 130100 +#else // CUBLAS_VERSION < 130200 void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedTensor B, int transb, const NVTEGroupedTensor C, NVTEGroupedTensor D, const NVTETensor alpha, const NVTETensor beta, NVTETensor workspace_setup, NVTETensor workspace_cublas, NVTEGroupedMatmulConfig config, cudaStream_t stream) { - NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.1+, but compile-time cuBLAS version is ", - CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer."); + NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.2+, but compile-time cuBLAS version is ", + CUBLAS_VERSION, ". Please upgrade to CUDA 13.2 or newer."); } -#endif // CUBLAS_VERSION >= 130100 +#endif // CUBLAS_VERSION >= 130200 diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 1afc9828e8..92713a5ba3 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -299,7 +299,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor /* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */ /*! 
\brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C * - * \note Requires cuBLAS 13.1+ (CUDA 13.1+) and Blackwell (SM100) or newer GPU architecture. + * \note Requires cuBLAS 13.2+ (CUDA 13.2+) and Blackwell (SM100) or newer GPU architecture. * Will error at runtime if compiled with an older cuBLAS version or run on * a pre-Blackwell GPU. * @@ -322,7 +322,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * \param[in] stream CUDA stream for the operation. * * Requirements: - * - cuBLAS 13.1+ (CUDA 13.1+) + * - cuBLAS 13.2+ (CUDA 13.2+) * - Blackwell (SM100) or newer GPU architecture * - A, B, C (if provided), D must have the same num_tensors * - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i] From e034376517f8f8ee8c0687035ca584a3587ac7ab Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Wed, 28 Jan 2026 23:58:55 +0000 Subject: [PATCH 2/6] fix Signed-off-by: Pawel Gadzinski --- transformer_engine/common/gemm/cublaslt_grouped_gemm.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 773a9f9080..899e354c18 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -544,12 +544,12 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT using namespace transformer_engine; // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.2+ - const int current_device = cuda::current_device(); - NVTE_CHECK(cuda::sm_arch(current_device) >= 100, + const int current_device = transformer_engine::cuda::current_device(); + NVTE_CHECK(transformer_engine::cuda::sm_arch(current_device) >= 100, "nvte_grouped_gemm requires Blackwell (SM100) or newer architecture."); - NVTE_CHECK(cuda::cublas_version() >= 130200, + 
NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 130200, "nvte_grouped_gemm requires cuBLAS 13.2+, but run-time cuBLAS version is ", - cuda::cublas_version()); + transformer_engine::cuda::cublas_version()); // Convert to internal types const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A); From 12f7e8ffbde6614382b15312780a4d7536b6efde Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Thu, 29 Jan 2026 17:28:20 +0000 Subject: [PATCH 3/6] fix Signed-off-by: Pawel Gadzinski --- .../common/gemm/cublaslt_gemm.cu | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index e4e97abd91..c58c3cb47a 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -494,9 +494,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, #endif // CUBLAS_VERSION >= 120800 } else if (mxfp8_gemm) { #if CUBLAS_VERSION >= 120800 - NVTE_CHECK(cuda::cublas_version() >= 120800, + NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120800, "MXFP8 requires cuBLAS 12.8+, but run-time cuBLAS version is ", - cuda::cublas_version()); + transformer_engine::cuda::cublas_version()); // Check that scales are in expected format NVTE_CHECK(inputA->with_gemm_swizzled_scales, @@ -518,7 +518,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, // Workaround for heuristic cache bug in cublasLt. This separates the MXFP8 cache key from non-block scaling. // CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is unused for block scaling so it's safe to set. 
- if (cuda::cublas_version() <= 120803) { + if (transformer_engine::cuda::cublas_version() <= 120803) { const int64_t dummy_a_vec_stride = 1; NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( operationDesc, CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE, &dummy_a_vec_stride, @@ -530,9 +530,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, #endif // CUBLAS_VERSION >= 120800 } else if (use_fp4) { // NVFP4 GEMM #if CUBLAS_VERSION >= 120800 - NVTE_CHECK(cuda::cublas_version() >= 120800, + NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120800, "FP4 requires cuBLAS 12.8+, but run-time cuBLAS version is ", - cuda::cublas_version()); + transformer_engine::cuda::cublas_version()); // Check that scales are in expected format NVTE_CHECK(inputA->with_gemm_swizzled_scales, @@ -567,9 +567,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, (inputB->scaling_mode == NVTE_BLOCK_SCALING_1D || inputB->scaling_mode == NVTE_BLOCK_SCALING_2D)) { #if CUBLAS_VERSION >= 120900 - NVTE_CHECK(cuda::cublas_version() >= 120900, + NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120900, "FP8 block scaling requires cuBLAS 12.9+, but run-time cuBLAS version is ", - cuda::cublas_version()); + transformer_engine::cuda::cublas_version()); // Check that matrix formats are valid NVTE_CHECK((!(inputA->scaling_mode == NVTE_BLOCK_SCALING_2D && @@ -602,7 +602,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, } #if CUBLAS_VERSION >= 120800 - if (cuda::cublas_version() >= 120800) { + if (transformer_engine::cuda::cublas_version() >= 120800) { NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_A_SCALE_MODE, &scaling_mode_a, sizeof(scaling_mode_a))); @@ -619,7 +619,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( operationDesc, CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, &D_amax, 
sizeof(D_amax))); #if CUBLAS_VERSION >= 120800 - if (cuda::cublas_version() >= 120800) { + if (transformer_engine::cuda::cublas_version() >= 120800) { // NOTE: In all current cases where FP8 output is supported, the input is // scaled identically to the output. NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, @@ -703,12 +703,14 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, "Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but compile-time cuBLAS version is ", CUBLAS_VERSION); #else - NVTE_CHECK(cuda::cudart_version() >= 12020 && cuda::cudart_version() < 13000, + NVTE_CHECK(transformer_engine::cuda::cudart_version() >= 12020 && + transformer_engine::cuda::cudart_version() < 13000, "Atomic GEMM requires CUDA >=12.2.0 and <13.0.0, but run-time CUDA version is ", - cuda::cudart_version()); - NVTE_CHECK(cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000, + transformer_engine::cuda::cudart_version()); + NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120205 && + transformer_engine::cuda::cublas_version() < 130000, "Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but run-time cuBLAS version is ", - cuda::cublas_version()); + transformer_engine::cuda::cublas_version()); if (m_split == 0) m_split = 1; if (n_split == 0) n_split = 1; NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( @@ -934,9 +936,10 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor "Atomic GEMM requires CUDA version >=12.2.0 and <13.0.0, but run-time CUDA version is ", transformer_engine::cuda::cudart_version()); NVTE_CHECK( - cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000, + transformer_engine::cuda::cublas_version() >= 120205 && + transformer_engine::cuda::cublas_version() < 130000, "Atomic GEMM requires cuBLAS version >=12.2.5 and <13.0.0, but run-time cuBLAS version is ", - cuda::cublas_version()); + transformer_engine::cuda::cublas_version()); const Tensor *inputA = 
convertNVTETensorCheck(A); const Tensor *inputB = convertNVTETensorCheck(B); From 69773b3691eef90fc824b7b5f1b142d407c0c198 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Thu, 29 Jan 2026 23:15:25 +0000 Subject: [PATCH 4/6] fix Signed-off-by: Pawel Gadzinski --- transformer_engine/common/gemm/cublaslt_grouped_gemm.cu | 2 +- transformer_engine/common/include/transformer_engine/gemm.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 899e354c18..e4310d53f0 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -543,7 +543,7 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT NVTE_API_CALL(nvte_grouped_gemm); using namespace transformer_engine; - // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.2+ + // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.1+ const int current_device = transformer_engine::cuda::current_device(); NVTE_CHECK(transformer_engine::cuda::sm_arch(current_device) >= 100, "nvte_grouped_gemm requires Blackwell (SM100) or newer architecture."); diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 92713a5ba3..7403448722 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -299,7 +299,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor /* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */ /*! \brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C * - * \note Requires cuBLAS 13.2+ (CUDA 13.2+) and Blackwell (SM100) or newer GPU architecture. + * \note Requires cuBLAS 13.2+ (CUDA 13.1+) and Blackwell (SM100) or newer GPU architecture. 
* Will error at runtime if compiled with an older cuBLAS version or run on * a pre-Blackwell GPU. * @@ -322,7 +322,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * \param[in] stream CUDA stream for the operation. * * Requirements: - * - cuBLAS 13.2+ (CUDA 13.2+) + * - cuBLAS 13.2+ (CUDA 13.1+) * - Blackwell (SM100) or newer GPU architecture * - A, B, C (if provided), D must have the same num_tensors * - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i] From bd5de1162de59eab8026942e328b4e94c1fa9763 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Thu, 29 Jan 2026 23:22:54 +0000 Subject: [PATCH 5/6] fix Signed-off-by: Pawel Gadzinski --- transformer_engine/common/gemm/cublaslt_grouped_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index e4310d53f0..899e354c18 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -543,7 +543,7 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT NVTE_API_CALL(nvte_grouped_gemm); using namespace transformer_engine; - // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.1+ + // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.2+ const int current_device = transformer_engine::cuda::current_device(); NVTE_CHECK(transformer_engine::cuda::sm_arch(current_device) >= 100, "nvte_grouped_gemm requires Blackwell (SM100) or newer architecture."); From 3ae26d3be57592545912b4f2c525c50e34f8d5a0 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Fri, 30 Jan 2026 16:48:11 +0000 Subject: [PATCH 6/6] fix Signed-off-by: Pawel Gadzinski --- transformer_engine/common/gemm/cublaslt_grouped_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu 
b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 899e354c18..b3e216dc4f 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -639,7 +639,7 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT NVTETensor workspace_cublas, NVTEGroupedMatmulConfig config, cudaStream_t stream) { NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.2+, but compile-time cuBLAS version is ", - CUBLAS_VERSION, ". Please upgrade to CUDA 13.2 or newer."); + CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer."); } #endif // CUBLAS_VERSION >= 130200