From eb1294b7faa4f35b55884a21b0ffc1473cb048d3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 28 Jul 2025 14:37:57 -0700 Subject: [PATCH 1/3] Reapply "Pipe quantize kernel through `FusionExecutorCache` (#4760)" (#4854) This reverts commit d2688974a924d8b1c363d892e1aadca13002f0f2. --- csrc/fusion_segmenter.cpp | 10 ++--- csrc/ir/utils.cpp | 7 ++-- csrc/ir/utils.h | 6 +-- csrc/ops/alias.cpp | 2 +- csrc/scheduler/matmul.cpp | 9 ++++ csrc/scheduler/normalization_inner_outer.cpp | 10 +++++ csrc/scheduler/normalization_utils.cpp | 9 ++++ csrc/scheduler/reduction.cpp | 10 +++++ csrc/scheduler/resize.cpp | 9 ++++ csrc/scheduler/transpose.cpp | 9 ++++ csrc/scheduler/utils.cpp | 2 +- runtime/helpers.cu | 20 +++++++++ tests/cpp/test_gpu3.cpp | 14 +++---- tests/cpp/test_low_precision_recipe.cpp | 44 +++++++++++++------- 14 files changed, 126 insertions(+), 35 deletions(-) diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index 55562144500..bce9375f26f 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3684,7 +3684,7 @@ class MergeUpAndDownCast { } bool isUpCast(SegmentedGroup* group) const { - if (auto precision_bits = getProducerConsumerPrecision(group); + if (auto precision_bits = getProducerConsumerPrecisionBit(group); precision_bits.has_value()) { return precision_bits->first < precision_bits->second; } else { @@ -3693,7 +3693,7 @@ class MergeUpAndDownCast { } bool isDownCast(SegmentedGroup* group) const { - if (auto precision_bits = getProducerConsumerPrecision(group); + if (auto precision_bits = getProducerConsumerPrecisionBit(group); precision_bits.has_value()) { return precision_bits->first > precision_bits->second; } else { @@ -3701,7 +3701,7 @@ class MergeUpAndDownCast { } } - std::optional> getProducerConsumerPrecision( + std::optional> getProducerConsumerPrecisionBit( SegmentedGroup* group) const { if (group->exprs().size() != 1) { return std::nullopt; @@ -3712,7 +3712,7 @@ class MergeUpAndDownCast { return std::nullopt; } - return ir_utils::getPrecisionOfProducerConsumerTensors(uop); + return ir_utils::getPrecisionOfProducerConsumerTensorsBit(uop); } private: @@ -4372,7 +4372,7 @@ void SegmentCandidateFinder::privatizeUpcast() { } auto precisions = - ir_utils::getPrecisionOfProducerConsumerTensors(maybe_upcast_op); + ir_utils::getPrecisionOfProducerConsumerTensorsBit(maybe_upcast_op); if (!precisions.has_value() || precisions->first >= precisions->second) { continue; } diff --git a/csrc/ir/utils.cpp b/csrc/ir/utils.cpp index 0e744fb9c49..1252dbd6c33 100644 --- a/csrc/ir/utils.cpp +++ b/csrc/ir/utils.cpp @@ -1551,8 +1551,8 @@ std::vector strideOrderToAllocation( return allocation_domain; } -std::optional> getPrecisionOfProducerConsumerTensors( - UnaryOp* uop) { +std::optional> +getPrecisionOfProducerConsumerTensorsBit(UnaryOp* uop) { NVF_CHECK(uop != nullptr); NVF_CHECK( uop->getUnaryOpType() == UnaryOpType::Cast, @@ -1577,8 +1577,7 @@ std::optional> getPrecisionOfProducerConsumerTensors } return std::make_pair( - primDataTypeSizeByte(*inp_prim_type), - primDataTypeSizeByte(*out_prim_type)); + primDataTypeSizeBit(*inp_prim_type), primDataTypeSizeBit(*out_prim_type)); } int64_t getTMemLdStVectorizeSize(TensorView* consumer_tv) { diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index 557024aaa68..d87413bb0e1 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -794,10 +794,10 @@ std::vector strideOrderToAllocation( const std::vector& logical_domain, const std::vector& stride_order); -// Returns the number of bytes of data types of the producer and +// Returns the number of bits of data types of the producer and // consumer tensors of a cast unary op -std::optional> getPrecisionOfProducerConsumerTensors( - UnaryOp* cast_op); +std::optional> +getPrecisionOfProducerConsumerTensorsBit(UnaryOp* cast_op); // Get the in the PTX instruction of TMem load/store: // tcgen05.st.sync.aligned.32x32b.x.b32 diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp index daa5e341de0..49b5f13e6c4 100644 --- a/csrc/ops/alias.cpp +++ b/csrc/ops/alias.cpp @@ -213,7 +213,7 @@ NVF_API TensorView* reshape( logical_domain, TensorDomain::getContiguityFilledWith(logical_domain, true)), x->getDataType().value()); - IrBuilder::create(x, out_tv); + IrBuilder::create(out_tv, x); return out_tv; } diff --git a/csrc/scheduler/matmul.cpp b/csrc/scheduler/matmul.cpp index ffc2aab228f..8023d7b5f48 100644 --- a/csrc/scheduler/matmul.cpp +++ b/csrc/scheduler/matmul.cpp @@ -26,6 +26,15 @@ namespace nvfuser { bool MatmulScheduler::canScheduleCompileTime(Fusion* fusion) { + for (auto tv : fusion->allTvs()) { + if (tv->dtype() != DataType::Index && + dataTypeSizeBit(tv->dtype()) % 8 != 0) { + scheduler_debug_utils::canScheduleRejectReason( + schedulerType(), "Does not support sub-byte data types."); + return false; + } + } + const auto msg = matmul_utils::getMatmulCompileTimeRejectReason(fusion); if (!msg.empty()) { scheduler_debug_utils::canScheduleRejectReason(schedulerType(), msg); diff --git a/csrc/scheduler/normalization_inner_outer.cpp b/csrc/scheduler/normalization_inner_outer.cpp index 255d325061c..3208439a8f7 100644 --- a/csrc/scheduler/normalization_inner_outer.cpp +++ b/csrc/scheduler/normalization_inner_outer.cpp @@ -222,6 +222,16 @@ bool InnerOuterPersistentKernelScheduler::canScheduleCompileTime( Fusion* fusion) { FUSER_PERF_SCOPE( "InnerOuterPersistentKernelScheduler::canScheduleCompileTime"); + + for (auto tv : fusion->allTvs()) { + if (tv->dtype() != DataType::Index && + dataTypeSizeBit(tv->dtype()) % 8 != 0) { + scheduler_debug_utils::canScheduleRejectReason( + schedulerType(), "Does not support sub-byte data types."); + return false; + } + } + // common checks for all persistent heuristics if (!normalization_scheduler_utils::checkOpsAndInputs( fusion, schedulerType())) { diff --git a/csrc/scheduler/normalization_utils.cpp b/csrc/scheduler/normalization_utils.cpp index 980ae52f440..19f8de31622 100644 --- a/csrc/scheduler/normalization_utils.cpp +++ b/csrc/scheduler/normalization_utils.cpp @@ -1191,6 +1191,15 @@ bool checkReductionPattern( // The identical compile time check of InnerPersistentKernelScheduler and // OuterPersistentKernelScheduler. bool compileTimeCheck(Fusion* fusion, SchedulerType scheduler_type) { + for (auto tv : fusion->allTvs()) { + if (tv->dtype() != DataType::Index && + dataTypeSizeBit(tv->dtype()) % 8 != 0) { + scheduler_debug_utils::canScheduleRejectReason( + scheduler_type, "Does not support sub-byte data types."); + return false; + } + } + // common checks for all persistent heuristics if (!normalization_scheduler_utils::checkOpsAndInputs( fusion, scheduler_type)) { diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index dc37c972f12..26bfc13e548 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -1650,6 +1650,16 @@ void scheduleReduction(Fusion* fusion, const ReductionParams* rparams) { //! Check if the reduction heuristics apply in given fusion bool ReductionScheduler::canScheduleCompileTime(Fusion* fusion) { FUSER_PERF_SCOPE("ReductionScheduler::canScheduleCompileTime"); + + for (auto tv : fusion->allTvs()) { + if (tv->dtype() != DataType::Index && + dataTypeSizeBit(tv->dtype()) % 8 != 0) { + scheduler_debug_utils::canScheduleRejectReason( + schedulerType(), "Does not support sub-byte data types."); + return false; + } + } + if (scheduler_utils::isResharding(fusion)) { scheduler_debug_utils::canScheduleRejectReason( schedulerType(), "Fusion is resharding."); diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp index 5ee96cdb48c..c6a1a6a6dcb 100644 --- a/csrc/scheduler/resize.cpp +++ b/csrc/scheduler/resize.cpp @@ -67,6 +67,15 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) { return false; } + for (auto tv : fusion->allTvs()) { + if (tv->dtype() != DataType::Index && + dataTypeSizeBit(tv->dtype()) % 8 != 0) { + scheduler_debug_utils::canScheduleRejectReason( + schedulerType(), "Does not support sub-byte data types."); + return false; + } + } + if (!scheduler_tools::hasResizeBasedOps(fusion)) { scheduler_debug_utils::canScheduleRejectReason( schedulerType(), "No resize op to schedule"); diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp index ce708680fa4..49155731e47 100644 --- a/csrc/scheduler/transpose.cpp +++ b/csrc/scheduler/transpose.cpp @@ -22,6 +22,15 @@ namespace nvfuser { bool TransposeScheduler::canScheduleCompileTime(Fusion* fusion) { FUSER_PERF_SCOPE("TransposeScheduler::canScheduleCompileTime"); + for (auto tv : fusion->allTvs()) { + if (tv->dtype() != DataType::Index && + dataTypeSizeBit(tv->dtype()) % 8 != 0) { + scheduler_debug_utils::canScheduleRejectReason( + schedulerType(), "Does not support sub-byte data types."); + return false; + } + } + if (scheduler_utils::isResharding(fusion)) { scheduler_debug_utils::canScheduleRejectReason( schedulerType(), "Fusion is resharding."); diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp index 025fa6d1b5d..fa78a01c55c 100644 --- a/csrc/scheduler/utils.cpp +++ b/csrc/scheduler/utils.cpp @@ -575,7 +575,7 @@ TensorView* getUpCastInputOf(const TensorView* tv) { return nullptr; } // skip if the cast is not upcast - auto precisions = ir_utils::getPrecisionOfProducerConsumerTensors(uop); + auto precisions = ir_utils::getPrecisionOfProducerConsumerTensorsBit(uop); if (!precisions.has_value() || precisions->first >= precisions->second) { return nullptr; } diff --git a/runtime/helpers.cu b/runtime/helpers.cu index 91805e2a1aa..b343d1225ea 100644 --- a/runtime/helpers.cu +++ b/runtime/helpers.cu @@ -106,6 +106,26 @@ __device__ float fmax(float a, float b) { } } +__device__ __half fmax(__half a, __half b) { + return __half2float(a) > __half2float(b) ? a : b; +} + +__device__ __bfloat fmax(__bfloat a, __bfloat b) { + return __bfloat2float(a) > __bfloat2float(b) ? a : b; +} + +__device__ float abs(float a) { + return fabs(a); +} + +__device__ __half abs(__half a) { + return __float2half(fabs(__half2float(a))); +} + +__device__ __bfloat abs(__bfloat a) { + return __float2bfloat(fabs(__bfloat2float(a))); +} + __device__ constexpr int min(int a, int b) { return a > b ? b : a; } diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index b9d7f324f40..211a768392e 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -8972,20 +8972,20 @@ TEST_F(NVFuserTest, CastPrecision) { auto tv4 = castOp(DataType::Int, tv3); fusion.addOutput(tv4); - auto tv1_precision = ir_utils::getPrecisionOfProducerConsumerTensors( + auto tv1_precision = ir_utils::getPrecisionOfProducerConsumerTensorsBit( tv1->definition()->as()); ASSERT_TRUE(tv1_precision.has_value()); - EXPECT_EQ(tv1_precision->first, 2); - EXPECT_EQ(tv1_precision->second, 4); + EXPECT_EQ(tv1_precision->first, 16); + EXPECT_EQ(tv1_precision->second, 32); - auto tv2_precision = ir_utils::getPrecisionOfProducerConsumerTensors( + auto tv2_precision = ir_utils::getPrecisionOfProducerConsumerTensorsBit( tv2->definition()->as()); ASSERT_TRUE(tv2_precision.has_value()); - EXPECT_EQ(tv2_precision->first, 4); - EXPECT_EQ(tv2_precision->second, 2); + EXPECT_EQ(tv2_precision->first, 32); + EXPECT_EQ(tv2_precision->second, 16); // Precision of type Index is not possible to determine until lowering - auto tv4_precision = ir_utils::getPrecisionOfProducerConsumerTensors( + auto tv4_precision = ir_utils::getPrecisionOfProducerConsumerTensorsBit( tv4->definition()->as()); ASSERT_FALSE(tv4_precision.has_value()); } diff --git a/tests/cpp/test_low_precision_recipe.cpp b/tests/cpp/test_low_precision_recipe.cpp index e4e213d8117..6dc175e0997 100644 --- a/tests/cpp/test_low_precision_recipe.cpp +++ b/tests/cpp/test_low_precision_recipe.cpp @@ -17,8 +17,6 @@ namespace nvfuser { -using FP4RecipeTest = NVFuserTest; - // Testing the following function: // https://github.com/pytorch/ao/blob/b1163dc63dfa22d403586672fd3648cd661c5003/torchao/prototype/mx_formats/nvfp4_tensor.py#L545-L617 // @@ -105,17 +103,17 @@ constexpr double F4_E2M1_MAX = 6.0; constexpr double E4M3_EPS = 0.015625; constexpr double F8E4M3_MAX = 448.0; -class NVFP4QuantizeTest : public FP4RecipeTest, +class NVFP4QuantizeTest : public BlackwellBase, public ::testing::WithParamInterface {}; TEST_P(NVFP4QuantizeTest, WithoutPerTensorAmax) { auto data_hp_dtype = GetParam(); - Fusion fusion; - FusionGuard fg(&fusion); + std::unique_ptr fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv_data_hp = makeContigTensor(2, data_hp_dtype); - fusion.addInput(tv_data_hp); + fusion->addInput(tv_data_hp); auto tv_data_hp_reshaped = reshape(tv_data_hp, [](auto& x) { x.split(-1, block_size); }); @@ -142,23 +140,32 @@ TEST_P(NVFP4QuantizeTest, WithoutPerTensorAmax) { tv_data_scaled, IrBuilder::create(-F4_E2M1_MAX, DataType::Float), IrBuilder::create(F4_E2M1_MAX, DataType::Float)); + auto tv_data_lp_fp4 = castOp(DataType::Float4_e2m1fn, tv_data_scaled_clamp); auto tv_data_lp = reshape(tv_data_lp_fp4, [](auto& x) { x.merge(-2); }); - fusion.addOutput(tv_block_scale_fp8); - fusion.addOutput(tv_data_lp); + fusion->addOutput(tv_block_scale_fp8); + fusion->addOutput(tv_data_lp); + + FusionExecutorCache fec(std::move(fusion)); + + std::vector inputs; + inputs.push_back( + at::randn({1024, 1024}, at::device(at::kCUDA).dtype(at::kFloat)) + .to(data_type_to_aten(data_hp_dtype))); + auto outputs = fec.runFusionWithInputs(inputs); } TEST_P(NVFP4QuantizeTest, WithPerTensorAmax) { auto data_hp_dtype = GetParam(); - Fusion fusion; - FusionGuard fg(&fusion); + std::unique_ptr fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv_data_hp = makeContigTensor(2, data_hp_dtype); auto tv_per_tensor_scale = makeContigTensor(0, DataType::Float); - fusion.addInput(tv_data_hp); - fusion.addInput(tv_per_tensor_scale); + fusion->addInput(tv_data_hp); + fusion->addInput(tv_per_tensor_scale); auto tv_data_hp_reshaped = reshape(tv_data_hp, [](auto& x) { x.split(-1, block_size); }); @@ -199,8 +206,17 @@ TEST_P(NVFP4QuantizeTest, WithPerTensorAmax) { auto tv_data_lp_fp4 = castOp(DataType::Float4_e2m1fn, tv_data_scaled_clamp); auto tv_data_lp = reshape(tv_data_lp_fp4, [](auto& x) { x.merge(-2); }); - fusion.addOutput(tv_scaled_block_scales_fp8); - fusion.addOutput(tv_data_lp); + fusion->addOutput(tv_scaled_block_scales_fp8); + fusion->addOutput(tv_data_lp); + + FusionExecutorCache fec(std::move(fusion)); + + std::vector inputs; + inputs.push_back( + at::randn({1024, 1024}, at::device(at::kCUDA).dtype(at::kFloat)) + .to(data_type_to_aten(data_hp_dtype))); + inputs.push_back(at::randn({}, at::device(at::kCUDA).dtype(at::kFloat))); + auto outputs = fec.runFusionWithInputs(inputs); } INSTANTIATE_TEST_SUITE_P( From 8c3bf0d877e24d9c3b6a9345cddd94ba0a83d086 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 28 Jul 2025 15:07:37 -0700 Subject: [PATCH 2/3] abs --- runtime/helpers.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runtime/helpers.cu b/runtime/helpers.cu index b343d1225ea..27c34ba0b3c 100644 --- a/runtime/helpers.cu +++ b/runtime/helpers.cu @@ -114,8 +114,9 @@ __device__ __bfloat fmax(__bfloat a, __bfloat b) { return __bfloat2float(a) > __bfloat2float(b) ? a : b; } -__device__ float abs(float a) { - return fabs(a); +template +__device__ T abs(T a) { + return a > 0 ? a : -a; } __device__ __half abs(__half a) { From e1bb076b19cb8aa0957f619638256ad4d4600a90 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 29 Jul 2025 09:52:35 -0700 Subject: [PATCH 3/3] propagate nan --- runtime/helpers.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/runtime/helpers.cu b/runtime/helpers.cu index 27c34ba0b3c..4f190af5203 100644 --- a/runtime/helpers.cu +++ b/runtime/helpers.cu @@ -107,11 +107,15 @@ __device__ float fmax(float a, float b) { } __device__ __half fmax(__half a, __half b) { - return __half2float(a) > __half2float(b) ? a : b; + auto a_float = __half2float(a); + auto b_float = __half2float(b); + return __float2half(fmax(a_float, b_float)); } __device__ __bfloat fmax(__bfloat a, __bfloat b) { - return __bfloat2float(a) > __bfloat2float(b) ? a : b; + auto a_float = __bfloat2float(a); + auto b_float = __bfloat2float(b); + return __float2bfloat(fmax(a_float, b_float)); } template