diff --git a/tests/python/direct_utils/narrow_precision.py b/tests/python/direct_utils/narrow_precision.py index 83da00226e0..6b2537d20b7 100644 --- a/tests/python/direct_utils/narrow_precision.py +++ b/tests/python/direct_utils/narrow_precision.py @@ -79,7 +79,7 @@ def linear_to_swizzled_128_4(a_sf_linear: torch.Tensor): # details about layout requirement on block-wise scaling factor # https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#scale-factor-layouts tmp = torch.reshape(a_sf_padded, (m_tiles, 4, 32, k_tiles, 4)) - return tmp.transpose(1, 3).reshape(mn_padded, k_padded)[:mn, :sf_k] + return tmp.transpose(1, 3).reshape(mn_padded, k_padded) def dequantize_to_dtype( @@ -168,7 +168,7 @@ def activation_scale_to_nvfp4(x, g_sf, offsets, blockscale_offsets, block_size): else: r = offsets[i + 1] l_sf = blockscale_offsets[i] - r_sf = l_sf + r - l + r_sf = l_sf + (r - l + 127) // 128 * 128 v, b_sf = pytorch_nvfp4_quantize(x[l:r], g_sf[i]) v_scaled[l:r] = v block_scale[l_sf:r_sf] = linear_to_swizzled_128_4(b_sf)