From cf5350658d21895cf25954228d34967724fa9d3f Mon Sep 17 00:00:00 2001 From: amirumoAMD Date: Tue, 16 Dec 2025 20:09:51 +0000 Subject: [PATCH 1/4] QR cap implemented to limit QR to prefill --- aiter/dist/device_communicators/communicator_cuda.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aiter/dist/device_communicators/communicator_cuda.py b/aiter/dist/device_communicators/communicator_cuda.py index fcc7ee05b2..21000f7a07 100644 --- a/aiter/dist/device_communicators/communicator_cuda.py +++ b/aiter/dist/device_communicators/communicator_cuda.py @@ -149,6 +149,8 @@ def all_reduce( qr_comm is not None and not qr_comm.disabled and qr_comm.should_quick_allreduce(input_) + and input_.shape[0] > 512 # input shape should be such that quick reduce will show benefits. + # input shape estimated at 2 * max concurrency for now. if performance issues, subject to change ): out = qr_comm.quick_all_reduce(input_) assert out is not None From b1d691ee4bd2cd44932c56370f70e32bdb83d8fb Mon Sep 17 00:00:00 2001 From: amirumoAMD Date: Wed, 17 Dec 2025 20:49:35 +0000 Subject: [PATCH 2/4] test git config --- aiter/dist/device_communicators/communicator_cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiter/dist/device_communicators/communicator_cuda.py b/aiter/dist/device_communicators/communicator_cuda.py index 21000f7a07..135241ca28 100644 --- a/aiter/dist/device_communicators/communicator_cuda.py +++ b/aiter/dist/device_communicators/communicator_cuda.py @@ -149,7 +149,7 @@ def all_reduce( qr_comm is not None and not qr_comm.disabled and qr_comm.should_quick_allreduce(input_) - and input_.shape[0] > 512 # input shape should be such that quick reduce will show benefits. + #test # input shape estimated at 2 * max concurrency for now. if performance issues, subject to change ): out = qr_comm.quick_all_reduce(input_) From a12324ceebdd3292d74d5537d92a54232dd56c7f Mon Sep 17 00:00:00 2001 From: amirumoAMD Date: Wed, 17 Dec 2025 20:50:35 +0000 Subject: [PATCH 3/4] Fix to genericize qr comm cap --- aiter/dist/device_communicators/communicator_cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiter/dist/device_communicators/communicator_cuda.py b/aiter/dist/device_communicators/communicator_cuda.py index 135241ca28..3edf90a1c6 100644 --- a/aiter/dist/device_communicators/communicator_cuda.py +++ b/aiter/dist/device_communicators/communicator_cuda.py @@ -149,7 +149,7 @@ def all_reduce( qr_comm is not None and not qr_comm.disabled and qr_comm.should_quick_allreduce(input_) - #test + and (input_.nelement() * input_.element_size()) > 4000000 # input shape should be such that quick reduce will show benefits. # input shape estimated at 2 * max concurrency for now. if performance issues, subject to change ): out = qr_comm.quick_all_reduce(input_) From b1aca3becc6a117df2b01d33f8ebd1840b7de22b Mon Sep 17 00:00:00 2001 From: amirumoAMD Date: Wed, 17 Dec 2025 21:05:14 +0000 Subject: [PATCH 4/4] Incorrect cap number --- aiter/dist/device_communicators/communicator_cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiter/dist/device_communicators/communicator_cuda.py b/aiter/dist/device_communicators/communicator_cuda.py index 3edf90a1c6..c7837376c8 100644 --- a/aiter/dist/device_communicators/communicator_cuda.py +++ b/aiter/dist/device_communicators/communicator_cuda.py @@ -149,7 +149,7 @@ def all_reduce( qr_comm is not None and not qr_comm.disabled and qr_comm.should_quick_allreduce(input_) - and (input_.nelement() * input_.element_size()) > 4000000 # input shape should be such that quick reduce will show benefits. + and (input_.nelement() * input_.element_size()) >= 4*1024*1024 # input shape should be such that quick reduce will show benefits. # input shape estimated at 2 * max concurrency for now. if performance issues, subject to change ): out = qr_comm.quick_all_reduce(input_)