microsoft · mahdiehghazim · Dec 9, 2025 · Dec 9, 2025 · Dec 17, 2025 · Dec 17, 2025
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
@@ -10,6 +10,7 @@ on:
       - 'src/**'
       - 'include/**'
       - 'CMakeLists.txt'
+      - 'apps/nccl/**'
       - '.github/workflows/codeql-analysis.yml'
   pull_request:
     branches:
@@ -20,6 +21,7 @@ on:
       - 'src/**'
       - 'include/**'
       - 'CMakeLists.txt'
+      - 'apps/nccl/**'
       - '.github/workflows/codeql-analysis.yml'
   schedule:
     - cron: "30 1 * * 1"

diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu
@@ -79,7 +79,11 @@ struct NvlsAdapter {
 #endif
     {
       using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
-      int nBlocks = nRanksPerNode;
+      cudaDeviceProp prop;
+      MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0));
+      // On GB200, the optimal number of blocks depends on the GPU issue rate +
+      // NVLink switch reduction capacity, which is 24 here
+      int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode;
       int nThreadsPerBlock = 1024;
       allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
                                                               nvlsOutChannels, channelInOffset, channelOutOffset,
@@ -331,7 +335,11 @@ mscclpp::Algorithm AllreducePacket::build() {
 
 void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
-  nSwitchChannels_ = 8;
+  cudaDeviceProp prop;
+  MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0));
+  // On GB200, the optimal number of blocks depends on the GPU issue rate +
+  // NVLink switch reduction capacity, which is 24 here
+  nSwitchChannels_ = (prop.major == 10) ? 24 : 8;
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -680,4 +688,4 @@ mscclpp::Algorithm AllreduceNvlsPacket::build() {
         return self->generateAllreduceContextKey(input, output, count, dtype);
       });
   return allreduceAlgo;
-}
+}
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
@@ -821,7 +821,11 @@ __global__ void __launch_bounds__(1024, 1)
   int nBlocks = gridDim.x;
   int bid = blockIdx.x;
   size_t sizePerRank = size / nRanksPerNode;
+#if __CUDA_ARCH__ >= 1000
+  size_t sizePerBlock = (sizePerRank / nBlocks) / 16 * 16;
+#else
   size_t sizePerBlock = sizePerRank / nBlocks;
+#endif
   size_t rankOffset = sizePerRank * rank;
   size_t blockOffset = sizePerBlock * bid + rankOffset;
   mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastPtr = multicast + bid;
@@ -842,6 +846,11 @@ __global__ void __launch_bounds__(1024, 1)
   __syncthreads();
   T* src = (T*)multicastPtr->mcPtr;
   T* dst = (T*)multicastOutPtr->mcPtr;
+#if __CUDA_ARCH__ >= 1000
+  if (bid == nBlocks - 1) {
+    sizePerBlock = sizePerRank - sizePerBlock * (nBlocks - 1);
+  }
+#endif
   handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock,
                              threadIdx.x, blockDim.x);
   __syncthreads();