Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ on:
- 'src/**'
- 'include/**'
- 'CMakeLists.txt'
- 'apps/nccl/**'
- '.github/workflows/codeql-analysis.yml'
pull_request:
branches:
Expand All @@ -20,6 +21,7 @@ on:
- 'src/**'
- 'include/**'
- 'CMakeLists.txt'
- 'apps/nccl/**'
- '.github/workflows/codeql-analysis.yml'
schedule:
- cron: "30 1 * * 1"
Expand Down
14 changes: 11 additions & 3 deletions apps/nccl/src/allreduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,11 @@ struct NvlsAdapter {
#endif
{
using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
int nBlocks = nRanksPerNode;
cudaDeviceProp prop;
MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0));
// On GB200, the optimal number of blocks depends on the GPU issue rate +
// NVLink switch reduction capacity, which is 24 here
int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode;
Copy link

Copilot AI Dec 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The magic number 24 for nBlocks on GB200 lacks documentation. Consider adding a comment explaining why 24 blocks are optimal for compute capability 10.0, or referencing relevant documentation or performance tuning results.

Copilot uses AI. Check for mistakes.
int nThreadsPerBlock = 1024;
allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
nvlsOutChannels, channelInOffset, channelOutOffset,
Expand Down Expand Up @@ -331,7 +335,11 @@ mscclpp::Algorithm AllreducePacket::build() {

void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
std::unordered_map<std::string, std::shared_ptr<void>>&) {
nSwitchChannels_ = 8;
cudaDeviceProp prop;
MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0));
// On GB200, the optimal number of blocks depends on the GPU issue rate +
// NVLink switch reduction capacity, which is 24 here
nSwitchChannels_ = (prop.major == 10) ? 24 : 8;
this->conns_ = setupConnections(comm);
// setup semaphores
std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
Expand Down Expand Up @@ -680,4 +688,4 @@ mscclpp::Algorithm AllreduceNvlsPacket::build() {
return self->generateAllreduceContextKey(input, output, count, dtype);
});
return allreduceAlgo;
}
}
9 changes: 9 additions & 0 deletions apps/nccl/src/allreduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -821,7 +821,11 @@ __global__ void __launch_bounds__(1024, 1)
int nBlocks = gridDim.x;
int bid = blockIdx.x;
size_t sizePerRank = size / nRanksPerNode;
#if __CUDA_ARCH__ >= 1000
size_t sizePerBlock = (sizePerRank / nBlocks) / 16 * 16;
Copy link

Copilot AI Dec 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The magic number 16 is used for alignment without explanation. Consider adding a comment explaining that this is for 16-byte alignment optimization on GB200, or defining it as a named constant to improve code readability and maintainability.

Copilot uses AI. Check for mistakes.
#else
size_t sizePerBlock = sizePerRank / nBlocks;
#endif
size_t rankOffset = sizePerRank * rank;
size_t blockOffset = sizePerBlock * bid + rankOffset;
mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastPtr = multicast + bid;
Expand All @@ -842,6 +846,11 @@ __global__ void __launch_bounds__(1024, 1)
__syncthreads();
T* src = (T*)multicastPtr->mcPtr;
T* dst = (T*)multicastOutPtr->mcPtr;
#if __CUDA_ARCH__ >= 1000
if (bid == nBlocks - 1) {
sizePerBlock = sizePerRank - sizePerBlock * (nBlocks - 1);
}
#endif
handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock,
threadIdx.x, blockDim.x);
__syncthreads();
Expand Down
Loading