diff --git a/README.md b/README.md index 580996b2..128f0e22 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ All tests support the same set of arguments : * `-m,--agg_iters <agg iteration count>` number of operations to aggregate together in each iteration. Default : 1. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation + * `-s,--setup_file <file>` Read parameters from file for tests that require it. Currently only required for alltoallv benchmark. Default : disabled. Max of 64 characters for filename. * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. diff --git a/paramfiles/alltoallv_paramfiles/Rank1Test1.csv b/paramfiles/alltoallv_paramfiles/Rank1Test1.csv new file mode 100644 index 00000000..d00491fd --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank1Test1.csv @@ -0,0 +1 @@ +1 diff --git a/paramfiles/alltoallv_paramfiles/Rank2Test1.csv b/paramfiles/alltoallv_paramfiles/Rank2Test1.csv new file mode 100644 index 00000000..6504ac95 --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank2Test1.csv @@ -0,0 +1,2 @@ +0.1,0.4 +0,0.5 diff --git a/paramfiles/alltoallv_paramfiles/Rank2Test2.csv b/paramfiles/alltoallv_paramfiles/Rank2Test2.csv new file mode 100644 index 00000000..adbb77f7 --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank2Test2.csv @@ -0,0 +1,2 @@ +0,1 +0,0 diff --git a/paramfiles/alltoallv_paramfiles/Rank3Test1.csv b/paramfiles/alltoallv_paramfiles/Rank3Test1.csv new file mode 100644 index 00000000..5e2bda46 --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank3Test1.csv @@ -0,0 +1,3 @@ +0.3,0.4,0.3 +0.2,0,0.8 +0.1,0.2,0.7 diff --git a/paramfiles/alltoallv_paramfiles/Rank3Test2.csv 
b/paramfiles/alltoallv_paramfiles/Rank3Test2.csv new file mode 100644 index 00000000..072e914f --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank3Test2.csv @@ -0,0 +1,3 @@ +0,0,1 +0,1,0 +1,0,0 diff --git a/paramfiles/alltoallv_paramfiles/Rank4Test1.csv b/paramfiles/alltoallv_paramfiles/Rank4Test1.csv new file mode 100644 index 00000000..9316fe3b --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank4Test1.csv @@ -0,0 +1,4 @@ +1,1,1,1 +1,1,1,1 +1,1,1,1 +1,1,1,1 diff --git a/paramfiles/alltoallv_paramfiles/Rank4Test2.csv b/paramfiles/alltoallv_paramfiles/Rank4Test2.csv new file mode 100644 index 00000000..9d31d094 --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank4Test2.csv @@ -0,0 +1,4 @@ +0.25,0.25,0.25,0.25 +0.50,0.50,0.50,0.50 +0.75,0.75,0.75,0.75 +1,1,1,1 diff --git a/paramfiles/alltoallv_paramfiles/Rank4Test3.csv b/paramfiles/alltoallv_paramfiles/Rank4Test3.csv new file mode 100644 index 00000000..46b8070e --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank4Test3.csv @@ -0,0 +1,4 @@ +1,1,1,1 +0.1,0.1,0.1,0.1 +0.1,0.1,0.1,0.1 +0.1,0.1,0.1,0.1 diff --git a/paramfiles/alltoallv_paramfiles/Rank4Test4.csv b/paramfiles/alltoallv_paramfiles/Rank4Test4.csv new file mode 100644 index 00000000..e43df533 --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank4Test4.csv @@ -0,0 +1,4 @@ +1,1,1,1 +0,0,0,0 +0,0,0,0 +0,0,0,0 diff --git a/paramfiles/alltoallv_paramfiles/Rank4Test5.csv b/paramfiles/alltoallv_paramfiles/Rank4Test5.csv new file mode 100644 index 00000000..2750e108 --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank4Test5.csv @@ -0,0 +1,4 @@ +0,0,0,0 +1,1,1,1 +1,1,1,1 +1,1,1,1 diff --git a/paramfiles/alltoallv_paramfiles/Rank4Test6.csv b/paramfiles/alltoallv_paramfiles/Rank4Test6.csv new file mode 100644 index 00000000..f81b129f --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank4Test6.csv @@ -0,0 +1,5 @@ +1,0,0,0 +1,0,0,0 +1,0,0,0 +1,0,0,0 + diff --git a/src/Makefile b/src/Makefile index 393de8e4..3ef4ebd6 100644 --- a/src/Makefile 
+++ b/src/Makefile @@ -76,7 +76,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall alltoallv scatter gather sendrecv hypercube BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/all_gather.cu b/src/all_gather.cu index 08312074..9031ed12 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -45,7 +45,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); return testSuccess; } diff --git a/src/all_reduce.cu b/src/all_reduce.cu index a38eabe0..3bdfb3b7 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -40,7 +40,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/alltoall.cu b/src/alltoall.cu index 41c7c4ae..b737dbb8 100644 --- a/src/alltoall.cu 
+++ b/src/alltoall.cu @@ -45,7 +45,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double *busBw = baseBw * factor; } -testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); size_t rankOffset = count * wordSize(type); diff --git a/src/alltoallv.cu b/src/alltoallv.cu new file mode 100644 index 00000000..6fb265f8 --- /dev/null +++ b/src/alltoallv.cu @@ -0,0 +1,170 @@ +#include <fstream> +#include <sstream> +#include <string> +#include <vector> +#include "cuda_runtime.h" +#include "common.h" + +int CHECK = 0; + +/** + * @brief Parses the parameter file and stores the matrix data into the imbalancingFactors reference passed in. + * @param nranks The number of ranks in the test + * @param imbalancingFactors The reference to the vector that will store the parsed data + * @param filename The name of the parameter file to parse +**/ +testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbalancingFactors, const char filename[PATH_MAX]){ + std::vector<std::vector<double>> paramFile_data; + std::ifstream paramFile(filename); + + if (!paramFile.is_open()) { + PRINT("\nUNABLE TO OPEN PARAMS FILE AT: %s\n", filename); + return testInternalError; + } + + std::string row; + int rowidx = 0; + while(std::getline(paramFile,row)){ //iterate over every row + std::vector<double> values; //values from this line + std::stringstream rowstream(row); + std::string value; + while(std::getline(rowstream,value,',')){ //go over the row and get each value + double dval = std::stod(value); + if(dval<0 || dval>1) { + PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE, ROW NUMBER: %i \n", rowidx); + return testInternalError; + } //ensure that the value is between 0 and 1 (necessary 
for probability distribution) + values.push_back(dval); + } + if(values.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES, HAS %lu ENTRIES, NEEDS %i ENTRIES\n", rowidx, values.size(), nranks); + return testInternalError; + }//ensure that this row has the right amount of values + paramFile_data.push_back(values); + rowidx++; + } + + if(paramFile_data.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS, HAS %i ROWS, NEEDS %i ROWS\n", paramFile_data.size(), nranks); + return testInternalError; + } //ensure we have the right amount of rows + + imbalancingFactors = paramFile_data; //store the data in the return variable + return testSuccess; +} +void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; //Total send count rounded to a multiple of ranks + *recvcount = (count/nranks)*nranks; //Total recv count rounded to a multiple of ranks + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = (count/nranks); //Each rank can send a maximum of count/nranks data to each other rank +} + +testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t maxchunk = args->nbytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + //parse the param file + std::vector<std::vector<double>> imbalancingFactors; + testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->setup_file); + if(parseSuccess != testSuccess) return parseSuccess; + for (int i=0; i<args->nGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, 
cudaMemcpyDefault)); //copies the zeroed out receive buffer to the expected buffer + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, maxchunk*nranks, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank. Should be chunk size * nranks + for (int j=0; j<nranks; j++) { + size_t partcount_mod = maxchunk*imbalancingFactors[j][rank]; + TESTCHECK(InitData((char*)args->expected[i] + j*maxchunk*wordSize(type), partcount_mod, rank*maxchunk, type, ncclSum, 33*rep + j, 1, 0)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + // We don't support in-place alltoallv + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { + int nRanks, myRank; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + NCCLCHECK(ncclCommUserRank(comm, &myRank)); + std::vector<std::vector<double>> imbalancingFactors; + testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->setup_file); //parse the param file + if(parseSuccess != testSuccess) return parseSuccess; + size_t rankOffset = count * wordSize(type); + +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for alltoallv. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else + NCCLCHECK(ncclGroupStart()); + for (int r=0; r<nRanks; r++) { + if (myRank>imbalancingFactors.size()){ + PRINT("\nmyRank is greater than imbalancingFactors.size(), %i\n", myRank); + return testInternalError; + } else if (r > imbalancingFactors[myRank].size()) { + PRINT("\nr is greater than imbalancingFactors[myRank].size(), %i\n", r); + return testInternalError; + } + unsigned long send_count_mod = count * imbalancingFactors[myRank][r]; + unsigned long recv_count_mod = count * imbalancingFactors[r][myRank]; + NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, send_count_mod, type, r, comm, stream)); + NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, recv_count_mod, type, r, comm, stream)); + } + + + NCCLCHECK(ncclGroupEnd()); + return testSuccess; +#endif +} + +struct testColl AlltoAllvTest = { + "AlltoAllV", + AlltoAllvGetCollByteCount, + AlltoAllvInitData, + AlltoAllvGetBw, + AlltoAllvRunColl +}; + +void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AlltoAllvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &AlltoAllvTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i<type_count; i++) { + TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1)); + } + return testSuccess; +} + +struct testEngine ncclTestEngine = { + AlltoAllvGetBuffSize, + AlltoAllvRunTest +}; + +#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 diff --git a/src/common.cu b/src/common.cu index 48a629ce..559f360d 100644 --- a/src/common.cu +++ b/src/common.cu @@ -59,6 +59,7 @@ int is_main_proc = 0; thread_local int is_main_thread = 0; // Command line parameter defaults +static char 
setup_file[PATH_MAX]; static int nThreads = 1; static int nGpus = 1; static size_t minBytes = 32*1024*1024; @@ -373,11 +374,10 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); } #endif - TESTCHECK(args->collTest->runColl( (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); + count, type, op, root, args->comms[i], args->streams[i], args)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) if(opIndex >= ncclNumOps) { @@ -685,6 +685,7 @@ int main(int argc, char* argv[]) { double parsed; int longindex; static struct option longopts[] = { + {"setup_file",optional_argument, 0, 's'}, {"nthreads", required_argument, 0, 't'}, {"ngpus", required_argument, 0, 'g'}, {"minbytes", required_argument, 0, 'b'}, @@ -711,12 +712,15 @@ int main(int argc, char* argv[]) { while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); + c = getopt_long(argc, argv, "s:t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); if (c == -1) break; switch(c) { + case 's': + strcpy(setup_file,optarg); + break; case 't': nThreads = strtol(optarg, NULL, 0); break; @@ -983,6 +987,8 @@ testResult_t run() { memset(threads, 0, sizeof(struct testThread)*nThreads); for (int t=nThreads-1; t>=0; t--) { + strcpy(threads[t].args.setup_file, setup_file); + threads[t].args.minbytes=minBytes; threads[t].args.maxbytes=maxBytes; threads[t].args.stepbytes=stepBytes; diff --git a/src/common.h b/src/common.h index 20fa4612..9bf2769c 100644 --- a/src/common.h +++ b/src/common.h @@ -92,7 +92,7 @@ struct testColl { ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t 
(*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; @@ -110,6 +110,8 @@ struct testEngine { extern struct testEngine ncclTestEngine; struct threadArgs { + char setup_file[PATH_MAX]; + size_t nbytes; size_t minbytes; size_t maxbytes; diff --git a/src/gather.cu b/src/gather.cu index 03ef4d9e..9f3c5ad0 100644 --- a/src/gather.cu +++ b/src/gather.cu @@ -43,7 +43,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/hypercube.cu b/src/hypercube.cu index 5c1456f8..8cb73ed7 100644 --- a/src/hypercube.cu +++ b/src/hypercube.cu @@ -45,7 +45,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { char* sbuff = (char*)sendbuff; char* rbuff = (char*)recvbuff; int nRanks; diff --git a/src/reduce.cu b/src/reduce.cu index f2fa80dd..6fed902c 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -39,7 +39,7 @@ void ReduceGetBw(size_t count, int 
typesize, double sec, double* algBw, double* *busBw = baseBw; } -testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream)); return testSuccess; } diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index ed372e3b..f23cbc24 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -44,7 +44,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d *busBw = baseBw * factor; } -testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/scatter.cu b/src/scatter.cu index 49d20e16..0644a52c 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -39,7 +39,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/sendrecv.cu b/src/sendrecv.cu index c9eb5bb4..8052b815 100644 --- 
a/src/sendrecv.cu +++ b/src/sendrecv.cu @@ -43,7 +43,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double *busBw = baseBw * factor; } -testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank;