From fd83f7ca84983b7b5ec1930a5cae546f0cf8bd7e Mon Sep 17 00:00:00 2001 From: Zhongqi An Date: Thu, 14 Nov 2024 11:30:42 +0800 Subject: [PATCH 01/26] tweak build and test scripts for SiCL. * add '/usr/local/sihpc/lib' to rpath. * print 'NVCC_GENCODE' in Makefile, and by default generate bin for Volta, Ampere, Ada, and Hopper. * add test run wrapper scripts "nccl_perf" and "nccl_test". --- scripts/nccl_perf | 27 +++++++++++++++++++++++++++ scripts/nccl_test | 36 ++++++++++++++++++++++++++++++++++++ src/Makefile | 12 ++++++++++-- 3 files changed, 73 insertions(+), 2 deletions(-) create mode 100755 scripts/nccl_perf create mode 100755 scripts/nccl_test diff --git a/scripts/nccl_perf b/scripts/nccl_perf new file mode 100755 index 00000000..b2193f3d --- /dev/null +++ b/scripts/nccl_perf @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +BASE_DIR=$(cd $(dirname $0) && pwd) +TEST_DIR=$BASE_DIR/../libexec/nccl-tests +COLL=all_reduce +while [[ -n $1 ]]; do + case $1 in + -l*) + case ${1:2} in + Broadcast|broadcast|Bcast|bcast) COLL=broadcast;; + Reduce|reduce) COLL=reduce;; + Gather|gather) COLL=gather;; + Scatter|scatter) COLL=scatter;; + AllToAll|alltoall) COLL=alltoall;; + AllGather|allgather) COLL=all_gather;; + ReduceScatter|reducescatter) COLL=reduce_scatter;; + HyperCube|Hypercube|hypercube) COLL=hypercube;; + SendRecv|sendrecv) COLL=sendrecv;; + esac;; + *) OPTIONS="$OPTIONS $1";; + esac + shift 1 +done +export OMPI_ALLOW_RUN_AS_ROOT=1 +export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 +export OMPI_MCA_btl=self,tcp +export OMPI_MCA_pml=^ucx +$TEST_DIR/${COLL}_perf$OPTIONS diff --git a/scripts/nccl_test b/scripts/nccl_test new file mode 100755 index 00000000..0738c573 --- /dev/null +++ b/scripts/nccl_test @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +TEST_DIR=$(cd $(dirname $0) && pwd) +COLL=all_reduce +while [[ -n $1 ]]; do + case $1 in + -l*) + case ${1:2} in + Broadcast|broadcast|Bcast|bcast) COLL=broadcast;; + Reduce|reduce) COLL=reduce;; + Gather|gather) COLL=gather;; + Scatter|scatter) 
COLL=scatter;; + AllToAll|alltoall) COLL=alltoall;; + AllGather|allgather) COLL=all_gather;; + ReduceScatter|reducescatter) COLL=reduce_scatter;; + HyperCube|Hypercube|hypercube) COLL=hypercube;; + SendRecv|sendrecv) COLL=sendrecv;; + esac;; + *) OPTIONS="$OPTIONS $1";; + esac + shift 1 +done + +export OMPI_ALLOW_RUN_AS_ROOT=1 +export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 +export OMPI_MCA_btl=self,tcp +export OMPI_MCA_pml=^ucx + +if [ $OMPI_COMM_WORLD_SIZE -gt $OMPI_COMM_WORLD_LOCAL_SIZE ]; then + if [ $OMPI_COMM_WORLD_RANK -eq 0 ]; then + export NCCL_DEBUG=${NCCL_DEBUG:-"INFO"} + fi +fi +if [ $OMPI_COMM_WORLD_RANK -eq 0 ]; then + echo "[$(hostname)] running nccl test $COLL$OPTIONS, world_size=$OMPI_COMM_WORLD_SIZE" +fi +$TEST_DIR/${COLL}_perf -f2$OPTIONS diff --git a/src/Makefile b/src/Makefile index 393de8e4..b4bb42a2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -19,7 +19,14 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) # Better define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. 
-ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0) +NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_80,code=sm_80 \ + -gencode=arch=compute_86,code=sm_86 \ + -gencode=arch=compute_89,code=sm_89 \ + -gencode=arch=compute_90,code=sm_90 \ + -gencode=arch=compute_90,code=compute_90 +else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 \ -gencode=arch=compute_70,code=sm_70 \ @@ -33,6 +40,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ -gencode=arch=compute_70,code=sm_70 \ -gencode=arch=compute_70,code=compute_70 endif +$(info NVCC_GENCODE is ${NVCC_GENCODE}) NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 CXXFLAGS := -std=c++11 @@ -101,5 +109,5 @@ ${DST_DIR}/timer.o: timer.cc timer.h ${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} - $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} -Xcompiler \"-Wl,-rpath,/usr/local/sihpc/lib\" From 395e3455025b43711957fe79bda287677337e1cb Mon Sep 17 00:00:00 2001 From: Zhongqi An Date: Thu, 14 Nov 2024 11:33:58 +0800 Subject: [PATCH 02/26] add ucommd get_bw for checking the perf results (busbw). * do check for AllReduce only. * disable option '-t', thus nThreads = 1 always. * message size, min+max bytes, timeouts, etc. are fed automatically. * support checking results when running in comm split mode. other changes: * try to get physical hostname via env 'NODE_NAME'. * check ib port state and print a log if not up nor active. * default stepFactor is changed to '2', datatype is changed to 'bf16'. 
--- src/Makefile | 7 +- src/common.cu | 48 +++++++--- src/common.h | 15 +-- src/ucommd.cc | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/ucommd.h | 45 +++++++++ 5 files changed, 343 insertions(+), 22 deletions(-) create mode 100644 src/ucommd.cc create mode 100644 src/ucommd.h diff --git a/src/Makefile b/src/Makefile index b4bb42a2..0a994b7f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -106,7 +106,12 @@ ${DST_DIR}/timer.o: timer.cc timer.h @mkdir -p ${DST_DIR} $(CXX) $(CXXFLAGS) -o $@ -c timer.cc -${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) +${DST_DIR}/ucommd.o: ucommd.cc ucommd.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(CXX) $(CXXFLAGS) -o $@ -c ucommd.cc + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o ${DST_DIR}/ucommd.o $(TEST_VERIFIABLE_OBJS) @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} -Xcompiler \"-Wl,-rpath,/usr/local/sihpc/lib\" diff --git a/src/common.cu b/src/common.cu index e1f8a85f..c8f323fb 100644 --- a/src/common.cu +++ b/src/common.cu @@ -14,6 +14,10 @@ #include "../verifiable/verifiable.h" +#include "ucommd.h" + +static Ucommd ucommd_; + int test_ncclVersion = 0; // init'd with ncclGetVersion() #if NCCL_MAJOR >= 2 @@ -64,14 +68,18 @@ static int nGpus = 1; static size_t minBytes = 32*1024*1024; static size_t maxBytes = 32*1024*1024; static size_t stepBytes = 1*1024*1024; -static size_t stepFactor = 1; +static size_t stepFactor = 2; static int datacheck = 1; static int warmup_iters = 5; static int iters = 20; static int agg_iters = 1; static int run_cycles = 1; static int ncclop = ncclSum; -static int nccltype = ncclFloat; +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) +static int nccltype = ncclBfloat16; +#else +static int nccltype = ncclHalf; +#endif static int ncclroot = 0; static int parallel_init = 0; static int blocking_coll = 0; @@ 
-709,11 +717,15 @@ int main(int argc, char* argv[]) { } #endif + nGpus = ucommd_.getNGpusPerProc(); + minBytes = maxBytes = ucommd_.getBytes(); + timeout = ucommd_.getTimeoutSec(); + // Parse args double parsed; int longindex; static struct option longopts[] = { - {"nthreads", required_argument, 0, 't'}, + //{"nthreads", required_argument, 0, 't'}, {"ngpus", required_argument, 0, 'g'}, {"minbytes", required_argument, 0, 'b'}, {"maxbytes", required_argument, 0, 'e'}, @@ -741,15 +753,16 @@ int main(int argc, char* argv[]) { while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); + //c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); + c = getopt_long(argc, argv, "g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:hG:C:a:R:", longopts, &longindex); if (c == -1) break; switch(c) { - case 't': - nThreads = strtol(optarg, NULL, 0); - break; + //case 't': + // nThreads = strtol(optarg, NULL, 0); + // break; case 'g': nGpus = strtol(optarg, NULL, 0); break; @@ -846,7 +859,7 @@ int main(int argc, char* argv[]) { default: if (c != 'h') printf("invalid option '%c'\n", c); printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" + // "[-t,--nthreads ] \n\t" "[-g,--ngpus ] \n\t" "[-b,--minbytes ] \n\t" "[-e,--maxbytes ] \n\t" @@ -919,8 +932,10 @@ testResult_t run() { #endif is_main_thread = is_main_proc = (proc == 0) ? 
1 : 0; - PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", - nThreads, nGpus, minBytes, maxBytes, +//PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", +// nThreads, nGpus, minBytes, maxBytes, + PRINT("# nGpus(perProc) %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", + nGpus, minBytes, maxBytes, (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches); if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); @@ -949,7 +964,9 @@ testResult_t run() { // Gather all output in rank order to root (0) MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD); if (proc == 0) { - for (int p = 0; p < totalProcs; p++) + //for (int p = 0; p < totalProcs; p++) + int stride = ucommd_.getLocalSize() > 0 ? ucommd_.getLocalSize() : 1; + for (int p = stride-1; p < totalProcs; p+=stride) PRINT("%s", lines+MAX_LINE*p); free(lines); } @@ -1123,11 +1140,14 @@ testResult_t run() { #endif envstr = getenv("NCCL_TESTS_MIN_BW"); - double check_avg_bw = envstr ? atof(envstr) : -1; +//double check_avg_bw = envstr ? atof(envstr) : -1; + double check_avg_bw = envstr ? atof(envstr) : + (!strcmp(threads[0].args.collTest->name, "AllReduce") && minBytes == maxBytes && minBytes >= ucommd_.getBytes()) ? ucommd_.getBw(nGpus) : -1; bw[0] /= bw_count[0]; PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK"); - PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK")); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw/**(0.9)*/ ? 
"FAILED" : "OK")); + if (bw[0] < check_avg_bw) PRINT("# Expected min bandwidth : %g\n", check_avg_bw); PRINT("#\n"); #ifdef MPI_SUPPORT MPI_Comm_free(&mpi_comm); @@ -1139,7 +1159,7 @@ testResult_t run() { // 'cuda-memcheck --leak-check full' requires this cudaDeviceReset(); - if (errors[0] || bw[0] < check_avg_bw*(0.9)) + if (errors[0] || bw[0] < check_avg_bw/**(0.9)*/) exit(EXIT_FAILURE); else exit(EXIT_SUCCESS); diff --git a/src/common.h b/src/common.h index e6762e1c..5bfcd388 100644 --- a/src/common.h +++ b/src/common.h @@ -94,12 +94,6 @@ struct testColl { testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); }; -extern struct testColl allReduceTest; -extern struct testColl allGatherTest; -extern struct testColl reduceScatterTest; -extern struct testColl broadcastTest; -extern struct testColl reduceTest; -extern struct testColl alltoAllTest; struct testEngine { void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); @@ -162,7 +156,14 @@ extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, voi #include static void getHostName(char* hostname, int maxlen) { - gethostname(hostname, maxlen); + const char* node_name = getenv("NODE_NAME"); + if (node_name && node_name[0]) { + strncpy(hostname, node_name, maxlen); + } else { + if (-1 == gethostname(hostname, maxlen)) { + strncpy(hostname, "unknown", 16); + } + } for (int i=0; i< maxlen; i++) { if (hostname[i] == '.') { hostname[i] = '\0'; diff --git a/src/ucommd.cc b/src/ucommd.cc new file mode 100644 index 00000000..d0c1bd6a --- /dev/null +++ b/src/ucommd.cc @@ -0,0 +1,250 @@ +/** + * Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "ucommd.h" + +Ucommd::Ucommd() { + (void)_check_multi_node_via_ompi(); + (void)_check_sys_nv_devices(); + (void)_check_sys_ib_devices(); + (void)_get_node_name(); +} + +Ucommd::~Ucommd() { + nvdevs_.clear(); + ibdevs_.clear(); +} + +void Ucommd::_check_multi_node_via_ompi() { + const auto world_size_env = std::getenv("OMPI_COMM_WORLD_SIZE"); + if (world_size_env == nullptr) return; + world_size_ = std::strtol(world_size_env, nullptr, 10); + const auto local_size_env = std::getenv("OMPI_COMM_WORLD_LOCAL_SIZE"); + if (local_size_env == nullptr) return; + local_size_ = std::strtol(local_size_env, nullptr, 10); + + // assume homogeneous mpirun + nnodes_ = world_size_ / local_size_; + is_multi_node_ = (nnodes_ > 1); +} + +void Ucommd::_check_sys_nv_devices() { + DIR* dir; + dir = opendir("/sys/bus/pci/drivers/nvidia"); + if (dir) { + struct dirent *entry; + while ((entry = readdir(dir))) { + if (entry->d_name[0] != '0') continue; + const auto nvdev = std::string(entry->d_name); + auto dev_class = std::ifstream( + std::string("/sys/bus/pci/drivers/nvidia/") + nvdev + "/class"); + if (dev_class.is_open()) { + char dclass[16] = {0}; + dev_class.getline(dclass, 16); + if (dev_class.good() && + (std::string("0x030200").compare(dclass) == 0 || + std::string("0x030000").compare(dclass) == 0)) { + nvdevs_.push_back(nvdev); + } + dev_class.close(); + } + } + closedir(dir); + std::sort(nvdevs_.begin(), nvdevs_.end()); + } +} + +void Ucommd::_check_sys_ib_devices() { + DIR* dir; + dir = opendir("/sys/class/infiniband"); + if (dir) { + struct dirent *entry; + while ((entry = readdir(dir))) { + if ((strcmp(entry->d_name, ".") == 0) || + (strcmp(entry->d_name, "..") == 0)) { + continue; + } + const auto ibdev = std::string(entry->d_name); + if ([&ibdev] { + bool is_ib = false; + auto node_type = std::ifstream( + std::string("/sys/class/infiniband/") + ibdev + "/node_type"); + if 
(node_type.is_open()) { + char ntype = node_type.get(); + if (node_type.good()) is_ib = '1' <= ntype && ntype <= '3'; + node_type.close(); + } + return is_ib; + }() && + [&ibdev] { + bool is_cx6 = false; + auto hca_type = std::ifstream( + std::string("/sys/class/infiniband/") + ibdev + "/hca_type"); + if (hca_type.is_open()) { + char htype[8] = {0}; + hca_type.getline(htype, 8); + if (hca_type.good()) { + is_cx6 = std::string("MT4123").compare(htype) == 0 || + std::string("MT4125").compare(htype) == 0 || + std::string("MT4129").compare(htype) == 0 || + std::string("MT4131").compare(htype) == 0 || + std::string("MT4124").compare(htype) == 0; + } + hca_type.close(); + } + return is_cx6; + }() && + [&ibdev, this] { + bool port_active = false; + auto port_state = std::ifstream( + std::string("/sys/class/infiniband/") + ibdev + "/ports/1/state"); + if (port_state.is_open()) { + char state = port_state.get(); + if (port_state.good()) { + port_active = state == '4'; + } + port_state.close(); + } + if (!port_active) { + printf("[%s] %s: port not active or unable to get port state\n", + node_name_.c_str(), ibdev.c_str()); + } + return port_active; + }() && + [&ibdev, this] { + bool link_up = false; + auto phys_state = std::ifstream( + std::string("/sys/class/infiniband/") + ibdev + "/ports/1/phys_state"); + if (phys_state.is_open()) { + char state = phys_state.get(); + if (phys_state.good()) { + link_up = state == '5'; + } + phys_state.close(); + } + if (!link_up) { + printf("[%s] %s: phys link not up or unable to get phys state\n", + node_name_.c_str(), ibdev.c_str()); + } + return link_up; + }()) { + ibdevs_.push_back(ibdev); + } + } + closedir(dir); + std::sort(ibdevs_.begin(), ibdevs_.end()); + } +} + +void Ucommd::_get_node_name() { + const auto node_name_env = std::getenv("NODE_NAME"); + if (node_name_env && node_name_env[0]) { + node_name_.assign(node_name_env); + } else { + char hostname[128] = {0}; + if (!gethostname(hostname, 128)) { + node_name_.assign(hostname); 
+ } else { + node_name_.assign("unknown"); + } + } +} + +int Ucommd::getLocalSize() const { + return local_size_; +} + +int Ucommd::getNGpusPerProc() const { + return (is_multi_node_ || local_size_ > 1) ? 1 : (int)nvdevs_.size(); +} + +size_t Ucommd::getBytes() const { + return !is_multi_node_ ? 1UL << 32 : + world_size_ > 1024 ? ((size_t)world_size_) << 24 : + local_size_ > 4 ? ((size_t)world_size_) << 25 : + local_size_ > 1 ? ((size_t)world_size_) << 26 : + ((size_t)world_size_) << 27; +} + +int Ucommd::getTimeoutSec() const { + return 600; +} + +int Ucommd::getBw(int ngpus) { + return is_multi_node_ ? get_ib_bw() : + (ngpus > 1 || local_size_ > 1) ? get_nvlink_bw() : -1; +} + +int Ucommd::get_nvlink_bw() { + int bw = -1; + if (!nvdevs_.empty()) { + auto dev_id = std::ifstream( + std::string("/sys/bus/pci/drivers/nvidia/") + nvdevs_.at(0) + "/device"); + if (dev_id.is_open()) { + char device[16] = {0}; + dev_id.getline(device, 16); + if (dev_id.good()) { + if (std::string("0x2330").compare(device) == 0) { + bw = 450 * 3 / 4; + } else + if (std::string("0x20b0").compare(device) == 0 || + std::string("0x20b2").compare(device) == 0 || + std::string("0x20b3").compare(device) == 0) { + bw = 300 * 2 / 3; + } else + if (std::string("0x20f3").compare(device) == 0 || + std::string("0x20bd").compare(device) == 0) { + bw = 200 * 2 / 3; + } + } + } + } + return bw; +} + +int Ucommd::get_ib_bw() { + int bw = -1; + if (!ibdevs_.empty()) { + int rate = 0; + auto port_rate = std::ifstream( + std::string("/sys/class/infiniband/") + ibdevs_.at(0) + "/ports/1/rate"); + if (port_rate.is_open()) { + char c; + while ((c = port_rate.get()) && ('0' <= c && c <= '9')) { + rate = rate * 10 + c - '0'; + } + port_rate.close(); + } + bw = rate * 3 / 32; + + // for DP AllReduce only ... + auto nnics = ibdevs_.size(); + if (local_size_ == 2) { + bw = nnics > 1 ? bw * 2 : bw; + } else + if (local_size_ == 4) { + bw = nnics > 3 ? bw * 4 : nnics > 1 ? 
bw * 2 : bw; + } else + if (local_size_ == 8) { + bw *= nnics; + const char* mask_env = getenv("NCCL_TESTS_SPLIT_MASK"); + if (mask_env) { + auto mask = std::strtol(mask_env, nullptr, 10); + if (mask == 7 || mask == 3 || mask == 1) { + bw /= (mask+1); + } // else ??? + } + } + } + return bw; +} diff --git a/src/ucommd.h b/src/ucommd.h new file mode 100644 index 00000000..92dc26a0 --- /dev/null +++ b/src/ucommd.h @@ -0,0 +1,45 @@ +/** + * Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved. + */ + +#ifndef __UCOMMD_H__ +#define __UCOMMD_H__ + +#include <string> +#include <vector> + +class Ucommd { + public: + Ucommd(); + ~Ucommd(); + + public: + int getLocalSize() const; + + int getNGpusPerProc() const; + int getTimeoutSec() const; + size_t getBytes() const; + + int getBw(int ngpus = -1); + + private: + int get_nvlink_bw(); + int get_ib_bw(); + + private: + void _check_multi_node_via_ompi(); + void _check_sys_nv_devices(); + void _check_sys_ib_devices(); + void _get_node_name(); + + private: + int world_size_ = -1; + int local_size_ = -1; + int nnodes_ = -1; + bool is_multi_node_ = false; + std::string node_name_; + std::vector<std::string> nvdevs_; + std::vector<std::string> ibdevs_; +}; + +#endif From 10a48eb2a95ad86ee9f94f68e98ea8f65b086600 Mon Sep 17 00:00:00 2001 From: Zhongqi An Date: Wed, 4 Dec 2024 14:08:35 +0800 Subject: [PATCH 03/26] fix: get NODE_NAME first during ucommd initialization. --- src/ucommd.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ucommd.cc b/src/ucommd.cc index d0c1bd6a..75d539de 100644 --- a/src/ucommd.cc +++ b/src/ucommd.cc @@ -14,10 +14,10 @@ #include "ucommd.h" Ucommd::Ucommd() { + (void)_get_node_name(); (void)_check_multi_node_via_ompi(); (void)_check_sys_nv_devices(); (void)_check_sys_ib_devices(); - (void)_get_node_name(); } Ucommd::~Ucommd() { From 38bf3cdc2bac9ba9f95e9a1aced0dfec37c11263 Mon Sep 17 00:00:00 2001 From: Zhongqi An Date: Wed, 4 Dec 2024 15:41:37 +0800 Subject: [PATCH 04/26] update copyright notice.
--- src/common.cu | 1 + src/ucommd.cc | 6 ++++-- src/ucommd.h | 6 ++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/common.cu b/src/common.cu index c8f323fb..bceedc35 100644 --- a/src/common.cu +++ b/src/common.cu @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/ucommd.cc b/src/ucommd.cc index 75d539de..1fed88fe 100644 --- a/src/ucommd.cc +++ b/src/ucommd.cc @@ -1,6 +1,8 @@ -/** +/************************************************************************* * Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved. - */ + * + * See LICENSE.txt for license information + ************************************************************************/ #include #include diff --git a/src/ucommd.h b/src/ucommd.h index 92dc26a0..ade79b13 100644 --- a/src/ucommd.h +++ b/src/ucommd.h @@ -1,6 +1,8 @@ -/** +/************************************************************************* * Copyright (c) 2024, Scitix Tech PTE. LTD. All rights reserved. - */ + * + * See LICENSE.txt for license information + ************************************************************************/ #ifndef __UCOMMD_H__ #define __UCOMMD_H__ From 1fb35de24ed71ff2f52c90d7b7127102864b3076 Mon Sep 17 00:00:00 2001 From: Zhongqi An Date: Wed, 5 Feb 2025 17:55:57 +0800 Subject: [PATCH 05/26] enlarge limit of opened file descriptors to run AllToAll test. 
--- scripts/nccl_test | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/nccl_test b/scripts/nccl_test index 0738c573..61e9019a 100755 --- a/scripts/nccl_test +++ b/scripts/nccl_test @@ -1,4 +1,5 @@ #!/usr/bin/env bash +ulimit -n $((1<<16)) TEST_DIR=$(cd $(dirname $0) && pwd) COLL=all_reduce while [[ -n $1 ]]; do From 351b3c1fd9c36ba58d2aff3bb3ec0869e6d71fdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=B0=8F=E4=B8=BD?= Date: Fri, 19 Dec 2025 10:05:04 +0800 Subject: [PATCH 06/26] add scripts --- scripts/env.sh | 6 ++++ scripts/install_sihpc | 69 +++++++++++++++++++++++++++++++++++++++++ scripts/uninstall_sihpc | 26 ++++++++++++++++ src/Makefile | 2 +- 4 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 scripts/env.sh create mode 100644 scripts/install_sihpc create mode 100644 scripts/uninstall_sihpc diff --git a/scripts/env.sh b/scripts/env.sh new file mode 100644 index 00000000..5e2610be --- /dev/null +++ b/scripts/env.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export SIHPC_HOME=/usr/local/sihpc +export PATH=$SIHPC_HOME/bin:$PATH +export LD_LIBRARY_PATH=$SIHPC_HOME/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH +export OMPI_MCA_opal_prefix=$SIHPC_HOME +export OPAL_PREFIX=$SIHPC_HOME \ No newline at end of file diff --git a/scripts/install_sihpc b/scripts/install_sihpc new file mode 100644 index 00000000..7ac717bb --- /dev/null +++ b/scripts/install_sihpc @@ -0,0 +1,69 @@ +#!/bin/bash +set -euo pipefail + +PREFIX="/usr/local/sihpc" +LIBCONF="/etc/ld.so.conf.d/sihpc.conf" +PROFILE_SH="/etc/profile.d/sihpc.sh" + +echo "Installing sihpc runtime to: $PREFIX" + +if [ ! -d "$PREFIX" ]; then + mkdir -p "$PREFIX" +fi +cp -r ./* "$PREFIX/" +echo "sihpc installed to $PREFIX" + +# register path to ld.so.conf.d +if [ ! -f "$LIBCONF" ]; then + echo "$PREFIX/lib" > "$LIBCONF" + [ -d "$PREFIX/lib64" ] && echo "$PREFIX/lib64" >> "$LIBCONF" + echo "Added $LIBCONF" + + echo "$PREFIX/lib" >> "$LIBCONF" + fi + if [ -d "$PREFIX/lib64" ] && ! 
grep -q "$PREFIX/lib64" "$LIBCONF"; then + echo "$PREFIX/lib64" >> "$LIBCONF" + fi + echo "Updated existing $LIBCONF" +fi + +# update ld.so cache +ldconfig +echo "ldconfig updated" + +# create profile.d for auto-loading +if [ ! -f "$PROFILE_SH" ]; then + cat <<EOF > "$PROFILE_SH" +# Auto-generated by sihpc installer +if [ -f $PREFIX/env.sh ]; then + source $PREFIX/env.sh +fi +EOF + chmod +x "$PROFILE_SH" + echo "Added $PROFILE_SH" +else + echo "$PROFILE_SH already exists, skipping." +fi + +# bash.bashrc / bashrc +for f in /etc/bash.bashrc /etc/bashrc; do + if [ -f "$f" ]; then + if ! grep -q "$PREFIX/env.sh" "$f"; then + { + echo "" + echo "# sihpc environment" + echo "if [ -f $PREFIX/env.sh ]; then" + echo " source $PREFIX/env.sh" + echo "fi" + } >> "$f" + echo "Added sihpc source to $f" + fi + fi +done + +echo +echo "sihpc installation completed successfully!" +echo "Installed to: $PREFIX" +echo "Library config: $LIBCONF" +echo "Auto env setup: $PROFILE_SH" +echo "Run 'source $PREFIX/env.sh' now to activate current shell." \ No newline at end of file diff --git a/scripts/uninstall_sihpc b/scripts/uninstall_sihpc new file mode 100644 index 00000000..a58be35d --- /dev/null +++ b/scripts/uninstall_sihpc @@ -0,0 +1,26 @@ +#!/bin/bash +# sihpc-uninstaller.sh + +set -e + +SIHPC_ROOT="/usr/local/sihpc" + +if [ ! -d "$SIHPC_ROOT" ]; then + echo "sihpc install dir $SIHPC_ROOT not exist." + exit 1 +fi + +echo "==============================" +echo " uninstall sihpc" +echo " install dir: $SIHPC_ROOT" +echo "==============================" + +echo "deleting $SIHPC_ROOT ..." +rm -rf "$SIHPC_ROOT" + +# 尝试清理 PATH 中的 sihpc/bin(如果用户在 .bashrc 中手动添加过,可以提示用户) +echo "please check shell config(~/.bashrc, ~/.zshrc etc.)," +echo "remove $SIHPC_ROOT/bin from PATH" + +echo "sihpc unintall done!"
+exit 0 \ No newline at end of file diff --git a/src/Makefile b/src/Makefile index 5cc38244..fbee8ffa 100644 --- a/src/Makefile +++ b/src/Makefile @@ -74,7 +74,7 @@ ${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX). @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) $^ -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable ${NVLDFLAGS} -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable" else -${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/util$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) +${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/util$(NAME_SUFFIX).o ${DST_DIR}/timer.o ${DST_DIR}/ucommd.o $(TEST_VERIFIABLE_OBJS) @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} From a922a05e603f151dadb5c33377897484b164f61f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=B0=8F=E4=B8=BD?= Date: Fri, 19 Dec 2025 20:13:29 +0800 Subject: [PATCH 07/26] update scripts --- scripts/install_sihpc | 19 ++++++++----------- scripts/uninstall_sihpc | 1 - 2 files changed, 8 insertions(+), 12 deletions(-) mode change 100644 => 100755 scripts/install_sihpc mode change 100644 => 100755 scripts/uninstall_sihpc diff --git a/scripts/install_sihpc b/scripts/install_sihpc old mode 100644 new mode 100755 index 7ac717bb..01e370ad --- a/scripts/install_sihpc +++ b/scripts/install_sihpc @@ -5,20 +5,20 @@ PREFIX="/usr/local/sihpc" LIBCONF="/etc/ld.so.conf.d/sihpc.conf" PROFILE_SH="/etc/profile.d/sihpc.sh" -echo "Installing sihpc runtime to: $PREFIX" +echo "Installing SiHPC runtime to: $PREFIX" if [ ! -d "$PREFIX" ]; then mkdir -p "$PREFIX" fi cp -r ./* "$PREFIX/" -echo "sihpc installed to $PREFIX" +echo "Files installed to $PREFIX" -# register path to ld.so.conf.d if [ ! 
-f "$LIBCONF" ]; then echo "$PREFIX/lib" > "$LIBCONF" [ -d "$PREFIX/lib64" ] && echo "$PREFIX/lib64" >> "$LIBCONF" echo "Added $LIBCONF" - +else + if ! grep -q "$PREFIX/lib" "$LIBCONF"; then echo "$PREFIX/lib" >> "$LIBCONF" fi if [ -d "$PREFIX/lib64" ] && ! grep -q "$PREFIX/lib64" "$LIBCONF"; then @@ -27,14 +27,12 @@ if [ ! -f "$LIBCONF" ]; then echo "Updated existing $LIBCONF" fi -# update ld.so cache ldconfig echo "ldconfig updated" -# create profile.d for auto-loading if [ ! -f "$PROFILE_SH" ]; then cat <<EOF > "$PROFILE_SH" -# Auto-generated by sihpc installer +# Auto-generated by SiHPC installer if [ -f $PREFIX/env.sh ]; then source $PREFIX/env.sh fi @@ -45,24 +43,23 @@ else echo "$PROFILE_SH already exists, skipping." fi -# bash.bashrc / bashrc for f in /etc/bash.bashrc /etc/bashrc; do if [ -f "$f" ]; then if ! grep -q "$PREFIX/env.sh" "$f"; then { echo "" - echo "# sihpc environment" + echo "# SiHPC environment" echo "if [ -f $PREFIX/env.sh ]; then" echo " source $PREFIX/env.sh" echo "fi" } >> "$f" - echo "Added sihpc source to $f" + echo "Added SiHPC source to $f" fi fi done echo -echo "sihpc installation completed successfully!" +echo "SiHPC installation completed successfully!" echo "Installed to: $PREFIX" echo "Library config: $LIBCONF" echo "Auto env setup: $PROFILE_SH" diff --git a/scripts/uninstall_sihpc b/scripts/uninstall_sihpc old mode 100644 new mode 100755 index a58be35d..20cabe24 --- a/scripts/uninstall_sihpc +++ b/scripts/uninstall_sihpc @@ -18,7 +18,6 @@ echo "==============================" echo "deleting $SIHPC_ROOT ..."
rm -rf "$SIHPC_ROOT" -# 尝试清理 PATH 中的 sihpc/bin(如果用户在 .bashrc 中手动添加过,可以提示用户) echo "please check shell config(~/.bashrc, ~/.zshrc etc.)," echo "remove $SIHPC_ROOT/bin from PATH" From 4aebbcbf931d081382c8d6a447f135140224c280 Mon Sep 17 00:00:00 2001 From: xlliu Date: Sun, 21 Dec 2025 10:15:30 +0800 Subject: [PATCH 08/26] add Dockerfile --- Dockerfile | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..12299f0c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,153 @@ +########################### +# Build-time configuration +########################### + +# Base OS and CUDA versions +ARG UBUNTU_VERSION=22.04 +ARG CUDA_VERSION=13.1.0 +ARG CUDART_VERSION=13.1.80 +ARG CUDART_MAJOR_VERSION=13 + +# NCCL versions +ARG NCCL_PACKAGE_VERSION=2.28.9-1+cuda13.0 +ARG NCCL_SO_VERSION=2.28.9 + +# OpenMPI versions +# - MPI_VERSION: full OpenMPI version +# - MPI_SERIES: major.minor series used in download URL +ARG MPI_VERSION=4.1.8 +ARG MPI_SERIES=4.1 + +# Build date (override at build time) +ARG BUILD_DATE=20251221 + +########################### +# Build Stage +########################### +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build + +# Re-declare build args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG CUDA_VERSION +ARG CUDART_VERSION +ARG NCCL_PACKAGE_VERSION +ARG NCCL_SO_VERSION +ARG MPI_VERSION +ARG MPI_SERIES +ARG BUILD_DATE + +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /workspace + +# ------------------------- +# 1. Base build dependencies +# ------------------------- +RUN apt-get -o Acquire::http::No-Cache=true update && \ + apt-get install -y --no-install-recommends \ + build-essential gcc g++ curl git wget ca-certificates \ + make automake autoconf libtool pkg-config \ + python3 python3-pip gzip xz-utils && \ + rm -rf /var/lib/apt/lists/* + +# ------------------------- +# 2. 
Install CUDA keyring and restore NVIDIA repository +# ------------------------- +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + apt-get update + +# ------------------------- +# 3. Install NCCL (pinned version) +# ------------------------- +RUN apt-get install -y --no-install-recommends \ + libnccl2=${NCCL_PACKAGE_VERSION} \ + libnccl-dev=${NCCL_PACKAGE_VERSION} && \ + ldconfig && \ + rm -rf /var/lib/apt/lists/* + +# ------------------------- +# 4. Build OpenMPI from source +# ------------------------- +RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ + tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ + cd openmpi-${MPI_VERSION} && \ + ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda && \ + make -j$(nproc) && make install + +# ------------------------- +# 5. Build nccl-tests +# ------------------------- +RUN cd /tmp && \ + git clone https://github.com/scitix/nccl-tests.git -b sync/upstream-20251216 && \ + cd nccl-tests && \ + make MPI=1 MPI_HOME=/usr/local/sihpc && \ + mkdir -p /usr/local/sihpc/libexec/nccl-tests && \ + cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \ + mkdir -p /usr/local/sihpc/bin && \ + cp scripts/nccl_perf /usr/local/sihpc/bin/nccl_perf && \ + cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \ + cp scripts/env.sh /usr/local/sihpc/env.sh && \ + cp scripts/install_sihpc /usr/local/sihpc/bin/install_sihpc && \ + cp scripts/uninstall_sihpc /usr/local/sihpc/bin/uninstall_sihpc + +# ------------------------- +# 6. 
Collect runtime libraries (strict selection) +# ------------------------- +RUN set -eux && \ + mkdir -p /usr/local/sihpc/lib && \ + cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ +# cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/; \ +# cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/; \ +# cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/; \ +# cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/; \ + +# ------------------------- +# 7. Fix library symlinks +# ------------------------- +RUN cd /usr/local/sihpc/lib && \ + rm -f libcudart.so libcudart.so.${CUDART_MAJOR_VERSION} && \ + ln -sf libnccl.so.${NCCL_SO_VERSION} libnccl.so.2 && \ + ln -sf libnccl.so.2 libnccl.so && \ + ln -sf libcudart.so.${CUDART_VERSION} libcudart.so.${CUDART_MAJOR_VERSION} && \ + ln -sf libcudart.so.${CUDART_MAJOR_VERSION} libcudart.so +# rm -f libevent_core-2.1.so.7 && \ +# ln -sf libhwloc.so.15.1.0 libhwloc.so.15 && \ +# ln -sf libhwloc.so.15.1.0 libhwloc.so && \ +# ln -sf libevent_core-2.1.so.7.0.0 libevent_core-2.1.so.7 && \ +# ln -sf libevent_core-2.1.so.7 libevent_core-2.1.so && \ +# ln -sf libevent_pthreads-2.1.so.7.0.0 libevent_pthreads-2.1.so.7 && \ +# ln -sf libevent_pthreads-2.1.so.7 libevent_pthreads-2.1.so && \ +# ln -sf libltdl.so.7.3.1 libltdl.so.7 && \ +# ln -sf libltdl.so.7 libltdl.so + +########################### +# Package Stage +########################### +FROM ubuntu:${UBUNTU_VERSION} AS package + +# Re-declare args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG NCCL_PACKAGE_VERSION +ARG MPI_VERSION +ARG BUILD_DATE + +# Expose versions/date as environment variables for runtime shell expansion +ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ + MPI_VERSION=${MPI_VERSION} \ + BUILD_DATE=${BUILD_DATE} + +WORKDIR /build +COPY --from=build /usr/local/sihpc /usr/local/sihpc + +WORKDIR /build +RUN apt-get update 
&& apt-get install -y --no-install-recommends makeself && \ + chmod +x /usr/local/sihpc/bin/install_sihpc && \ + SAFE_NCCL_PKG="${NCCL_PACKAGE_VERSION//+/-}" && \ + PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ + makeself --gzip /usr/local/sihpc \ + "${PACKAGE_FILENAME}" \ + "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ + ./bin/install_sihpc + +CMD ["bash", "-c", "SAFE_NCCL_PKG=${NCCL_PACKAGE_VERSION//+/-}; FILE=\"sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run\"; ls -lh \"/build/$FILE\" && echo 'Build complete.'"] \ No newline at end of file From 7dce185a165d63b040f8e190da3c4a2e87275bf4 Mon Sep 17 00:00:00 2001 From: xlliu Date: Sun, 21 Dec 2025 10:18:09 +0800 Subject: [PATCH 09/26] update Dockerfile --- Dockerfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 12299f0c..f71d72dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -137,10 +137,9 @@ ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ MPI_VERSION=${MPI_VERSION} \ BUILD_DATE=${BUILD_DATE} -WORKDIR /build COPY --from=build /usr/local/sihpc /usr/local/sihpc -WORKDIR /build +WORKDIR /dist RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ chmod +x /usr/local/sihpc/bin/install_sihpc && \ SAFE_NCCL_PKG="${NCCL_PACKAGE_VERSION//+/-}" && \ @@ -148,6 +147,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ makeself --gzip /usr/local/sihpc \ "${PACKAGE_FILENAME}" \ "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ - ./bin/install_sihpc + ./bin/install_sihpc && \ + rm -rf /usr/local/sihpc && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* CMD ["bash", "-c", "SAFE_NCCL_PKG=${NCCL_PACKAGE_VERSION//+/-}; FILE=\"sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run\"; ls -lh \"/build/$FILE\" && echo 'Build complete.'"] \ No 
newline at end of file From a7444d2ed89d8271fa182f67d25c8e414b7a25a7 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 03:12:56 +0000 Subject: [PATCH 10/26] chore(scripts): add install scripts --- scripts/env.sh | 6 ++++ scripts/install_sihpc | 66 +++++++++++++++++++++++++++++++++++++++++ scripts/uninstall_sihpc | 25 ++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 scripts/env.sh create mode 100644 scripts/install_sihpc create mode 100644 scripts/uninstall_sihpc diff --git a/scripts/env.sh b/scripts/env.sh new file mode 100644 index 00000000..5e2610be --- /dev/null +++ b/scripts/env.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export SIHPC_HOME=/usr/local/sihpc +export PATH=$SIHPC_HOME/bin:$PATH +export LD_LIBRARY_PATH=$SIHPC_HOME/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH +export OMPI_MCA_opal_prefix=$SIHPC_HOME +export OPAL_PREFIX=$SIHPC_HOME \ No newline at end of file diff --git a/scripts/install_sihpc b/scripts/install_sihpc new file mode 100644 index 00000000..01e370ad --- /dev/null +++ b/scripts/install_sihpc @@ -0,0 +1,66 @@ +#!/bin/bash +set -euo pipefail + +PREFIX="/usr/local/sihpc" +LIBCONF="/etc/ld.so.conf.d/sihpc.conf" +PROFILE_SH="/etc/profile.d/sihpc.sh" + +echo "Installing SiHPC runtime to: $PREFIX" + +if [ ! -d "$PREFIX" ]; then + mkdir -p "$PREFIX" +fi +cp -r ./* "$PREFIX/" +echo "Files installed to $PREFIX" + +if [ ! -f "$LIBCONF" ]; then + echo "$PREFIX/lib" > "$LIBCONF" + [ -d "$PREFIX/lib64" ] && echo "$PREFIX/lib64" >> "$LIBCONF" + echo "Added $LIBCONF" +else + if ! grep -q "$PREFIX/lib" "$LIBCONF"; then + echo "$PREFIX/lib" >> "$LIBCONF" + fi + if [ -d "$PREFIX/lib64" ] && ! grep -q "$PREFIX/lib64" "$LIBCONF"; then + echo "$PREFIX/lib64" >> "$LIBCONF" + fi + echo "Updated existing $LIBCONF" +fi + +ldconfig +echo "ldconfig updated" + +if [ ! 
-f "$PROFILE_SH" ]; then + cat <<EOF > "$PROFILE_SH" +# Auto-generated by SiHPC installer +if [ -f $PREFIX/env.sh ]; then + source $PREFIX/env.sh +fi +EOF + chmod +x "$PROFILE_SH" + echo "Added $PROFILE_SH" +else + echo "$PROFILE_SH already exists, skipping." +fi + +for f in /etc/bash.bashrc /etc/bashrc; do + if [ -f "$f" ]; then + if ! grep -q "$PREFIX/env.sh" "$f"; then + { + echo "" + echo "# SiHPC environment" + echo "if [ -f $PREFIX/env.sh ]; then" + echo " source $PREFIX/env.sh" + echo "fi" + } >> "$f" + echo "Added SiHPC source to $f" + fi + fi +done + +echo +echo "SiHPC installation completed successfully!" +echo "Installed to: $PREFIX" +echo "Library config: $LIBCONF" +echo "Auto env setup: $PROFILE_SH" +echo "Run 'source $PREFIX/env.sh' now to activate current shell." \ No newline at end of file diff --git a/scripts/uninstall_sihpc b/scripts/uninstall_sihpc new file mode 100644 index 00000000..20cabe24 --- /dev/null +++ b/scripts/uninstall_sihpc @@ -0,0 +1,25 @@ +#!/bin/bash +# sihpc-uninstaller.sh + +set -e + +SIHPC_ROOT="/usr/local/sihpc" + +if [ ! -d "$SIHPC_ROOT" ]; then + echo "sihpc install dir $SIHPC_ROOT not exist." + exit 1 +fi + +echo "==============================" +echo " uninstall sihpc" +echo " install dir: $SIHPC_ROOT" +echo "==============================" + +echo "deleting $SIHPC_ROOT ..." +rm -rf "$SIHPC_ROOT" + +echo "please check shell config(~/.bashrc, ~/.zshrc etc.)," +echo "remove $SIHPC_ROOT/bin from PATH" + +echo "sihpc unintall done!" 
+exit 0 \ No newline at end of file From cc230e61e1d2709252d314097c775f0a4e159ce2 Mon Sep 17 00:00:00 2001 From: xlliu Date: Sun, 21 Dec 2025 14:13:42 +0800 Subject: [PATCH 11/26] add github workflow --- .github/workflows/pre_check.yml | 29 ++++ .github/workflows/release.yml | 52 ++++++ docker/Dockerfile.cuda12.x.ubuntu20.04 | 153 ++++++++++++++++++ .../Dockerfile.cuda13.x.ubuntu22.04 | 38 +---- 4 files changed, 242 insertions(+), 30 deletions(-) create mode 100644 .github/workflows/pre_check.yml create mode 100644 .github/workflows/release.yml create mode 100644 docker/Dockerfile.cuda12.x.ubuntu20.04 rename Dockerfile => docker/Dockerfile.cuda13.x.ubuntu22.04 (83%) diff --git a/.github/workflows/pre_check.yml b/.github/workflows/pre_check.yml new file mode 100644 index 00000000..67bd7b4e --- /dev/null +++ b/.github/workflows/pre_check.yml @@ -0,0 +1,29 @@ +on: + pull_request: + workflow_dispatch: + +jobs: + build-only: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-buildx-action@v3 + - name: Free disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache + sudo docker system prune -af || true + df -h + - name: Build run package + run: | + docker buildx build \ + -f docker/Dockerfile.cuda13.x.ubuntu22.04 \ + --platform linux/amd64 \ + --target package \ + --output type=local,dest=dist/cuda13.x.ubuntu22.04 \ + . 
+ - name: List artifacts + run: | + ls -lh dist/cuda13.x.ubuntu22.04 \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..4cf8622b --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,52 @@ +name: release + +on: + push: + tags: + - "v*" + +permissions: + contents: write # permit uploading Release assets + +jobs: + build-and-release: + name: build-run (${{ matrix.name }}) + runs-on: ubuntu-22.04 + + strategy: + fail-fast: false + matrix: + include: + - name: cuda13-ubuntu22.04 + dockerfile: docker/Dockerfile.cuda13.ubuntu22.04 + + # - name: cuda12-ubuntu20.04 + # dockerfile: docker/Dockerfile.cuda12.ubuntu20.04 + + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build run package + run: | + BUILD_DATE=$(date +%Y%m%d) + + docker buildx build \ + --platform linux/amd64 \ + --progress=plain \ + -f ${{ matrix.dockerfile }} \ + --build-arg BUILD_DATE=${BUILD_DATE} \ + --output type=local,dest=dist/${{ matrix.name }} \ + . 
+ + echo "Produced files:" + ls -lh dist/${{ matrix.name }} + + - name: Upload run to GitHub Release + uses: softprops/action-gh-release@v2 + with: + files: | + dist/${{ matrix.name }}/*.run diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 new file mode 100644 index 00000000..efe1a627 --- /dev/null +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -0,0 +1,153 @@ +########################### +# Build-time configuration +########################### + +# Base OS and CUDA versions +ARG UBUNTU_VERSION=20.04 +ARG CUDA_VERSION=12.8.1 +ARG CUDART_VERSION=12.8.90 +ARG CUDART_MAJOR_VERSION=12 + +# NCCL versions +ARG NCCL_PACKAGE_VERSION=2.27.7-1+cuda12.4 +ARG NCCL_SO_VERSION=2.27.7 + +# OpenMPI versions +# - MPI_VERSION: full OpenMPI version +# - MPI_SERIES: major.minor series used in download URL +ARG MPI_VERSION=4.1.8 +ARG MPI_SERIES=4.1 + +# Build date (override at build time) +ARG BUILD_DATE=20251221 + +########################### +# Build Stage +########################### +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS build + +# Re-declare build args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG CUDA_VERSION +ARG CUDART_VERSION +ARG NCCL_PACKAGE_VERSION +ARG NCCL_SO_VERSION +ARG MPI_VERSION +ARG MPI_SERIES +ARG BUILD_DATE + +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /workspace + +# ------------------------- +# 1. Base build dependencies +# ------------------------- +RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list || true && \ + apt-get -o Acquire::http::No-Cache=true update && \ + apt-get install -y --no-install-recommends \ + build-essential gcc g++ curl git wget ca-certificates \ + make automake autoconf libtool pkg-config \ + python3 python3-pip gzip xz-utils && \ + rm -rf /var/lib/apt/lists/* + +# ------------------------- +# 2. 
Install CUDA keyring and restore NVIDIA repository +# ------------------------- +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + apt-get update + +# ------------------------- +# 3. Install NCCL (pinned version) +# ------------------------- +RUN apt-mark unhold libnccl2 libnccl-dev || true && \ + apt-get install -y --no-install-recommends \ + libnccl2=${NCCL_PACKAGE_VERSION} \ + libnccl-dev=${NCCL_PACKAGE_VERSION} && \ + apt-mark hold libnccl2 libnccl-dev && \ + ldconfig && \ + rm -rf /var/lib/apt/lists/* + +# ------------------------- +# 4. Build OpenMPI from source +# ------------------------- +RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ + tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ + cd openmpi-${MPI_VERSION} && \ + ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda && \ + make -j$(nproc) && make install && \ + rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz + +# ------------------------- +# 5. Build nccl-tests +# ------------------------- +RUN git clone https://github.com/scitix/nccl-tests.git -b sicl && \ + cd nccl-tests && \ + make MPI=1 MPI_HOME=/usr/local/sihpc && \ + mkdir -p /usr/local/sihpc/libexec/nccl-tests && \ + cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \ + cp scripts/nccl_perf /usr/local/sihpc/bin/nccl_perf && \ + cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \ + cp scripts/env.sh /usr/local/sihpc/env.sh && \ + cp scripts/install_sihpc /usr/local/sihpc/bin/install_sihpc && \ + cp scripts/uninstall_sihpc /usr/local/sihpc/bin/uninstall_sihpc && \ + rm -rf /workspace/nccl-tests + +# ------------------------- +# 6. 
Collect runtime libraries (strict selection) +# ------------------------- +RUN set -eux && \ + mkdir -p /usr/local/sihpc/lib && \ + cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ + # cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \ + # cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \ + # cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \ + # cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/ + +# ------------------------- +# 7. Fix library symlinks +# ------------------------- +RUN cd /usr/local/sihpc/lib && \ + rm -f libcudart.so libcudart.so.12 && \ + ln -sf libnccl.so.2.27.7 libnccl.so.2 && \ + ln -sf libnccl.so.2 libnccl.so && \ + ln -sf libcudart.so.12.8.90 libcudart.so.12 && \ + ln -sf libcudart.so.12 libcudart.so + # rm -f libevent_core-2.1.so.7 && \ + # ln -sf libhwloc.so.15.1.0 libhwloc.so.15 && \ + # ln -sf libhwloc.so.15.1.0 libhwloc.so && \ + # ln -sf libevent_core-2.1.so.7.0.0 libevent_core-2.1.so.7 && \ + # ln -sf libevent_core-2.1.so.7 libevent_core-2.1.so && \ + # ln -sf libevent_pthreads-2.1.so.7.0.0 libevent_pthreads-2.1.so.7 && \ + # ln -sf libevent_pthreads-2.1.so.7 libevent_pthreads-2.1.so && \ + # ln -sf libltdl.so.7.3.1 libltdl.so.7 && \ + # ln -sf libltdl.so.7 libltdl.so + +########################### +# Package Stage +########################### +FROM ubuntu:20.04 AS package + +# Re-declare args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG NCCL_PACKAGE_VERSION +ARG MPI_VERSION +ARG BUILD_DATE + +# Expose versions/date as environment variables for runtime shell expansion +ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ + MPI_VERSION=${MPI_VERSION} \ + BUILD_DATE=${BUILD_DATE} + +COPY --from=build /usr/local/sihpc /usr/local/sihpc + +WORKDIR / +RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ + chmod +x 
/usr/local/sihpc/bin/install_sihpc && \ + SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ + PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ + makeself --gzip /usr/local/sihpc \ + "${PACKAGE_FILENAME}" \ + "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ + ./bin/install_sihpc diff --git a/Dockerfile b/docker/Dockerfile.cuda13.x.ubuntu22.04 similarity index 83% rename from Dockerfile rename to docker/Dockerfile.cuda13.x.ubuntu22.04 index f71d72dd..1ca8a1c7 100644 --- a/Dockerfile +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -77,8 +77,7 @@ RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-$ # ------------------------- # 5. Build nccl-tests # ------------------------- -RUN cd /tmp && \ - git clone https://github.com/scitix/nccl-tests.git -b sync/upstream-20251216 && \ +RUN git clone https://github.com/scitix/nccl-tests.git -b sync/upstream-20251216 && \ cd nccl-tests && \ make MPI=1 MPI_HOME=/usr/local/sihpc && \ mkdir -p /usr/local/sihpc/libexec/nccl-tests && \ @@ -97,10 +96,10 @@ RUN set -eux && \ mkdir -p /usr/local/sihpc/lib && \ cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ && \ cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ -# cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/; \ -# cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/; \ -# cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/; \ -# cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/; \ +# cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \ +# cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \ +# cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \ +# cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/ # ------------------------- # 7. 
Fix library symlinks @@ -121,36 +120,15 @@ RUN cd /usr/local/sihpc/lib && \ # ln -sf libltdl.so.7.3.1 libltdl.so.7 && \ # ln -sf libltdl.so.7 libltdl.so -########################### -# Package Stage -########################### -FROM ubuntu:${UBUNTU_VERSION} AS package - -# Re-declare args for this stage (values are inherited) -ARG UBUNTU_VERSION -ARG NCCL_PACKAGE_VERSION -ARG MPI_VERSION -ARG BUILD_DATE - # Expose versions/date as environment variables for runtime shell expansion ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ MPI_VERSION=${MPI_VERSION} \ BUILD_DATE=${BUILD_DATE} -COPY --from=build /usr/local/sihpc /usr/local/sihpc - -WORKDIR /dist -RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ - chmod +x /usr/local/sihpc/bin/install_sihpc && \ - SAFE_NCCL_PKG="${NCCL_PACKAGE_VERSION//+/-}" && \ +WORKDIR / +RUN SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ makeself --gzip /usr/local/sihpc \ "${PACKAGE_FILENAME}" \ "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ - ./bin/install_sihpc && \ - rm -rf /usr/local/sihpc && \ - apt-get autoremove -y && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -CMD ["bash", "-c", "SAFE_NCCL_PKG=${NCCL_PACKAGE_VERSION//+/-}; FILE=\"sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run\"; ls -lh \"/build/$FILE\" && echo 'Build complete.'"] \ No newline at end of file + ./bin/install_sihpc \ No newline at end of file From 5eec99dc086d31ff99e62c2a88ea52dbe05cbbd5 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 15:12:18 +0800 Subject: [PATCH 12/26] add dockerfile and ci workflow (#1) --- .github/workflows/pre-check.yml | 29 +++++ .github/workflows/release.yml | 37 ++++++ docker/Dockerfile.cuda12.x.ubuntu20.04 | 153 +++++++++++++++++++++++++ 3 files changed, 219 insertions(+) create mode 100644 
.github/workflows/pre-check.yml create mode 100644 .github/workflows/release.yml create mode 100644 docker/Dockerfile.cuda12.x.ubuntu20.04 diff --git a/.github/workflows/pre-check.yml b/.github/workflows/pre-check.yml new file mode 100644 index 00000000..f293c611 --- /dev/null +++ b/.github/workflows/pre-check.yml @@ -0,0 +1,29 @@ +on: + pull_request: + workflow_dispatch: + +jobs: + build-only: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-buildx-action@v3 + - name: Free disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache + sudo docker system prune -af || true + df -h + - name: Build run package + run: | + docker buildx build \ + -f docker/Dockerfile.cuda12.x.ubuntu20.04 \ + --platform linux/amd64 \ + --target package \ + --output type=local,dest=dist \ + . + - name: List artifacts + run: | + ls -lh dist \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..86b42f8f --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,37 @@ +name: Release run installer + +on: + push: + tags: + - "v*" + +permissions: + contents: write + +jobs: + build-release: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + + - name: Setup Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build .run installer (Docker) + run: | + docker buildx build \ + -f docker/Dockerfile.cuda12.x.ubuntu20.04 \ + --platform linux/amd64 \ + --build-arg BUILD_DATE=$(date +%Y%m%d) \ + --output type=local,dest=dist \ + . 
+ + - name: List artifacts + run: ls -lh dist + + - name: Upload to GitHub Release + uses: softprops/action-gh-release@v2 + with: + files: | + dist/*.run diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 new file mode 100644 index 00000000..efe1a627 --- /dev/null +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -0,0 +1,153 @@ +########################### +# Build-time configuration +########################### + +# Base OS and CUDA versions +ARG UBUNTU_VERSION=20.04 +ARG CUDA_VERSION=12.8.1 +ARG CUDART_VERSION=12.8.90 +ARG CUDART_MAJOR_VERSION=12 + +# NCCL versions +ARG NCCL_PACKAGE_VERSION=2.27.7-1+cuda12.4 +ARG NCCL_SO_VERSION=2.27.7 + +# OpenMPI versions +# - MPI_VERSION: full OpenMPI version +# - MPI_SERIES: major.minor series used in download URL +ARG MPI_VERSION=4.1.8 +ARG MPI_SERIES=4.1 + +# Build date (override at build time) +ARG BUILD_DATE=20251221 + +########################### +# Build Stage +########################### +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS build + +# Re-declare build args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG CUDA_VERSION +ARG CUDART_VERSION +ARG NCCL_PACKAGE_VERSION +ARG NCCL_SO_VERSION +ARG MPI_VERSION +ARG MPI_SERIES +ARG BUILD_DATE + +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /workspace + +# ------------------------- +# 1. Base build dependencies +# ------------------------- +RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list || true && \ + apt-get -o Acquire::http::No-Cache=true update && \ + apt-get install -y --no-install-recommends \ + build-essential gcc g++ curl git wget ca-certificates \ + make automake autoconf libtool pkg-config \ + python3 python3-pip gzip xz-utils && \ + rm -rf /var/lib/apt/lists/* + +# ------------------------- +# 2. 
Install CUDA keyring and restore NVIDIA repository +# ------------------------- +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + apt-get update + +# ------------------------- +# 3. Install NCCL (pinned version) +# ------------------------- +RUN apt-mark unhold libnccl2 libnccl-dev || true && \ + apt-get install -y --no-install-recommends \ + libnccl2=${NCCL_PACKAGE_VERSION} \ + libnccl-dev=${NCCL_PACKAGE_VERSION} && \ + apt-mark hold libnccl2 libnccl-dev && \ + ldconfig && \ + rm -rf /var/lib/apt/lists/* + +# ------------------------- +# 4. Build OpenMPI from source +# ------------------------- +RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ + tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ + cd openmpi-${MPI_VERSION} && \ + ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda && \ + make -j$(nproc) && make install && \ + rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz + +# ------------------------- +# 5. Build nccl-tests +# ------------------------- +RUN git clone https://github.com/scitix/nccl-tests.git -b sicl && \ + cd nccl-tests && \ + make MPI=1 MPI_HOME=/usr/local/sihpc && \ + mkdir -p /usr/local/sihpc/libexec/nccl-tests && \ + cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \ + cp scripts/nccl_perf /usr/local/sihpc/bin/nccl_perf && \ + cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \ + cp scripts/env.sh /usr/local/sihpc/env.sh && \ + cp scripts/install_sihpc /usr/local/sihpc/bin/install_sihpc && \ + cp scripts/uninstall_sihpc /usr/local/sihpc/bin/uninstall_sihpc && \ + rm -rf /workspace/nccl-tests + +# ------------------------- +# 6. 
Collect runtime libraries (strict selection) +# ------------------------- +RUN set -eux && \ + mkdir -p /usr/local/sihpc/lib && \ + cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ + # cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \ + # cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \ + # cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \ + # cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/ + +# ------------------------- +# 7. Fix library symlinks +# ------------------------- +RUN cd /usr/local/sihpc/lib && \ + rm -f libcudart.so libcudart.so.12 && \ + ln -sf libnccl.so.2.27.7 libnccl.so.2 && \ + ln -sf libnccl.so.2 libnccl.so && \ + ln -sf libcudart.so.12.8.90 libcudart.so.12 && \ + ln -sf libcudart.so.12 libcudart.so + # rm -f libevent_core-2.1.so.7 && \ + # ln -sf libhwloc.so.15.1.0 libhwloc.so.15 && \ + # ln -sf libhwloc.so.15.1.0 libhwloc.so && \ + # ln -sf libevent_core-2.1.so.7.0.0 libevent_core-2.1.so.7 && \ + # ln -sf libevent_core-2.1.so.7 libevent_core-2.1.so && \ + # ln -sf libevent_pthreads-2.1.so.7.0.0 libevent_pthreads-2.1.so.7 && \ + # ln -sf libevent_pthreads-2.1.so.7 libevent_pthreads-2.1.so && \ + # ln -sf libltdl.so.7.3.1 libltdl.so.7 && \ + # ln -sf libltdl.so.7 libltdl.so + +########################### +# Package Stage +########################### +FROM ubuntu:20.04 AS package + +# Re-declare args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG NCCL_PACKAGE_VERSION +ARG MPI_VERSION +ARG BUILD_DATE + +# Expose versions/date as environment variables for runtime shell expansion +ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ + MPI_VERSION=${MPI_VERSION} \ + BUILD_DATE=${BUILD_DATE} + +COPY --from=build /usr/local/sihpc /usr/local/sihpc + +WORKDIR / +RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ + chmod +x 
/usr/local/sihpc/bin/install_sihpc && \ + SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ + PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ + makeself --gzip /usr/local/sihpc \ + "${PACKAGE_FILENAME}" \ + "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ + ./bin/install_sihpc From 42c1025d7dcb58fc64eb11bd09f73530bdfd3891 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 07:18:22 +0000 Subject: [PATCH 13/26] add free disk space in release.yml ci --- .github/workflows/release.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 86b42f8f..6cfd95d7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -18,6 +18,15 @@ jobs: - name: Setup Docker Buildx uses: docker/setup-buildx-action@v3 + - name: Free disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache + sudo docker system prune -af || true + df -h + - name: Build .run installer (Docker) run: | docker buildx build \ From b5c25a812cfbca87fd9b8cfeb6f8858df6f0ace2 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 07:49:52 +0000 Subject: [PATCH 14/26] delete duplicated pre-check.yml --- .github/workflows/pre-check.yml | 2 +- .github/workflows/pre_check.yml | 29 ----------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 .github/workflows/pre_check.yml diff --git a/.github/workflows/pre-check.yml b/.github/workflows/pre-check.yml index f293c611..0b8d4140 100644 --- a/.github/workflows/pre-check.yml +++ b/.github/workflows/pre-check.yml @@ -19,7 +19,7 @@ jobs: - name: Build run package run: | docker buildx build \ - -f docker/Dockerfile.cuda12.x.ubuntu20.04 \ + -f docker/Dockerfile.cuda13.x.ubuntu22.04 \ --platform linux/amd64 \ --target package \ --output 
type=local,dest=dist \ diff --git a/.github/workflows/pre_check.yml b/.github/workflows/pre_check.yml deleted file mode 100644 index 67bd7b4e..00000000 --- a/.github/workflows/pre_check.yml +++ /dev/null @@ -1,29 +0,0 @@ -on: - pull_request: - workflow_dispatch: - -jobs: - build-only: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v4 - - uses: docker/setup-buildx-action@v3 - - name: Free disk space - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /usr/local/lib/android - sudo rm -rf /opt/ghc - sudo rm -rf /opt/hostedtoolcache - sudo docker system prune -af || true - df -h - - name: Build run package - run: | - docker buildx build \ - -f docker/Dockerfile.cuda13.x.ubuntu22.04 \ - --platform linux/amd64 \ - --target package \ - --output type=local,dest=dist/cuda13.x.ubuntu22.04 \ - . - - name: List artifacts - run: | - ls -lh dist/cuda13.x.ubuntu22.04 \ No newline at end of file From 5fe638cb662cfca0446b0b32e9f4ad2304a3675b Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 08:02:26 +0000 Subject: [PATCH 15/26] update Dockerfile --- docker/Dockerfile.cuda13.x.ubuntu22.04 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index 1ca8a1c7..8e6a4539 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -24,7 +24,7 @@ ARG BUILD_DATE=20251221 ########################### # Build Stage ########################### -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} # Re-declare build args for this stage (values are inherited) ARG UBUNTU_VERSION From c9ace03f2d1ca0b36215dd945fc91b2933c3b0c7 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 08:12:47 +0000 Subject: [PATCH 16/26] update pre-check.yml --- .github/workflows/pre-check.yml | 2 +- docker/Dockerfile.cuda13.x.ubuntu22.04 | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/.github/workflows/pre-check.yml b/.github/workflows/pre-check.yml index 0b8d4140..f8f75d00 100644 --- a/.github/workflows/pre-check.yml +++ b/.github/workflows/pre-check.yml @@ -21,7 +21,7 @@ jobs: docker buildx build \ -f docker/Dockerfile.cuda13.x.ubuntu22.04 \ --platform linux/amd64 \ - --target package \ + --target build \ --output type=local,dest=dist \ . - name: List artifacts diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index 8e6a4539..1ca8a1c7 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -24,7 +24,7 @@ ARG BUILD_DATE=20251221 ########################### # Build Stage ########################### -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build # Re-declare build args for this stage (values are inherited) ARG UBUNTU_VERSION From e115bd8517a82a7c814f6579b960a928aef76ef7 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 08:39:06 +0000 Subject: [PATCH 17/26] fix bugs --- docker/Dockerfile.cuda12.x.ubuntu20.04 | 2 +- docker/Dockerfile.cuda13.x.ubuntu22.04 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 index efe1a627..c3122e6b 100644 --- a/docker/Dockerfile.cuda12.x.ubuntu20.04 +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -47,7 +47,7 @@ RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list || true && \ apt-get install -y --no-install-recommends \ build-essential gcc g++ curl git wget ca-certificates \ make automake autoconf libtool pkg-config \ - python3 python3-pip gzip xz-utils && \ + python3 python3-pip gzip xz-utils makeself && \ rm -rf /var/lib/apt/lists/* # ------------------------- diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index 
1ca8a1c7..b42a9a71 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -46,7 +46,7 @@ RUN apt-get -o Acquire::http::No-Cache=true update && \ apt-get install -y --no-install-recommends \ build-essential gcc g++ curl git wget ca-certificates \ make automake autoconf libtool pkg-config \ - python3 python3-pip gzip xz-utils && \ + python3 python3-pip gzip xz-utils makeself && \ rm -rf /var/lib/apt/lists/* # ------------------------- From d5d08b99cccbabd1c84bf59d35cb90a8218ab4f3 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 10:00:50 +0000 Subject: [PATCH 18/26] update Dockerfile --- docker/Dockerfile.cuda12.x.ubuntu20.04 | 20 +++----------------- docker/Dockerfile.cuda13.x.ubuntu22.04 | 6 ++++-- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 index c3122e6b..2b3a62d2 100644 --- a/docker/Dockerfile.cuda12.x.ubuntu20.04 +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -86,6 +86,7 @@ RUN git clone https://github.com/scitix/nccl-tests.git -b sicl && \ make MPI=1 MPI_HOME=/usr/local/sihpc && \ mkdir -p /usr/local/sihpc/libexec/nccl-tests && \ cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \ + mkdir -p /usr/local/sihpc/bin && \ cp scripts/nccl_perf /usr/local/sihpc/bin/nccl_perf && \ cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \ cp scripts/env.sh /usr/local/sihpc/env.sh && \ @@ -124,30 +125,15 @@ RUN cd /usr/local/sihpc/lib && \ # ln -sf libltdl.so.7.3.1 libltdl.so.7 && \ # ln -sf libltdl.so.7 libltdl.so -########################### -# Package Stage -########################### -FROM ubuntu:20.04 AS package - -# Re-declare args for this stage (values are inherited) -ARG UBUNTU_VERSION -ARG NCCL_PACKAGE_VERSION -ARG MPI_VERSION -ARG BUILD_DATE - # Expose versions/date as environment variables for runtime shell expansion ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ 
MPI_VERSION=${MPI_VERSION} \ BUILD_DATE=${BUILD_DATE} -COPY --from=build /usr/local/sihpc /usr/local/sihpc - WORKDIR / -RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ - chmod +x /usr/local/sihpc/bin/install_sihpc && \ - SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ +RUN SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ makeself --gzip /usr/local/sihpc \ "${PACKAGE_FILENAME}" \ "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ - ./bin/install_sihpc + ./bin/install_sihpc \ No newline at end of file diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index b42a9a71..bd3c1e64 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -72,7 +72,8 @@ RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-$ tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ cd openmpi-${MPI_VERSION} && \ ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda && \ - make -j$(nproc) && make install + make -j$(nproc) && make install && \ + rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz # ------------------------- # 5. Build nccl-tests @@ -87,7 +88,8 @@ RUN git clone https://github.com/scitix/nccl-tests.git -b sync/upstream-20251216 cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \ cp scripts/env.sh /usr/local/sihpc/env.sh && \ cp scripts/install_sihpc /usr/local/sihpc/bin/install_sihpc && \ - cp scripts/uninstall_sihpc /usr/local/sihpc/bin/uninstall_sihpc + cp scripts/uninstall_sihpc /usr/local/sihpc/bin/uninstall_sihpc && \ + rm -rf /workspace/nccl-tests # ------------------------- # 6. 
Collect runtime libraries (strict selection) From eb04f2127a832494394073cac975fdfb7bf69884 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 10:48:36 +0000 Subject: [PATCH 19/26] update dockerfile --- docker/Dockerfile.cuda12.x.ubuntu20.04 | 4 ++-- docker/Dockerfile.cuda13.x.ubuntu22.04 | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 index 2b3a62d2..66c68cd8 100644 --- a/docker/Dockerfile.cuda12.x.ubuntu20.04 +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -74,8 +74,8 @@ RUN apt-mark unhold libnccl2 libnccl-dev || true && \ RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ cd openmpi-${MPI_VERSION} && \ - ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda && \ - make -j$(nproc) && make install && \ + ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null && \ + make -j$(nproc) > /dev/null && make install && \ rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz # ------------------------- diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index bd3c1e64..23cd53f5 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -71,8 +71,8 @@ RUN apt-get install -y --no-install-recommends \ RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ cd openmpi-${MPI_VERSION} && \ - ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda && \ - make -j$(nproc) && make install && \ + ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null && \ + make -j$(nproc) > /dev/null && make install && \ rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz # 
------------------------- From 2225d3f05ef0d056eb6a9537af93cd080d5ce681 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 10:56:28 +0000 Subject: [PATCH 20/26] update dockefile --- .github/workflows/release.yml | 2 +- docker/Dockerfile.cuda12.x.ubuntu20.04 | 4 ++-- docker/Dockerfile.cuda13.x.ubuntu22.04 | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 737b6df7..e62bca7b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -39,7 +39,7 @@ jobs: sudo docker system prune -af || true df -h - - name: Build run package + - name: Build run package run: | BUILD_DATE=$(date +%Y%m%d) diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 index 66c68cd8..4ad5add0 100644 --- a/docker/Dockerfile.cuda12.x.ubuntu20.04 +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -74,8 +74,8 @@ RUN apt-mark unhold libnccl2 libnccl-dev || true && \ RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ cd openmpi-${MPI_VERSION} && \ - ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null && \ - make -j$(nproc) > /dev/null && make install && \ + ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null 2>&1 && \ + make -j$(nproc) > /dev/null 2>&1 && make install && \ rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz # ------------------------- diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index 23cd53f5..edf62853 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -71,8 +71,8 @@ RUN apt-get install -y --no-install-recommends \ RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ tar zxvf 
openmpi-${MPI_VERSION}.tar.gz && \ cd openmpi-${MPI_VERSION} && \ - ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null && \ - make -j$(nproc) > /dev/null && make install && \ + ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null 2>&1 && \ + make -j$(nproc) > /dev/null 2>&1 && make install && \ rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz # ------------------------- From a63433ce7009ad2ccbd7df08b7db64a588bd119d Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 12:00:18 +0000 Subject: [PATCH 21/26] update dockerfile --- docker/Dockerfile.cuda12.x.ubuntu20.04 | 2 +- docker/Dockerfile.cuda13.x.ubuntu22.04 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 index 4ad5add0..daf72289 100644 --- a/docker/Dockerfile.cuda12.x.ubuntu20.04 +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -136,4 +136,4 @@ RUN SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ makeself --gzip /usr/local/sihpc \ "${PACKAGE_FILENAME}" \ "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ - ./bin/install_sihpc \ No newline at end of file + ./bin/install_sihpc > /dev/null 2>&1 \ No newline at end of file diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index edf62853..c30cd48e 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -133,4 +133,4 @@ RUN SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ makeself --gzip /usr/local/sihpc \ "${PACKAGE_FILENAME}" \ "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ - ./bin/install_sihpc \ No newline at end of file + ./bin/install_sihpc > /dev/null 2>&1 \ No newline at end of file From 0b7aeaf09b61548c83b826caed93db2f768c1f4c Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 12:05:00 +0000 Subject: 
[PATCH 22/26] update dockerfile --- docker/Dockerfile.cuda12.x.ubuntu20.04 | 8 ++++---- docker/Dockerfile.cuda13.x.ubuntu22.04 | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 index daf72289..14279189 100644 --- a/docker/Dockerfile.cuda12.x.ubuntu20.04 +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -47,7 +47,7 @@ RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list || true && \ apt-get install -y --no-install-recommends \ build-essential gcc g++ curl git wget ca-certificates \ make automake autoconf libtool pkg-config \ - python3 python3-pip gzip xz-utils makeself && \ + python3 python3-pip gzip xz-utils makeself > /dev/null 2>&1 && \ rm -rf /var/lib/apt/lists/* # ------------------------- @@ -55,7 +55,7 @@ RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list || true && \ # ------------------------- RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb && \ dpkg -i cuda-keyring_1.1-1_all.deb && \ - apt-get update + apt-get update > /dev/null 2>&1 # ------------------------- # 3. Install NCCL (pinned version) @@ -63,7 +63,7 @@ RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86 RUN apt-mark unhold libnccl2 libnccl-dev || true && \ apt-get install -y --no-install-recommends \ libnccl2=${NCCL_PACKAGE_VERSION} \ - libnccl-dev=${NCCL_PACKAGE_VERSION} && \ + libnccl-dev=${NCCL_PACKAGE_VERSION} > /dev/null 2>&1 && \ apt-mark hold libnccl2 libnccl-dev && \ ldconfig && \ rm -rf /var/lib/apt/lists/* @@ -72,7 +72,7 @@ RUN apt-mark unhold libnccl2 libnccl-dev || true && \ # 4. 
Build OpenMPI from source # ------------------------- RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ - tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ + tar zxvf openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \ cd openmpi-${MPI_VERSION} && \ ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null 2>&1 && \ make -j$(nproc) > /dev/null 2>&1 && make install && \ diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index c30cd48e..b716db87 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -46,7 +46,7 @@ RUN apt-get -o Acquire::http::No-Cache=true update && \ apt-get install -y --no-install-recommends \ build-essential gcc g++ curl git wget ca-certificates \ make automake autoconf libtool pkg-config \ - python3 python3-pip gzip xz-utils makeself && \ + python3 python3-pip gzip xz-utils makeself > /dev/null 2>&1 && \ rm -rf /var/lib/apt/lists/* # ------------------------- @@ -54,14 +54,14 @@ RUN apt-get -o Acquire::http::No-Cache=true update && \ # ------------------------- RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \ dpkg -i cuda-keyring_1.1-1_all.deb && \ - apt-get update + apt-get update > /dev/null 2>&1 # ------------------------- # 3. Install NCCL (pinned version) # ------------------------- RUN apt-get install -y --no-install-recommends \ libnccl2=${NCCL_PACKAGE_VERSION} \ - libnccl-dev=${NCCL_PACKAGE_VERSION} && \ + libnccl-dev=${NCCL_PACKAGE_VERSION} > /dev/null 2>&1 && \ ldconfig && \ rm -rf /var/lib/apt/lists/* @@ -69,7 +69,7 @@ RUN apt-get install -y --no-install-recommends \ # 4. 
Build OpenMPI from source # ------------------------- RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ - tar zxvf openmpi-${MPI_VERSION}.tar.gz && \ + tar zxvf openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \ cd openmpi-${MPI_VERSION} && \ ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null 2>&1 && \ make -j$(nproc) > /dev/null 2>&1 && make install && \ From ea941cea480a942dab4ac0e7c3db4eb24bdb41d2 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 12:11:36 +0000 Subject: [PATCH 23/26] update dockerfile --- docker/Dockerfile.cuda12.x.ubuntu20.04 | 2 +- docker/Dockerfile.cuda13.x.ubuntu22.04 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 index 14279189..540eff0d 100644 --- a/docker/Dockerfile.cuda12.x.ubuntu20.04 +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -71,7 +71,7 @@ RUN apt-mark unhold libnccl2 libnccl-dev || true && \ # ------------------------- # 4. Build OpenMPI from source # ------------------------- -RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ +RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \ tar zxvf openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \ cd openmpi-${MPI_VERSION} && \ ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null 2>&1 && \ diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index b716db87..02890bb9 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -68,7 +68,7 @@ RUN apt-get install -y --no-install-recommends \ # ------------------------- # 4. 
Build OpenMPI from source # ------------------------- -RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz && \ +RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \ tar zxvf openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \ cd openmpi-${MPI_VERSION} && \ ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null 2>&1 && \ From 9b37f54680f0777c7037aa7a0bd82c2e80dce9d6 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 15:11:53 +0000 Subject: [PATCH 24/26] update dockerfile --- docker/Dockerfile.cuda13.x.ubuntu22.04 | 43 +++++++++++++------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index 02890bb9..ae389ea0 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -42,28 +42,28 @@ WORKDIR /workspace # ------------------------- # 1. Base build dependencies # ------------------------- -RUN apt-get -o Acquire::http::No-Cache=true update && \ +RUN { apt-get -o Acquire::http::No-Cache=true update > build.log 2>&1 && \ apt-get install -y --no-install-recommends \ build-essential gcc g++ curl git wget ca-certificates \ make automake autoconf libtool pkg-config \ - python3 python3-pip gzip xz-utils makeself > /dev/null 2>&1 && \ - rm -rf /var/lib/apt/lists/* + python3 python3-pip gzip xz-utils makeself >> build.log 2>&1 && \ + rm -rf /var/lib/apt/lists/* && rm -f build.log; } || (cat build.log && false) # ------------------------- # 2. 
Install CUDA keyring and restore NVIDIA repository # ------------------------- -RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \ - dpkg -i cuda-keyring_1.1-1_all.deb && \ - apt-get update > /dev/null 2>&1 +RUN { wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb > build.log 2>&1 && \ + dpkg -i cuda-keyring_1.1-1_all.deb >> build.log 2>&1 && \ + apt-get update >> build.log 2>&1 && rm -f build.log; } || (cat build.log && false) # ------------------------- # 3. Install NCCL (pinned version) # ------------------------- -RUN apt-get install -y --no-install-recommends \ +RUN { apt-get install -y --no-install-recommends \ libnccl2=${NCCL_PACKAGE_VERSION} \ - libnccl-dev=${NCCL_PACKAGE_VERSION} > /dev/null 2>&1 && \ - ldconfig && \ - rm -rf /var/lib/apt/lists/* + libnccl-dev=${NCCL_PACKAGE_VERSION} > build.log 2>&1 && \ + ldconfig >> build.log 2>&1 && \ + rm -rf /var/lib/apt/lists/* && rm -f build.log; } || (cat build.log && false) # ------------------------- # 4. Build OpenMPI from source @@ -72,15 +72,15 @@ RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-$ tar zxvf openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \ cd openmpi-${MPI_VERSION} && \ ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null 2>&1 && \ - make -j$(nproc) > /dev/null 2>&1 && make install && \ + make -j$(nproc) > /dev/null 2>&1 && make install > /dev/null 2>&1 && \ rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz # ------------------------- # 5. 
Build nccl-tests # ------------------------- -RUN git clone https://github.com/scitix/nccl-tests.git -b sync/upstream-20251216 && \ +RUN git clone --depth 1 --single-branch -b sync/upstream-20251216 https://github.com/scitix/nccl-tests.git > /dev/null 2>&1 && \ cd nccl-tests && \ - make MPI=1 MPI_HOME=/usr/local/sihpc && \ + { make MPI=1 MPI_HOME=/usr/local/sihpc > build.log 2>&1 && rm -f build.log || (cat build.log && false); } && \ mkdir -p /usr/local/sihpc/libexec/nccl-tests && \ cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \ mkdir -p /usr/local/sihpc/bin && \ @@ -94,10 +94,11 @@ RUN git clone https://github.com/scitix/nccl-tests.git -b sync/upstream-20251216 # ------------------------- # 6. Collect runtime libraries (strict selection) # ------------------------- -RUN set -eux && \ - mkdir -p /usr/local/sihpc/lib && \ - cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ && \ - cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ +RUN { set -e && \ + mkdir -p /usr/local/sihpc/lib > build.log 2>&1 && \ + cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ >> build.log 2>&1 && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ >> build.log 2>&1 && \ + rm -f build.log; } || (cat build.log && false) # cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \ # cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \ # cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \ @@ -130,7 +131,7 @@ ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ WORKDIR / RUN SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ - makeself --gzip /usr/local/sihpc \ - "${PACKAGE_FILENAME}" \ - "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ - ./bin/install_sihpc > /dev/null 2>&1 \ No newline at end of file + { makeself --gzip /usr/local/sihpc \ + 
"${PACKAGE_FILENAME}" \ + "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ + ./bin/install_sihpc > build.log 2>&1 && rm -f build.log; } || (cat build.log && false) \ No newline at end of file From db45ae9c3ddd08f28bd89f177c522007b172a840 Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 15:40:24 +0000 Subject: [PATCH 25/26] update dockerfile --- .github/workflows/pre-check.yml | 2 +- docker/Dockerfile.cuda12.x.ubuntu20.04 | 57 ++++++++++++++++---------- docker/Dockerfile.cuda13.x.ubuntu22.04 | 18 +++++++- 3 files changed, 53 insertions(+), 24 deletions(-) diff --git a/.github/workflows/pre-check.yml b/.github/workflows/pre-check.yml index f8f75d00..0b8d4140 100644 --- a/.github/workflows/pre-check.yml +++ b/.github/workflows/pre-check.yml @@ -21,7 +21,7 @@ jobs: docker buildx build \ -f docker/Dockerfile.cuda13.x.ubuntu22.04 \ --platform linux/amd64 \ - --target build \ + --target package \ --output type=local,dest=dist \ . - name: List artifacts diff --git a/docker/Dockerfile.cuda12.x.ubuntu20.04 b/docker/Dockerfile.cuda12.x.ubuntu20.04 index 540eff0d..b358aa5e 100644 --- a/docker/Dockerfile.cuda12.x.ubuntu20.04 +++ b/docker/Dockerfile.cuda12.x.ubuntu20.04 @@ -43,30 +43,30 @@ WORKDIR /workspace # 1. Base build dependencies # ------------------------- RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list || true && \ - apt-get -o Acquire::http::No-Cache=true update && \ + { apt-get -o Acquire::http::No-Cache=true update > build.log 2>&1 && \ apt-get install -y --no-install-recommends \ build-essential gcc g++ curl git wget ca-certificates \ make automake autoconf libtool pkg-config \ - python3 python3-pip gzip xz-utils makeself > /dev/null 2>&1 && \ - rm -rf /var/lib/apt/lists/* + python3 python3-pip gzip xz-utils >> build.log 2>&1 && \ + rm -rf /var/lib/apt/lists/* && rm -f build.log; } || (cat build.log && false) # ------------------------- # 2. 
Install CUDA keyring and restore NVIDIA repository # ------------------------- -RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb && \ - dpkg -i cuda-keyring_1.1-1_all.deb && \ - apt-get update > /dev/null 2>&1 +RUN { wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb > build.log 2>&1 && \ + dpkg -i cuda-keyring_1.1-1_all.deb >> build.log 2>&1 && \ + apt-get update >> build.log 2>&1 && rm -f build.log; } || (cat build.log && false) # ------------------------- # 3. Install NCCL (pinned version) # ------------------------- RUN apt-mark unhold libnccl2 libnccl-dev || true && \ - apt-get install -y --no-install-recommends \ + { apt-get install -y --no-install-recommends \ libnccl2=${NCCL_PACKAGE_VERSION} \ - libnccl-dev=${NCCL_PACKAGE_VERSION} > /dev/null 2>&1 && \ - apt-mark hold libnccl2 libnccl-dev && \ - ldconfig && \ - rm -rf /var/lib/apt/lists/* + libnccl-dev=${NCCL_PACKAGE_VERSION} > build.log 2>&1 && \ + apt-mark hold libnccl2 libnccl-dev >> build.log 2>&1 && \ + ldconfig >> build.log 2>&1 && \ + rm -rf /var/lib/apt/lists/* && rm -f build.log; } || (cat build.log && false) # ------------------------- # 4. Build OpenMPI from source @@ -75,15 +75,15 @@ RUN wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-$ tar zxvf openmpi-${MPI_VERSION}.tar.gz > /dev/null 2>&1 && \ cd openmpi-${MPI_VERSION} && \ ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda > /dev/null 2>&1 && \ - make -j$(nproc) > /dev/null 2>&1 && make install && \ + make -j$(nproc) > /dev/null 2>&1 && make install > /dev/null 2>&1 && \ rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz # ------------------------- # 5. 
Build nccl-tests # ------------------------- -RUN git clone https://github.com/scitix/nccl-tests.git -b sicl && \ +RUN { git clone --depth 1 --single-branch -b sicl https://github.com/scitix/nccl-tests.git > build.log 2>&1 && \ cd nccl-tests && \ - make MPI=1 MPI_HOME=/usr/local/sihpc && \ + make MPI=1 MPI_HOME=/usr/local/sihpc > build.log 2>&1 && rm -f build.log || (cat build.log && false); } && \ mkdir -p /usr/local/sihpc/libexec/nccl-tests && \ cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \ mkdir -p /usr/local/sihpc/bin && \ @@ -97,10 +97,11 @@ RUN git clone https://github.com/scitix/nccl-tests.git -b sicl && \ # ------------------------- # 6. Collect runtime libraries (strict selection) # ------------------------- -RUN set -eux && \ - mkdir -p /usr/local/sihpc/lib && \ - cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ && \ - cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ +RUN { set -e && \ + mkdir -p /usr/local/sihpc/lib > build.log 2>&1 && \ + cp /usr/local/cuda/lib64/libcudart* /usr/local/sihpc/lib/ >> build.log 2>&1 && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so* /usr/local/sihpc/lib/ >> build.log 2>&1 && \ + rm -f build.log; } || (cat build.log && false) # cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \ # cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \ # cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \ @@ -125,15 +126,29 @@ RUN cd /usr/local/sihpc/lib && \ # ln -sf libltdl.so.7.3.1 libltdl.so.7 && \ # ln -sf libltdl.so.7 libltdl.so +########################### +# Package Stage +########################### +FROM ubuntu:20.04 AS package + +# Re-declare args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG NCCL_PACKAGE_VERSION +ARG MPI_VERSION +ARG BUILD_DATE + # Expose versions/date as environment variables for runtime shell expansion ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ MPI_VERSION=${MPI_VERSION} \ BUILD_DATE=${BUILD_DATE} 
+COPY --from=build /usr/local/sihpc /usr/local/sihpc + WORKDIR / -RUN SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ +RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ + SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ - makeself --gzip /usr/local/sihpc \ + { makeself --gzip /usr/local/sihpc \ "${PACKAGE_FILENAME}" \ "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \ - ./bin/install_sihpc > /dev/null 2>&1 \ No newline at end of file + ./bin/install_sihpc > build.log 2>&1 && rm -f build.log; } || (cat build.log && false) \ No newline at end of file diff --git a/docker/Dockerfile.cuda13.x.ubuntu22.04 b/docker/Dockerfile.cuda13.x.ubuntu22.04 index ae389ea0..82cb17d5 100644 --- a/docker/Dockerfile.cuda13.x.ubuntu22.04 +++ b/docker/Dockerfile.cuda13.x.ubuntu22.04 @@ -46,7 +46,7 @@ RUN { apt-get -o Acquire::http::No-Cache=true update > build.log 2>&1 && \ apt-get install -y --no-install-recommends \ build-essential gcc g++ curl git wget ca-certificates \ make automake autoconf libtool pkg-config \ - python3 python3-pip gzip xz-utils makeself >> build.log 2>&1 && \ + python3 python3-pip gzip xz-utils >> build.log 2>&1 && \ rm -rf /var/lib/apt/lists/* && rm -f build.log; } || (cat build.log && false) # ------------------------- @@ -123,13 +123,27 @@ RUN cd /usr/local/sihpc/lib && \ # ln -sf libltdl.so.7.3.1 libltdl.so.7 && \ # ln -sf libltdl.so.7 libltdl.so +########################### +# Package Stage +########################### +FROM ubuntu:20.04 AS package + +# Re-declare args for this stage (values are inherited) +ARG UBUNTU_VERSION +ARG NCCL_PACKAGE_VERSION +ARG MPI_VERSION +ARG BUILD_DATE + # Expose versions/date as environment variables for runtime shell expansion ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \ MPI_VERSION=${MPI_VERSION} \ BUILD_DATE=${BUILD_DATE} +COPY 
--from=build /usr/local/sihpc /usr/local/sihpc + WORKDIR / -RUN SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ +RUN apt-get update && apt-get install -y --no-install-recommends makeself && \ + SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \ PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \ { makeself --gzip /usr/local/sihpc \ "${PACKAGE_FILENAME}" \ From 587d57c5ae4e82f1f0b11abb0d16b97733f1690f Mon Sep 17 00:00:00 2001 From: xlliu-scitix Date: Sun, 21 Dec 2025 16:05:17 +0000 Subject: [PATCH 26/26] update release.yml --- .github/workflows/release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e62bca7b..dd3e20a9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -18,10 +18,10 @@ jobs: matrix: include: - name: cuda13-ubuntu22.04 - dockerfile: docker/Dockerfile.cuda13.ubuntu22.04 + dockerfile: docker/Dockerfile.cuda13.x.ubuntu22.04 # - name: cuda12-ubuntu20.04 - # dockerfile: docker/Dockerfile.cuda12.ubuntu20.04 + # dockerfile: docker/Dockerfile.cuda12.x.ubuntu20.04 steps: - name: Checkout source