Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
fd83f7c
tweak build and test scripts for SiCL.
zqan9 Nov 14, 2024
395e345
add ucommd get_bw for checking the perf results (busbw).
zqan9 Nov 14, 2024
10a48eb
fix: get NODE_NAME first during ucommd initialization.
zqan9 Dec 4, 2024
38bf3cd
update copyright notice.
zqan9 Dec 4, 2024
1fb35de
enlarge limit of opened file descriptors to run AllToAll test.
zqan9 Feb 5, 2025
8ab1ad2
Merge remote-tracking branch 'upstream/master' into sync/upstream-202…
Dec 16, 2025
351b3c1
add scripts
Dec 19, 2025
a922a05
update scripts
Dec 19, 2025
4aebbcb
add Dockerfile
xlliu-scitix Dec 21, 2025
7dce185
update Dockerfile
xlliu-scitix Dec 21, 2025
a7444d2
chore(scripts): add install scripts
xlliu-scitix Dec 21, 2025
cc230e6
add github workflow
xlliu-scitix Dec 21, 2025
5eec99d
add dockerfile and ci workflow (#1)
xlliu-scitix Dec 21, 2025
42c1025
add free disk space in release.yml ci
xlliu-scitix Dec 21, 2025
e9e9a27
Merge branch 'sicl' into sync/upstream-20251216
xlliu-scitix Dec 21, 2025
b5c25a8
delete duplicated pre-check.yml
xlliu-scitix Dec 21, 2025
5fe638c
update Dockerfile
xlliu-scitix Dec 21, 2025
c9ace03
update pre-check.yml
xlliu-scitix Dec 21, 2025
e115bd8
fix bugs
xlliu-scitix Dec 21, 2025
d5d08b9
update Dockerfile
xlliu-scitix Dec 21, 2025
eb04f21
update dockerfile
xlliu-scitix Dec 21, 2025
2225d3f
update dockefile
xlliu-scitix Dec 21, 2025
a63433c
update dockerfile
xlliu-scitix Dec 21, 2025
0b7aeaf
update dockerfile
xlliu-scitix Dec 21, 2025
ea941ce
update dockerfile
xlliu-scitix Dec 21, 2025
9b37f54
update dockerfile
xlliu-scitix Dec 21, 2025
db45ae9
update dockerfile
xlliu-scitix Dec 21, 2025
587d57c
update release.yml
xlliu-scitix Dec 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/workflows/pre-check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Pre-check workflow: builds the "package" stage of the CUDA 13 / Ubuntu 22.04
# Dockerfile on every pull request (or on manual dispatch) and lists the
# exported files. Nothing is published.
on:
  pull_request:
  workflow_dispatch:

# The job only checks out the repository and runs a local build, so the
# GITHUB_TOKEN is restricted to read-only access.
permissions:
  contents: read

jobs:
  build-only:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
      # Remove large preinstalled toolchains from the hosted runner before the
      # build; `df -h` prints the resulting free space.
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache
          sudo docker system prune -af || true
          df -h
      # `--output type=local` exports the filesystem of the selected target
      # stage into ./dist on the runner.
      - name: Build run package
        run: |
          docker buildx build \
            -f docker/Dockerfile.cuda13.x.ubuntu22.04 \
            --platform linux/amd64 \
            --target package \
            --output type=local,dest=dist \
            .
      - name: List artifacts
        run: |
          ls -lh dist
61 changes: 61 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Release workflow: on every pushed tag matching "v*", build the portable
# installer for each matrix entry and attach the produced *.run files to the
# GitHub Release for that tag.
name: release

on:
  push:
    tags:
      - "v*"

# Write access to contents is required to create/update the release.
permissions:
  contents: write

jobs:
  build-and-release:
    name: build-run (${{ matrix.name }})
    runs-on: ubuntu-22.04

    strategy:
      fail-fast: false
      matrix:
        include:
          - name: cuda13-ubuntu22.04
            dockerfile: docker/Dockerfile.cuda13.x.ubuntu22.04

          # - name: cuda12-ubuntu20.04
          #   dockerfile: docker/Dockerfile.cuda12.x.ubuntu20.04

    steps:
      - name: Checkout source
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Remove large preinstalled toolchains from the hosted runner before the
      # build; `df -h` prints the resulting free space.
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache
          sudo docker system prune -af || true
          df -h

      - name: Build run package
        run: |
          # Explicit UTC so the date stamp does not depend on runner timezone.
          BUILD_DATE=$(date -u +%Y%m%d)

          docker buildx build \
            --platform linux/amd64 \
            --progress=plain \
            -f ${{ matrix.dockerfile }} \
            --build-arg BUILD_DATE="${BUILD_DATE}" \
            --output type=local,dest=dist/${{ matrix.name }} \
            .

          echo "Produced files:"
          ls -lh dist/${{ matrix.name }}

      - name: Upload run to GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          # Fail the job instead of publishing an empty release when the build
          # produced no installer.
          fail_on_unmatched_files: true
          files: |
            dist/${{ matrix.name }}/*.run
154 changes: 154 additions & 0 deletions docker/Dockerfile.cuda12.x.ubuntu20.04
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
###########################
# Build-time configuration
###########################
# ARGs declared before the first FROM are global defaults. A stage must
# re-declare an ARG (without a value) before it can use it.

# Base OS and CUDA versions
# - UBUNTU_VERSION: used in the generated .run file name.
#   NOTE(review): the base image tags in both FROM lines spell out 20.04
#   literally, so overriding this value alone does not change the base OS.
# - CUDA_VERSION: selects the nvidia/cuda devel image tag of the build stage.
# - CUDART_VERSION / CUDART_MAJOR_VERSION: full and major version of the CUDA
#   runtime library; keep in sync with the libcudart.so.<version> file names
#   handled in the "Fix library symlinks" step.
ARG UBUNTU_VERSION=20.04
ARG CUDA_VERSION=12.8.1
ARG CUDART_VERSION=12.8.90
ARG CUDART_MAJOR_VERSION=12

# NCCL versions
# - NCCL_PACKAGE_VERSION: exact apt version string pinned for libnccl2 and
#   libnccl-dev; also embedded in the .run file name.
# - NCCL_SO_VERSION: version suffix of the libnccl.so.<version> file; keep in
#   sync with NCCL_PACKAGE_VERSION.
ARG NCCL_PACKAGE_VERSION=2.27.7-1+cuda12.4
ARG NCCL_SO_VERSION=2.27.7

# OpenMPI versions
# - MPI_VERSION: full OpenMPI version
# - MPI_SERIES: major.minor series used in download URL
ARG MPI_VERSION=4.1.8
ARG MPI_SERIES=4.1

# Build date (override at build time)
# Used as the trailing component of the .run file name.
ARG BUILD_DATE=20251221

###########################
# Build Stage
###########################
# Compiles OpenMPI and nccl-tests and assembles everything under
# /usr/local/sihpc, which the package stage later copies out.
# NOTE(review): the "ubuntu20.04" part of the tag is a literal; it does not
# follow UBUNTU_VERSION.
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS build

# Re-declare build args for this stage (values are inherited)
ARG UBUNTU_VERSION
ARG CUDA_VERSION
ARG CUDART_VERSION
ARG NCCL_PACKAGE_VERSION
ARG NCCL_SO_VERSION
ARG MPI_VERSION
ARG MPI_SERIES
ARG BUILD_DATE

# Keep apt/dpkg from prompting during the RUN steps of this stage.
ENV DEBIAN_FRONTEND=noninteractive
# All downloads, source trees and the temporary build.log live here.
WORKDIR /workspace

# -------------------------
# 1. Base build dependencies
# -------------------------
# The CUDA apt source list shipped in the base image is moved aside first
# (best effort) so that this update/install only talks to the distro mirrors.
# Command output is captured in build.log and only printed when a command
# fails; on success the log is deleted together with the apt lists.
RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list || true && \
    { apt-get -o Acquire::http::No-Cache=true update > build.log 2>&1 && \
      apt-get install -y --no-install-recommends \
          autoconf \
          automake \
          build-essential \
          ca-certificates \
          curl \
          g++ \
          gcc \
          git \
          gzip \
          libtool \
          make \
          pkg-config \
          python3 \
          python3-pip \
          wget \
          xz-utils >> build.log 2>&1 && \
      rm -rf /var/lib/apt/lists/* && \
      rm -f build.log; } || (cat build.log && false)

# -------------------------
# 2. Install CUDA keyring and restore NVIDIA repository
# 3. Install NCCL (pinned version)
# -------------------------
# Both steps share one RUN so that `apt-get update` and the pinned
# `apt-get install` always execute in the same layer: a cached, outdated
# package index can otherwise make the exact-version install fail.
# Sequence:
#   - download and install the cuda-keyring package, then delete the .deb
#   - refresh the package index
#   - lift any existing hold on the NCCL packages (best effort), install the
#     version given by NCCL_PACKAGE_VERSION, and hold it again
#   - refresh the dynamic linker cache and drop the apt lists
# Output is captured in build.log and printed only if a command fails.
RUN { wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb > build.log 2>&1 && \
      dpkg -i cuda-keyring_1.1-1_all.deb >> build.log 2>&1 && \
      rm -f cuda-keyring_1.1-1_all.deb && \
      apt-get update >> build.log 2>&1 && \
      { apt-mark unhold libnccl2 libnccl-dev >> build.log 2>&1 || true; } && \
      apt-get install -y --no-install-recommends \
          libnccl2=${NCCL_PACKAGE_VERSION} \
          libnccl-dev=${NCCL_PACKAGE_VERSION} >> build.log 2>&1 && \
      apt-mark hold libnccl2 libnccl-dev >> build.log 2>&1 && \
      ldconfig >> build.log 2>&1 && \
      rm -rf /var/lib/apt/lists/* && \
      rm -f build.log; } || (cat build.log && false)

# -------------------------
# 4. Build OpenMPI from source
# -------------------------
# Downloads the OpenMPI release tarball, builds it with CUDA support and
# installs it into /usr/local/sihpc. Source tree and tarball are removed in
# the same layer.
# All command output goes to /workspace/build.log (absolute path, because the
# working directory changes into the source tree). On failure the last 200
# lines of that log are printed so the failing step is visible in the build
# output; on success the log is deleted.
RUN { wget https://download.open-mpi.org/release/open-mpi/v${MPI_SERIES}/openmpi-${MPI_VERSION}.tar.gz > /workspace/build.log 2>&1 && \
      tar zxf openmpi-${MPI_VERSION}.tar.gz >> /workspace/build.log 2>&1 && \
      cd openmpi-${MPI_VERSION} && \
      ./configure --prefix=/usr/local/sihpc --with-cuda=/usr/local/cuda >> /workspace/build.log 2>&1 && \
      make -j$(nproc) >> /workspace/build.log 2>&1 && \
      make install >> /workspace/build.log 2>&1 && \
      cd /workspace && \
      rm -rf /workspace/openmpi-${MPI_VERSION} /workspace/openmpi-${MPI_VERSION}.tar.gz && \
      rm -f /workspace/build.log; } || (tail -n 200 /workspace/build.log && false)

# -------------------------
# 5. Build nccl-tests
# -------------------------
# Clones the `sicl` branch of scitix/nccl-tests, builds it against the OpenMPI
# installed in /usr/local/sihpc, and copies the *_perf binaries plus the helper
# scripts into the sihpc tree.
# NOTE(review): the sources come from GitHub, not from the docker build
# context, so local/PR changes to the repository are not what gets compiled.
# The brace group runs in the current shell, so the `cd nccl-tests` still
# applies to the relative `build/` and `scripts/` paths used afterwards.
# Clone and make output share one log at an absolute path; it is printed if
# either command fails and removed on success.
RUN { git clone --depth 1 --single-branch -b sicl https://github.com/scitix/nccl-tests.git > /workspace/build.log 2>&1 && \
      cd nccl-tests && \
      make MPI=1 MPI_HOME=/usr/local/sihpc >> /workspace/build.log 2>&1 && \
      rm -f /workspace/build.log || (cat /workspace/build.log && false); } && \
    mkdir -p /usr/local/sihpc/libexec/nccl-tests && \
    cp -rf build/*_perf /usr/local/sihpc/libexec/nccl-tests/ && \
    mkdir -p /usr/local/sihpc/bin && \
    cp scripts/nccl_perf /usr/local/sihpc/bin/nccl_perf && \
    cp scripts/nccl_test /usr/local/sihpc/libexec/nccl-tests/nccl_test && \
    cp scripts/env.sh /usr/local/sihpc/env.sh && \
    cp scripts/install_sihpc /usr/local/sihpc/bin/install_sihpc && \
    cp scripts/uninstall_sihpc /usr/local/sihpc/bin/uninstall_sihpc && \
    rm -rf /workspace/nccl-tests

# -------------------------
# 6. Collect runtime libraries (strict selection)
# -------------------------
# Copies the CUDA runtime and NCCL libraries next to the OpenMPI install so
# the sihpc tree carries them itself. Output is captured in build.log and
# printed only when one of the commands fails.
RUN { set -e && \
      dest=/usr/local/sihpc/lib && \
      mkdir -p "${dest}" > build.log 2>&1 && \
      cp /usr/local/cuda/lib64/libcudart* "${dest}/" >> build.log 2>&1 && \
      cp /usr/lib/x86_64-linux-gnu/libnccl.so* "${dest}/" >> build.log 2>&1 && \
      rm -f build.log; } || (cat build.log && false)
# Disabled: additional system libraries that are currently not bundled.
# cp /lib/x86_64-linux-gnu/libltdl.so.7.3.1 /usr/local/sihpc/lib/ && \
# cp /usr/lib/x86_64-linux-gnu/libhwloc.so* /usr/local/sihpc/lib/ && \
# cp /usr/lib/x86_64-linux-gnu/libevent_core* /usr/local/sihpc/lib/ && \
# cp /usr/lib/x86_64-linux-gnu/libevent_pthreads* /usr/local/sihpc/lib/

# -------------------------
# 7. Fix library symlinks
# -------------------------
# Rebuilds the usual chain  lib.so -> lib.so.<major> -> lib.so.<full version>
# for NCCL and the CUDA runtime inside /usr/local/sihpc/lib.
# File names are derived from the NCCL_SO_VERSION and CUDART_VERSION build
# args (major version = text before the first dot, expanded by the shell), so
# a version bump only needs the ARG defaults changed.
# The `test -f` guards abort the build when the fully versioned file is
# missing, instead of silently shipping dangling symlinks.
RUN cd /usr/local/sihpc/lib && \
    test -f "libnccl.so.${NCCL_SO_VERSION}" && \
    test -f "libcudart.so.${CUDART_VERSION}" && \
    rm -f libcudart.so "libcudart.so.${CUDART_VERSION%%.*}" && \
    ln -sf "libnccl.so.${NCCL_SO_VERSION}" "libnccl.so.${NCCL_SO_VERSION%%.*}" && \
    ln -sf "libnccl.so.${NCCL_SO_VERSION%%.*}" libnccl.so && \
    ln -sf "libcudart.so.${CUDART_VERSION}" "libcudart.so.${CUDART_VERSION%%.*}" && \
    ln -sf "libcudart.so.${CUDART_VERSION%%.*}" libcudart.so
# Disabled: symlinks for the additional system libraries that are currently
# not bundled.
# rm -f libevent_core-2.1.so.7 && \
# ln -sf libhwloc.so.15.1.0 libhwloc.so.15 && \
# ln -sf libhwloc.so.15.1.0 libhwloc.so && \
# ln -sf libevent_core-2.1.so.7.0.0 libevent_core-2.1.so.7 && \
# ln -sf libevent_core-2.1.so.7 libevent_core-2.1.so && \
# ln -sf libevent_pthreads-2.1.so.7.0.0 libevent_pthreads-2.1.so.7 && \
# ln -sf libevent_pthreads-2.1.so.7 libevent_pthreads-2.1.so && \
# ln -sf libltdl.so.7.3.1 libltdl.so.7 && \
# ln -sf libltdl.so.7 libltdl.so

###########################
# Package Stage
###########################
# Wraps the /usr/local/sihpc tree from the build stage into a self-extracting
# makeself archive (*.run) written to the root directory of this stage.
FROM ubuntu:20.04 AS package

# Re-declare args for this stage (values are inherited)
ARG UBUNTU_VERSION
ARG NCCL_PACKAGE_VERSION
ARG MPI_VERSION
ARG BUILD_DATE

# Expose versions/date as environment variables for runtime shell expansion
ENV NCCL_PACKAGE_VERSION=${NCCL_PACKAGE_VERSION} \
    MPI_VERSION=${MPI_VERSION} \
    BUILD_DATE=${BUILD_DATE}

COPY --from=build /usr/local/sihpc /usr/local/sihpc

WORKDIR /
# - apt lists are removed in the same layer that created them.
# - '+' in the NCCL package version is replaced by '-' for the file name.
# - Only the makeself output is captured in build.log; the log is printed when
#   makeself itself fails, so an earlier apt failure no longer triggers a
#   `cat` on a log file that was never created.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends makeself && \
    rm -rf /var/lib/apt/lists/* && \
    SAFE_NCCL_PKG=$(printf '%s\n' "${NCCL_PACKAGE_VERSION}" | tr '+' '-') && \
    PACKAGE_FILENAME="sicl-nccl${SAFE_NCCL_PKG}-ompi${MPI_VERSION}-ubuntu${UBUNTU_VERSION}-${BUILD_DATE}.run" && \
    { makeself --gzip /usr/local/sihpc \
        "${PACKAGE_FILENAME}" \
        "SiHPC MPI + NCCL + NCCL-tests Portable Installer" \
        ./bin/install_sihpc > build.log 2>&1 || { cat build.log; exit 1; }; } && \
    rm -f build.log
Loading