diff --git a/.zuul.yaml b/.zuul.yaml
index 637c38d0f..a09f85a95 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -51,7 +51,7 @@
 - job:
     name: system-test-fedora-rawhide
     description: Run Toolbx's system tests in Fedora Rawhide
-    timeout: 7800
+    timeout: 10800
     nodeset:
       nodes:
         - name: fedora-rawhide
@@ -62,7 +62,7 @@
 - job:
     name: system-test-fedora-40
     description: Run Toolbx's system tests in Fedora 40
-    timeout: 7200
+    timeout: 9000
     nodeset:
       nodes:
         - name: fedora-40
@@ -73,7 +73,7 @@
 - job:
     name: system-test-fedora-39
     description: Run Toolbx's system tests in Fedora 39
-    timeout: 7200
+    timeout: 9000
     nodeset:
       nodes:
         - name: fedora-39
diff --git a/src/go-build-wrapper b/src/go-build-wrapper
index 13ddc127c..a5a1a6a50 100755
--- a/src/go-build-wrapper
+++ b/src/go-build-wrapper
@@ -70,11 +70,53 @@ fi
 
 dynamic_linker="/run/host$dynamic_linker_canonical_dirname/$dynamic_linker_basename"
 
+# Note for distributors:
+#
+# The '-z now' flag, which is the opposite of '-z lazy', is unsupported as an
+# external linker flag [1], because of how the NVIDIA Container Toolkit stack
+# uses dlopen(3) to load libcuda.so.1 and libnvidia-ml.so.1 at runtime [2,3].
+#
+# The NVIDIA Container Toolkit stack doesn't use dlsym(3) to obtain the address
+# of a symbol at runtime before using it. It links against symbols that are
+# undefined at build time, but declared through a CUDA API definition embedded
+# directly in the CGO code or a copy of nvml.h. It relies upon lazily deferring
+# function call resolution to the point when dlopen(3) is able to load the
+# shared libraries at runtime, instead of doing it when toolbox(1) is started.
+#
+# This is unlike how Toolbx itself uses dlopen(3) and dlsym(3) to load
+# libsubid.so at runtime.
+#
+# Compare the output of:
+#   $ nm /path/to/toolbox | grep ' subid_init'
+#
+# ... with that from:
+#   $ nm /path/to/toolbox | grep ' nvmlGpuInstanceGetComputeInstanceProfileInfoV'
+#                    U nvmlGpuInstanceGetComputeInstanceProfileInfoV
+#   $ nm /path/to/toolbox | grep ' nvmlDeviceGetAccountingPids'
+#                    U nvmlDeviceGetAccountingPids
+#
+# Using '-z now' as an external linker flag forces the dynamic linker to
+# resolve all symbols when toolbox(1) is started, and leads to:
+#   $ toolbox
+#   toolbox: symbol lookup error: toolbox: undefined symbol:
+#       nvmlGpuInstanceGetComputeInstanceProfileInfoV
+#
+# [1] NVIDIA Container Toolkit commit 1407ace94ab7c150
+#     https://github.com/NVIDIA/nvidia-container-toolkit/commit/1407ace94ab7c150
+#     https://github.com/NVIDIA/go-nvml/issues/18
+#     https://github.com/NVIDIA/nvidia-container-toolkit/issues/49
+#
+# [2] https://github.com/NVIDIA/nvidia-container-toolkit/tree/main/internal/cuda
+#
+# [3] https://github.com/NVIDIA/go-nvml/blob/main/README.md
+#     https://github.com/NVIDIA/go-nvml/tree/main/pkg/dl
+#     https://github.com/NVIDIA/go-nvml/tree/main/pkg/nvml
+
 # shellcheck disable=SC2086
 go build \
         $tags \
         -trimpath \
-        -ldflags "-extldflags '-Wl,-dynamic-linker,$dynamic_linker -Wl,-rpath,/run/host$libc_dir_canonical_dirname' -linkmode external -X github.com/containers/toolbox/pkg/version.currentVersion=$4" \
+        -ldflags "-extldflags '-Wl,-dynamic-linker,$dynamic_linker -Wl,-rpath,/run/host$libc_dir_canonical_dirname -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files' -linkmode external -X github.com/containers/toolbox/pkg/version.currentVersion=$4" \
         -o "$2/$3"
 
 exit "$?"