From 83f28c52e47c2d44acbc5c8454ea23ba70ecdb6f Mon Sep 17 00:00:00 2001 From: Debarshi Ray Date: Wed, 25 Sep 2024 18:25:56 +0200 Subject: [PATCH 1/2] build: Notify distributors that the '-z now' linker flag is unsupported The '-z now' flag, which is the opposite of '-z lazy', is unsupported as an external linker flag [1], because of how the NVIDIA Container Toolkit stack uses dlopen(3) to load libcuda.so.1 and libnvidia-ml.so.1 at runtime [2,3]. The NVIDIA Container Toolkit stack doesn't use dlsym(3) to obtain the address of a symbol at runtime before using it. At build-time, it links against undefined symbols that are made available through a CUDA API definition embedded directly in the CGO code or through a copy of nvml.h. It relies upon lazily deferring function call resolution to the point when dlopen(3) is able to load the shared libraries at runtime, instead of doing it when toolbox(1) is started. This is unlike how Toolbx itself uses dlopen(3) and dlsym(3) to load libsubid.so at runtime. Compare the output of: $ nm /path/to/toolbox | grep ' subid_init' ... with those from: $ nm /path/to/toolbox | grep ' nvmlGpuInstanceGetComputeInstanceProfileInfoV' U nvmlGpuInstanceGetComputeInstanceProfileInfoV $ nm /path/to/toolbox | grep ' nvmlDeviceGetAccountingPids' U nvmlDeviceGetAccountingPids Using '-z now' as an external linker flag forces the dynamic linker to resolve all symbols when toolbox(1) is started, and leads to: $ toolbox toolbox: symbol lookup error: toolbox: undefined symbol: nvmlGpuInstanceGetComputeInstanceProfileInfoV With the recent expansion of the test suite, it's necessary to increase the timeout for the Fedora nodes to prevent the CI from timing out. 
Fallout from 6e848b250b4cde98fb9a40b17421f1f54eacd8f3 [1] NVIDIA Container Toolkit commit 1407ace94ab7c150 https://github.com/NVIDIA/nvidia-container-toolkit/commit/1407ace94ab7c150 https://github.com/NVIDIA/go-nvml/issues/18 https://github.com/NVIDIA/nvidia-container-toolkit/issues/49 [2] https://github.com/NVIDIA/nvidia-container-toolkit/tree/main/internal/cuda [3] https://github.com/NVIDIA/go-nvml/blob/main/README.md https://github.com/NVIDIA/go-nvml/tree/main/pkg/dl https://github.com/NVIDIA/go-nvml/tree/main/pkg/nvml https://github.com/containers/toolbox/pull/1548 --- .zuul.yaml | 6 +++--- src/go-build-wrapper | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 637c38d0f..a09f85a95 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -51,7 +51,7 @@ - job: name: system-test-fedora-rawhide description: Run Toolbx's system tests in Fedora Rawhide - timeout: 7800 + timeout: 10800 nodeset: nodes: - name: fedora-rawhide @@ -62,7 +62,7 @@ - job: name: system-test-fedora-40 description: Run Toolbx's system tests in Fedora 40 - timeout: 7200 + timeout: 9000 nodeset: nodes: - name: fedora-40 @@ -73,7 +73,7 @@ - job: name: system-test-fedora-39 description: Run Toolbx's system tests in Fedora 39 - timeout: 7200 + timeout: 9000 nodeset: nodes: - name: fedora-39 diff --git a/src/go-build-wrapper b/src/go-build-wrapper index 13ddc127c..7968e3c9b 100755 --- a/src/go-build-wrapper +++ b/src/go-build-wrapper @@ -70,6 +70,48 @@ fi dynamic_linker="/run/host$dynamic_linker_canonical_dirname/$dynamic_linker_basename" +# Note for distributors: +# +# The '-z now' flag, which is the opposite of '-z lazy', is unsupported as an +# external linker flag [1], because of how the NVIDIA Container Toolkit stack +# uses dlopen(3) to load libcuda.so.1 and libnvidia-ml.so.1 at runtime [2,3]. +# +# The NVIDIA Container Toolkit stack doesn't use dlsym(3) to obtain the address +# of a symbol at runtime before using it. 
It links against undefined symbols +# at build-time available through a CUDA API definition embedded directly in +# the CGO code or a copy of nvml.h. It relies upon lazily deferring function +# call resolution to the point when dlopen(3) is able to load the shared +# libraries at runtime, instead of doing it when toolbox(1) is started. +# +# This is unlike how Toolbx itself uses dlopen(3) and dlsym(3) to load +# libsubid.so at runtime. +# +# Compare the output of: +# $ nm /path/to/toolbox | grep ' subid_init' +# +# ... with those from: +# $ nm /path/to/toolbox | grep ' nvmlGpuInstanceGetComputeInstanceProfileInfoV' +# U nvmlGpuInstanceGetComputeInstanceProfileInfoV +# $ nm /path/to/toolbox | grep ' nvmlDeviceGetAccountingPids' +# U nvmlDeviceGetAccountingPids +# +# Using '-z now' as an external linker flag forces the dynamic linker to +# resolve all symbols when toolbox(1) is started, and leads to: +# $ toolbox +# toolbox: symbol lookup error: toolbox: undefined symbol: +# nvmlGpuInstanceGetComputeInstanceProfileInfoV +# +# [1] NVIDIA Container Toolkit commit 1407ace94ab7c150 +# https://github.com/NVIDIA/nvidia-container-toolkit/commit/1407ace94ab7c150 +# https://github.com/NVIDIA/go-nvml/issues/18 +# https://github.com/NVIDIA/nvidia-container-toolkit/issues/49 +# +# [2] https://github.com/NVIDIA/nvidia-container-toolkit/tree/main/internal/cuda +# +# [3] https://github.com/NVIDIA/go-nvml/blob/main/README.md +# https://github.com/NVIDIA/go-nvml/tree/main/pkg/dl +# https://github.com/NVIDIA/go-nvml/tree/main/pkg/nvml + # shellcheck disable=SC2086 go build \ $tags \ From 66280a617ae7eaa29113aeb8e76cbcee6ac04feb Mon Sep 17 00:00:00 2001 From: Debarshi Ray Date: Tue, 24 Sep 2024 14:21:05 +0200 Subject: [PATCH 2/2] build: Use the same linker flags as NVIDIA Container Toolkit The previous commit explains how the NVIDIA Container Toolkit is sensitive to some linker flags. 
Therefore, use the same linker flags that are used by NVIDIA Container Toolkit to build the nvidia-cdi-hook, nvidia-ctk, etc. binaries, because they use the same Go APIs that toolbox(1) does [1]. It's better to use the same build configuration to prevent subtle bugs from creeping in. [1] NVIDIA Container Toolkit commit 772cf77dcc2347ce https://github.com/NVIDIA/nvidia-container-toolkit/commit/772cf77dcc2347ce https://github.com/NVIDIA/nvidia-container-toolkit/pull/333 https://github.com/containers/toolbox/pull/1548 --- src/go-build-wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/go-build-wrapper b/src/go-build-wrapper index 7968e3c9b..a5a1a6a50 100755 --- a/src/go-build-wrapper +++ b/src/go-build-wrapper @@ -116,7 +116,7 @@ dynamic_linker="/run/host$dynamic_linker_canonical_dirname/$dynamic_linker_basen go build \ $tags \ -trimpath \ - -ldflags "-extldflags '-Wl,-dynamic-linker,$dynamic_linker -Wl,-rpath,/run/host$libc_dir_canonical_dirname' -linkmode external -X github.com/containers/toolbox/pkg/version.currentVersion=$4" \ + -ldflags "-extldflags '-Wl,-dynamic-linker,$dynamic_linker -Wl,-rpath,/run/host$libc_dir_canonical_dirname -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files' -linkmode external -X github.com/containers/toolbox/pkg/version.currentVersion=$4" \ -o "$2/$3" exit "$?"