From 8e8bf3b4750f7827eb511d2fd966f531f898f400 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Wed, 24 Oct 2018 13:33:15 -0700 Subject: [PATCH 1/2] add resilience to nvidia driver install/config --- parts/k8s/kubernetesconfigs.sh | 14 +++++++------- parts/k8s/kubernetesinstalls.sh | 21 ++++++++++++--------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/parts/k8s/kubernetesconfigs.sh b/parts/k8s/kubernetesconfigs.sh index 4f9c7a8f1c..597924ad84 100755 --- a/parts/k8s/kubernetesconfigs.sh +++ b/parts/k8s/kubernetesconfigs.sh @@ -397,17 +397,17 @@ configAddons() { } configGPUDrivers() { - retrycmd_if_failure 10 1 60 sh $GPU_DEST/nvidia-drivers-$GPU_DV --silent --accept-license --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}" || exit $ERR_GPU_DRIVERS_START_FAIL + retrycmd_if_failure 30 1 120 sh $GPU_DEST/nvidia-drivers-$GPU_DV --silent --accept-license --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}" || exit $ERR_GPU_DRIVERS_START_FAIL echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf - ldconfig - umount -l /usr/lib/x86_64-linux-gnu - nvidia-modprobe -u -c0 - $GPU_DEST/bin/nvidia-smi - ldconfig + retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL + retrycmd_if_failure 120 5 25 umount -l /usr/lib/x86_64-linux-gnu || exit $ERR_GPU_DRIVERS_START_FAIL + retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL + retrycmd_if_failure 120 5 25 $GPU_DEST/bin/nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL + retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL } ensureGPUDrivers() { configGPUDrivers systemctlEnableAndStart nvidia-modprobe || exit $ERR_GPU_DRIVERS_START_FAIL - retrycmd_if_failure 5 10 60 systemctl restart kubelet + retrycmd_if_failure 30 1 120 systemctl restart kubelet || exit $ERR_GPU_DRIVERS_START_FAIL } diff --git a/parts/k8s/kubernetesinstalls.sh b/parts/k8s/kubernetesinstalls.sh index 2a77d9edab..043da43da1 100755 --- a/parts/k8s/kubernetesinstalls.sh +++ b/parts/k8s/kubernetesinstalls.sh @@ -31,19 +31,22 @@ function installDeps() { function installGPUDrivers() { rmmod nouveau echo blacklist nouveau >> /etc/modprobe.d/blacklist.conf - update-initramfs -u + retrycmd_if_failure_no_stats 120 5 25 update-initramfs -u || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT mkdir -p $GPU_DEST - retrycmd_if_failure_no_stats 180 1 5 curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey > /tmp/aptnvidia.gpg || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT - cat /tmp/aptnvidia.gpg | apt-key add - - retrycmd_if_failure_no_stats 180 1 5 curl -fsSL https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list > /tmp/nvidia-docker.list || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT - cat /tmp/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + retrycmd_if_failure_no_stats 120 5 25 curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey > /tmp/aptnvidia.gpg || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT + wait_for_apt_locks + retrycmd_if_failure 120 5 25 apt-key add /tmp/aptnvidia.gpg || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT + wait_for_apt_locks + retrycmd_if_failure_no_stats 120 5 25 curl -fsSL https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list > /tmp/nvidia-docker.list || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT + wait_for_apt_locks + retrycmd_if_failure_no_stats 120 5 25 cat /tmp/nvidia-docker.list > /etc/apt/sources.list.d/nvidia-docker.list apt_get_update retrycmd_if_failure 30 5 300 apt-get install -y linux-headers-$(uname -r) gcc make dkms || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT - retrycmd_if_failure 30 5 300 apt-get -o Dpkg::Options::="--force-confold" install -y nvidia-docker2=$NVIDIA_DOCKER_VERSION+docker$DOCKER_VERSION nvidia-container-runtime=$NVIDIA_CONTAINER_RUNTIME_VERSION+docker$DOCKER_VERSION || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT - pkill -SIGHUP dockerd - retrycmd_if_failure 5 10 60 curl -fLS https://us.download.nvidia.com/tesla/$GPU_DV/NVIDIA-Linux-x86_64-$GPU_DV.run -o $GPU_DEST/nvidia-drivers-$GPU_DV || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT + retrycmd_if_failure 30 5 300 apt-get -o Dpkg::Options::="--force-confold" install -y nvidia-docker2=${NVIDIA_DOCKER_VERSION}+docker${DOCKER_VERSION} nvidia-container-runtime=${NVIDIA_CONTAINER_RUNTIME_VERSION}+docker${DOCKER_VERSION} || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT + retrycmd_if_failure 120 5 25 pkill -SIGHUP dockerd || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT + retrycmd_if_failure 30 5 60 curl -fLS https://us.download.nvidia.com/tesla/$GPU_DV/NVIDIA-Linux-x86_64-${GPU_DV}.run -o ${GPU_DEST}/nvidia-drivers-${GPU_DV} || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT mkdir -p $GPU_DEST/lib64 $GPU_DEST/overlay-workdir - mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=$GPU_DEST/lib64,workdir=$GPU_DEST/overlay-workdir none /usr/lib/x86_64-linux-gnu + retrycmd_if_failure 120 5 25 mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=${GPU_DEST}/lib64,workdir=${GPU_DEST}/overlay-workdir none /usr/lib/x86_64-linux-gnu || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT } function installContainerRuntime() { From 59da8875785f1f97ffc0ccea0ca47a55e9cb728a Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Wed, 24 Oct 2018 15:23:05 -0700 Subject: [PATCH 2/2] =?UTF-8?q?don=E2=80=99t=20fail=20if=20no=20pre-existi?= =?UTF-8?q?ng=20mount?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- parts/k8s/kubernetesconfigs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parts/k8s/kubernetesconfigs.sh b/parts/k8s/kubernetesconfigs.sh index 597924ad84..157002ba2a 100755 --- a/parts/k8s/kubernetesconfigs.sh +++ b/parts/k8s/kubernetesconfigs.sh @@ -400,7 +400,7 @@ configGPUDrivers() { retrycmd_if_failure 30 1 120 sh $GPU_DEST/nvidia-drivers-$GPU_DV --silent --accept-license --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}" || exit $ERR_GPU_DRIVERS_START_FAIL echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL - retrycmd_if_failure 120 5 25 umount -l /usr/lib/x86_64-linux-gnu || exit $ERR_GPU_DRIVERS_START_FAIL + umount -l /usr/lib/x86_64-linux-gnu retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL retrycmd_if_failure 120 5 25 $GPU_DEST/bin/nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL