From 674e69292a725629a2aab026f948c6a72f818736 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Mon, 22 May 2023 12:49:09 -0400 Subject: [PATCH 1/4] overlay: disable iscsi.service by default `iscsi.service` has `Before=remote-fs-pre.target` *and* `After=network-online.target`. This forces `remote-fs-pre.target` to block on `network-online.target` and hence in OCP, on `ovs-configuration.service` (which has `Before=network-online.target`). So this transitively makes `systemd-user-sessions.service` block on `network-online.target`. This was an issue in Fedora as well and was discussed in a devel thread[[1]]. `iscsi.service` was subsequently reworked[[2]][[3]] so that it was only activated if iSCSI was actually used by the system. On RHEL 8, `iscsi.service` and co. were directly enabled by RPM scriptlets rather than using presets. In RHCOS, we explicitly make presets canonical[[4]] so we shipped with `iscsi.service` disabled by default. On RHEL 9, the units were fixed to use presets[[5]]. This is why we started seeing this issue after moving to RHEL 9. So all we need in theory is to have the Fedora patch backported to RHEL 9. However, since we don't really need the functionality from `iscsi.service` by default in RHCOS, we can fast-track its (re-)disablement and not wait for the `iscsi-starter.service` workaround. Note that `iscsi.service` is only used to bring up iSCSI sessions marked for autostart in `/var/lib/iscsi/nodes` and is separate from `iscsid.service`, which is what actually manages the iSCSI connections. In OpenShift, we rely on the latter only (e.g. configured iSCSI PVCs are done by the kubelet directly calling out to `iscsiadm`). It's also separate from iSCSI devices that use host bus adapters, which are transparent to RHCOS/OCP. 
Fixes: https://issues.redhat.com/browse/OCPBUGS-11124 [1]: https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/HACVEJ3FMOIM2TOENOVH5CPOUNR7NCMS [2]: https://src.fedoraproject.org/rpms/iscsi-initiator-utils/c/1e689cd0c6667eca838c85975a1b7a070209e5ad [3]: https://src.fedoraproject.org/rpms/fedora-release/pull-request/246 [4]: https://github.com/coreos/fedora-coreos-config/blob/1553518214088a89d6a2360a6fcdddbd3915628a/manifests/ignition-and-ostree.yaml#L35-L44 [5]: https://bugzilla.redhat.com/show_bug.cgi?id=1930458 (cherry picked from commit b5c5a05853258678dca2b00e7a81ccdccf6700e3) --- .../system-preset/36-iscsid-disabled.preset | 6 ++++ tests/kola/systemd/network-online/config.bu | 32 +++++++++++++++++++ .../systemd/network-online/data/commonlib.sh | 1 + tests/kola/systemd/network-online/test.sh | 31 ++++++++++++++++++ 4 files changed, 70 insertions(+) create mode 100644 tests/kola/systemd/network-online/config.bu create mode 120000 tests/kola/systemd/network-online/data/commonlib.sh create mode 100755 tests/kola/systemd/network-online/test.sh diff --git a/overlay.d/07el9/usr/lib/systemd/system-preset/36-iscsid-disabled.preset b/overlay.d/07el9/usr/lib/systemd/system-preset/36-iscsid-disabled.preset index ccdaa8db3..b419980e7 100644 --- a/overlay.d/07el9/usr/lib/systemd/system-preset/36-iscsid-disabled.preset +++ b/overlay.d/07el9/usr/lib/systemd/system-preset/36-iscsid-disabled.preset @@ -1 +1,7 @@ disable iscsid.socket + +# force disable until the following patches are in c9s/el9: +# https://src.fedoraproject.org/rpms/iscsi-initiator-utils/c/1e689cd0c6667eca838c85975a1b7a070209e5ad +# https://src.fedoraproject.org/rpms/fedora-release/pull-request/246 +# then it'll be off by default and we won't need this +disable iscsi.service diff --git a/tests/kola/systemd/network-online/config.bu b/tests/kola/systemd/network-online/config.bu new file mode 100644 index 000000000..3d0b87696 --- /dev/null +++ 
b/tests/kola/systemd/network-online/config.bu @@ -0,0 +1,32 @@ +variant: fcos +version: 1.4.0 +systemd: + units: + - name: ovs-configuration.service + enabled: true + contents: | + [Unit] + Requires=openvswitch.service + After=NetworkManager-wait-online.service openvswitch.service + Before=network-online.target kubelet.service + + [Service] + Type=oneshot + RemainAfterExit=yes + ExecStart=sleep infinity + + [Install] + WantedBy=network-online.target + - name: kubelet.service + enabled: true + contents: | + [Unit] + Wants=network-online.target + After=network-online.target + + [Service] + Type=simple + ExecStart=sleep infinity + + [Install] + WantedBy=multi-user.target diff --git a/tests/kola/systemd/network-online/data/commonlib.sh b/tests/kola/systemd/network-online/data/commonlib.sh new file mode 120000 index 000000000..cdecd90f5 --- /dev/null +++ b/tests/kola/systemd/network-online/data/commonlib.sh @@ -0,0 +1 @@ +../../../../../fedora-coreos-config/tests/kola/data/commonlib.sh \ No newline at end of file diff --git a/tests/kola/systemd/network-online/test.sh b/tests/kola/systemd/network-online/test.sh new file mode 100755 index 000000000..176506f89 --- /dev/null +++ b/tests/kola/systemd/network-online/test.sh @@ -0,0 +1,31 @@ +#!/bin/bash +## kola: +## description: Verify that network-online.target doesn't block login +## tags: platform-independent +## # this really shouldn't take long; if it does, it's that we're hitting the +## # very issue we're testing for +## timeoutMin: 3 + +# https://github.com/openshift/os/pull/1279 +# https://issues.redhat.com/browse/OCPBUGS-11124 + +set -euo pipefail + +. $KOLA_EXT_DATA/commonlib.sh + +# The fact that we're here means that logins must be working since kola was able +# to SSH to start us. But let's do some sanity-checks to verify that the test +# was valid. 
+ +# verify that ovs-configuration is still activating +if [[ $(systemctl show ovs-configuration.service -p ActiveState) != "ActiveState=activating" ]]; then + systemctl status ovs-configuration.service + fatal "ovs-configuration.service isn't activating" +fi + +if [[ $(systemctl show network-online.target -p ActiveState) != "ActiveState=inactive" ]]; then + systemctl status network-online.target + fatal "network-online.target isn't inactive" +fi + +echo "ok network-online.target does not block login" From 9fdbd5ee995f7f51572d578c06473d4e9aae2014 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 26 May 2023 09:51:56 -0400 Subject: [PATCH 2/4] overlay: stop disabling `iscsid.socket` The iSCSI daemon is now socket-activated so that it's only running when needed rather than always enabled. We're breaking that by disabling `iscsid.socket`. This effectively reverts 929ac48 ("c9s: Disable iscsid.socket"). It's not certain why this was done, but it was likely to work around failing tests. These tests should be fixed now[[1]] so we should be able to stop doing this. 
[1]: https://github.com/coreos/coreos-assembler/pull/3275 (cherry picked from commit 8c9427029cd6da75756a797f04d85c51caebd4ec) --- .../usr/lib/systemd/system-preset/36-iscsid-disabled.preset | 2 -- 1 file changed, 2 deletions(-) diff --git a/overlay.d/07el9/usr/lib/systemd/system-preset/36-iscsid-disabled.preset b/overlay.d/07el9/usr/lib/systemd/system-preset/36-iscsid-disabled.preset index b419980e7..6d7f03d28 100644 --- a/overlay.d/07el9/usr/lib/systemd/system-preset/36-iscsid-disabled.preset +++ b/overlay.d/07el9/usr/lib/systemd/system-preset/36-iscsid-disabled.preset @@ -1,5 +1,3 @@ -disable iscsid.socket - # force disable until the following patches are in c9s/el9: # https://src.fedoraproject.org/rpms/iscsi-initiator-utils/c/1e689cd0c6667eca838c85975a1b7a070209e5ad # https://src.fedoraproject.org/rpms/fedora-release/pull-request/246 From e4553e9cf474f21cce1cffb32725d673be34214f Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Mon, 29 May 2023 09:19:08 -0400 Subject: [PATCH 3/4] tests/kola: delete ext.config.systemd.network-online test This was moved to FCOS[[1]]. 
[1]: https://github.com/coreos/fedora-coreos-config/pull/2437 (cherry picked from commit 096f0aed32d3a7e43daeee6d176b93521da9ee2f) --- tests/kola/systemd/network-online/config.bu | 32 ------------------- .../systemd/network-online/data/commonlib.sh | 1 - tests/kola/systemd/network-online/test.sh | 31 ------------------ 3 files changed, 64 deletions(-) delete mode 100644 tests/kola/systemd/network-online/config.bu delete mode 120000 tests/kola/systemd/network-online/data/commonlib.sh delete mode 100755 tests/kola/systemd/network-online/test.sh diff --git a/tests/kola/systemd/network-online/config.bu b/tests/kola/systemd/network-online/config.bu deleted file mode 100644 index 3d0b87696..000000000 --- a/tests/kola/systemd/network-online/config.bu +++ /dev/null @@ -1,32 +0,0 @@ -variant: fcos -version: 1.4.0 -systemd: - units: - - name: ovs-configuration.service - enabled: true - contents: | - [Unit] - Requires=openvswitch.service - After=NetworkManager-wait-online.service openvswitch.service - Before=network-online.target kubelet.service - - [Service] - Type=oneshot - RemainAfterExit=yes - ExecStart=sleep infinity - - [Install] - WantedBy=network-online.target - - name: kubelet.service - enabled: true - contents: | - [Unit] - Wants=network-online.target - After=network-online.target - - [Service] - Type=simple - ExecStart=sleep infinity - - [Install] - WantedBy=multi-user.target diff --git a/tests/kola/systemd/network-online/data/commonlib.sh b/tests/kola/systemd/network-online/data/commonlib.sh deleted file mode 120000 index cdecd90f5..000000000 --- a/tests/kola/systemd/network-online/data/commonlib.sh +++ /dev/null @@ -1 +0,0 @@ -../../../../../fedora-coreos-config/tests/kola/data/commonlib.sh \ No newline at end of file diff --git a/tests/kola/systemd/network-online/test.sh b/tests/kola/systemd/network-online/test.sh deleted file mode 100755 index 176506f89..000000000 --- a/tests/kola/systemd/network-online/test.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -## 
kola: -## description: Verify that network-online.target doesn't block login -## tags: platform-independent -## # this really shouldn't take long; if it does, it's that we're hitting the -## # very issue we're testing for -## timeoutMin: 3 - -# https://github.com/openshift/os/pull/1279 -# https://issues.redhat.com/browse/OCPBUGS-11124 - -set -euo pipefail - -. $KOLA_EXT_DATA/commonlib.sh - -# The fact that we're here means that logins must be working since kola was able -# to SSH to start us. But let's do some sanity-checks to verify that the test -# was valid. - -# verify that ovs-configuration is still activating -if [[ $(systemctl show ovs-configuration.service -p ActiveState) != "ActiveState=activating" ]]; then - systemctl status ovs-configuration.service - fatal "ovs-configuration.service isn't activating" -fi - -if [[ $(systemctl show network-online.target -p ActiveState) != "ActiveState=inactive" ]]; then - systemctl status network-online.target - fatal "network-online.target isn't inactive" -fi - -echo "ok network-online.target does not block login" From 5bb4a77462d480516795d302113fa2fea64c024d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Ravier?= Date: Thu, 8 Jun 2023 14:16:37 +0200 Subject: [PATCH 4/4] Bump fedora-coreos-config submodule See: https://github.com/coreos/fedora-coreos-config/pull/2448 ``` $ git -C fedora-coreos-config shortlog --no-merges \ 9fae403ff93a090ef4f7436eda8a0d5387b9c862..ed7f4f22c6db1fbcabad678267a747cc70cbc53a Dusty Mabe (1): tests/kola: wait longer in commonlib.sh is_service_active Jonathan Lebon (1): tests/kola: upstream network-online login test from RHCOS ``` --- fedora-coreos-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedora-coreos-config b/fedora-coreos-config index 9fae403ff..ed7f4f22c 160000 --- a/fedora-coreos-config +++ b/fedora-coreos-config @@ -1 +1 @@ -Subproject commit 9fae403ff93a090ef4f7436eda8a0d5387b9c862 +Subproject commit ed7f4f22c6db1fbcabad678267a747cc70cbc53a