From 2a2b7ee41502e182d57cc1389746edd774c031a0 Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Mon, 14 Mar 2022 08:15:26 -0400 Subject: [PATCH] sources/azure: remove bind/unbind logic for hot attached nic Wait up to 10 seconds for link to come up before continuing. This typically takes just a few seconds once the NIC is hotplugged. If it takes longer than 10 seconds for whatever reason, dhclient should eventually succeed on its next attempt after the link does come online. Signed-off-by: Chris Patterson --- cloudinit/sources/DataSourceAzure.py | 76 ++++++--------------------- tests/unittests/sources/test_azure.py | 52 ++---------------- 2 files changed, 20 insertions(+), 108 deletions(-) diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index 85f0211bfb6..80b9226328d 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -925,70 +925,24 @@ def _wait_for_nic_detach(self, nl_sock): raise @azure_ds_telemetry_reporter - def wait_for_link_up(self, ifname): - """In cases where the link state is still showing down after a nic is - hot-attached, we can attempt to bring it up by forcing the hv_netvsc - drivers to query the link state by unbinding and then binding the - device. This function attempts infinitely until the link is up, - because we cannot proceed further until we have a stable link.""" - - if self.distro.networking.try_set_link_up(ifname): - report_diagnostic_event( - "The link %s is already up." % ifname, logger_func=LOG.info - ) - return - - LOG.debug("Attempting to bring %s up", ifname) - - attempts = 0 - LOG.info("Unbinding and binding the interface %s", ifname) - while True: - device_id = net.read_sys_net(ifname, "device/device_id") - if device_id is False or not isinstance(device_id, str): - raise RuntimeError("Unable to read device ID: %s" % device_id) - devicename = device_id.strip("{}") - util.write_file( - "/sys/bus/vmbus/drivers/hv_netvsc/unbind", devicename - ) - util.write_file( - "/sys/bus/vmbus/drivers/hv_netvsc/bind", devicename - ) - - attempts = attempts + 1 + def wait_for_link_up( + self, ifname: str, retries: int = 100, retry_sleep: float = 0.1 + ): + for i in range(retries): if self.distro.networking.try_set_link_up(ifname): - msg = "The link %s is up after %s attempts" % ( - ifname, - attempts, + report_diagnostic_event( + "The link %s is up." % ifname, logger_func=LOG.info ) - report_diagnostic_event(msg, logger_func=LOG.info) - return - - if attempts % 10 == 0: - msg = "Link is not up after %d attempts to rebind" % attempts - report_diagnostic_event(msg, logger_func=LOG.info) - LOG.info(msg) - - # It could take some time after rebind for the interface to be up. - # So poll for the status for some time before attempting to rebind - # again. - sleep_duration = 0.5 - max_status_polls = 20 - LOG.debug( - "Polling %d seconds for primary NIC link up after rebind.", - sleep_duration * max_status_polls, - ) + break - for i in range(0, max_status_polls): - if self.distro.networking.is_up(ifname): - msg = ( - "After %d attempts to rebind, link is up after " - "polling the link status %d times" % (attempts, i) - ) - report_diagnostic_event(msg, logger_func=LOG.info) - LOG.debug(msg) - return - else: - sleep(sleep_duration) + if (i + 1) < retries: + sleep(retry_sleep) + else: + report_diagnostic_event( + "The link %s is not up after %f seconds, continuing anyways." + % (ifname, retries * retry_sleep), + logger_func=LOG.info, + ) @azure_ds_telemetry_reporter def _create_report_ready_marker(self): diff --git a/tests/unittests/sources/test_azure.py b/tests/unittests/sources/test_azure.py index 62e5134b709..ac87f5bea33 100644 --- a/tests/unittests/sources/test_azure.py +++ b/tests/unittests/sources/test_azure.py @@ -3126,12 +3126,10 @@ def test_wait_for_link_up_returns_if_already_up(self, m_is_link_up): dsa.wait_for_link_up("eth0") self.assertEqual(1, m_is_link_up.call_count) - @mock.patch(MOCKPATH + "net.is_up", autospec=True) - @mock.patch(MOCKPATH + "util.write_file") - @mock.patch("cloudinit.net.read_sys_net", return_value="device-id") @mock.patch("cloudinit.distros.networking.LinuxNetworking.try_set_link_up") + @mock.patch(MOCKPATH + "sleep") def test_wait_for_link_up_checks_link_after_sleep( - self, m_try_set_link_up, m_read_sys_net, m_writefile, m_is_up + self, m_sleep, m_try_set_link_up ): """Waiting for link to be up should return immediately if the link is already up.""" @@ -3141,50 +3139,10 @@ def test_wait_for_link_up_checks_link_after_sleep( dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) m_try_set_link_up.return_value = False - callcount = 0 - - def is_up_mock(key): - nonlocal callcount - if callcount == 0: - callcount += 1 - return False - return True - - m_is_up.side_effect = is_up_mock - - with mock.patch("cloudinit.sources.DataSourceAzure.sleep"): - dsa.wait_for_link_up("eth0") - self.assertEqual(2, m_try_set_link_up.call_count) - self.assertEqual(2, m_is_up.call_count) - - @mock.patch(MOCKPATH + "util.write_file") - @mock.patch("cloudinit.net.read_sys_net", return_value="device-id") - @mock.patch("cloudinit.distros.networking.LinuxNetworking.try_set_link_up") - def test_wait_for_link_up_writes_to_device_file( - self, m_is_link_up, m_read_sys_net, m_writefile - ): - """Waiting for link to be up should return immediately if the link is - already up.""" - - distro_cls = distros.fetch("ubuntu") - distro = distro_cls("ubuntu", {}, self.paths) - dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) - - callcount = 0 - - def linkup(key): - nonlocal callcount - if callcount == 0: - callcount += 1 - return False - return True - - m_is_link_up.side_effect = linkup - dsa.wait_for_link_up("eth0") - self.assertEqual(2, m_is_link_up.call_count) - self.assertEqual(1, m_read_sys_net.call_count) - self.assertEqual(2, m_writefile.call_count) + + self.assertEqual(100, m_try_set_link_up.call_count) + self.assertEqual(99 * [mock.call(0.1)], m_sleep.mock_calls) @mock.patch( "cloudinit.sources.helpers.netlink.create_bound_netlink_socket"