From b1cc7d95aa4e35207ac42a5dcc39faece78a7994 Mon Sep 17 00:00:00 2001 From: Aswin Rajamannar Date: Fri, 20 Aug 2021 12:19:58 -0700 Subject: [PATCH 1/3] Modify sleep logic in primary nic detection based on observation --- cloudinit/sources/DataSourceAzure.py | 28 +++++++++---------- tests/unittests/test_datasource/test_azure.py | 20 +++++++++---- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index ba23139b7e8..bbd40617b05 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -912,26 +912,26 @@ def wait_for_link_up(self, ifname): report_diagnostic_event(msg, logger_func=LOG.info) return - sleep_duration = 1 - msg = ("Link is not up after %d attempts with %d seconds sleep " - "between attempts." % (attempts, sleep_duration)) + msg = ("Link is not up after %d attempts to rebind" % attempts) if attempts % 10 == 0: report_diagnostic_event(msg, logger_func=LOG.info) else: LOG.info(msg) - sleep(sleep_duration) - - # Since we just did a unbind and bind, check again after sleep - # but before doing unbind and bind again to avoid races where the - # link might take a slight delay after bind to be up. - if self.distro.networking.is_up(ifname): - msg = ("Link is up after checking after sleeping for %d secs" - " after %d attempts" % - (sleep_duration, attempts)) - report_diagnostic_event(msg, logger_func=LOG.info) - return + # It could take some time after rebind for the interface to be up. + # So poll for the status for some time before attempting to rebind + # again. + sleep_duration = 0.5 + max_status_polls = 20 + for i in range(0, max_status_polls): + if self.distro.networking.is_up(ifname): + msg = ("After %d attempts to rebind, link is up after " + "polling the link status %d times" % (attempts, i)) + report_diagnostic_event(msg, logger_func=LOG.info) + return + else: + sleep(sleep_duration) @azure_ds_telemetry_reporter def _create_report_ready_marker(self): diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index 03609c3de19..851cf82ef8d 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -2912,19 +2912,29 @@ def test_wait_for_link_up_returns_if_already_up( @mock.patch('cloudinit.net.read_sys_net') @mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up') def test_wait_for_link_up_checks_link_after_sleep( - self, m_is_link_up, m_read_sys_net, m_writefile, m_is_up): + self, m_try_set_link_up, m_read_sys_net, m_writefile, m_is_up): """Waiting for link to be up should return immediately if the link is already up.""" distro_cls = distros.fetch('ubuntu') distro = distro_cls('ubuntu', {}, self.paths) dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths) - m_is_link_up.return_value = False - m_is_up.return_value = True + m_try_set_link_up.return_value = False + + callcount = 0 + + def is_up_mock(key): + nonlocal callcount + if callcount == 0: + callcount += 1 + return False + return True + + m_is_up.side_effect = is_up_mock dsa.wait_for_link_up("eth0") - self.assertEqual(2, m_is_link_up.call_count) - self.assertEqual(1, m_is_up.call_count) + self.assertEqual(2, m_try_set_link_up.call_count) + self.assertEqual(2, m_is_up.call_count) @mock.patch(MOCKPATH + 'util.write_file') @mock.patch('cloudinit.net.read_sys_net') From bfe55aa3c935e9df6290a004168216a859645e59 Mon Sep 17 00:00:00 2001 From: aswinrajamannar <39812128+aswinrajamannar@users.noreply.github.com> Date: Fri, 20 Aug 2021 13:41:46 -0700 Subject: [PATCH 2/3] Commit PR comment suggestion Co-authored-by: Chad Smith --- cloudinit/sources/DataSourceAzure.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index bbd40617b05..921a1e09e8e 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -924,6 +924,7 @@ def wait_for_link_up(self, ifname): # again. sleep_duration = 0.5 max_status_polls = 20 + LOG.debug("Polling %d seconds for primary NIC link up after rebind.", sleep_duration * max_status_polls) for i in range(0, max_status_polls): if self.distro.networking.is_up(ifname): msg = ("After %d attempts to rebind, link is up after " From a0074da932f420fc68f2c28e9928449dabf8dab5 Mon Sep 17 00:00:00 2001 From: Aswin Rajamannar Date: Fri, 20 Aug 2021 14:22:01 -0700 Subject: [PATCH 3/3] Address logging comments --- cloudinit/sources/DataSourceAzure.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index 921a1e09e8e..fddfe363abd 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -892,12 +892,12 @@ def wait_for_link_up(self, ifname): logger_func=LOG.info) return - LOG.info("Attempting to bring %s up", ifname) + LOG.debug("Attempting to bring %s up", ifname) attempts = 0 + LOG.info("Unbinding and binding the interface %s", ifname) while True: - LOG.info("Unbinding and binding the interface %s", ifname) devicename = net.read_sys_net(ifname, 'device/device_id').strip('{}') util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind', @@ -912,11 +912,9 @@ def wait_for_link_up(self, ifname): report_diagnostic_event(msg, logger_func=LOG.info) return - msg = ("Link is not up after %d attempts to rebind" % attempts) - if attempts % 10 == 0: + msg = ("Link is not up after %d attempts to rebind" % attempts) report_diagnostic_event(msg, logger_func=LOG.info) - else: LOG.info(msg) # It could take some time after rebind for the interface to be up. @@ -924,12 +922,15 @@ def wait_for_link_up(self, ifname): # again. sleep_duration = 0.5 max_status_polls = 20 - LOG.debug("Polling %d seconds for primary NIC link up after rebind.", sleep_duration * max_status_polls) + LOG.debug("Polling %d seconds for primary NIC link up after " + "rebind.", sleep_duration * max_status_polls) + for i in range(0, max_status_polls): if self.distro.networking.is_up(ifname): msg = ("After %d attempts to rebind, link is up after " "polling the link status %d times" % (attempts, i)) report_diagnostic_event(msg, logger_func=LOG.info) + LOG.debug(msg) return else: sleep(sleep_duration)