Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 20 additions & 18 deletions cloudinit/sources/DataSourceAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,12 +892,12 @@ def wait_for_link_up(self, ifname):
logger_func=LOG.info)
return

LOG.info("Attempting to bring %s up", ifname)
LOG.debug("Attempting to bring %s up", ifname)

attempts = 0
LOG.info("Unbinding and binding the interface %s", ifname)
while True:

LOG.info("Unbinding and binding the interface %s", ifname)
devicename = net.read_sys_net(ifname,
'device/device_id').strip('{}')
util.write_file('/sys/bus/vmbus/drivers/hv_netvsc/unbind',
Expand All @@ -912,26 +912,28 @@ def wait_for_link_up(self, ifname):
report_diagnostic_event(msg, logger_func=LOG.info)
return

sleep_duration = 1
msg = ("Link is not up after %d attempts with %d seconds sleep "
"between attempts." % (attempts, sleep_duration))

if attempts % 10 == 0:
msg = ("Link is not up after %d attempts to rebind" % attempts)
report_diagnostic_event(msg, logger_func=LOG.info)
else:
LOG.info(msg)

sleep(sleep_duration)

# Since we just did a unbind and bind, check again after sleep
# but before doing unbind and bind again to avoid races where the
# link might take a slight delay after bind to be up.
if self.distro.networking.is_up(ifname):
msg = ("Link is up after checking after sleeping for %d secs"
" after %d attempts" %
(sleep_duration, attempts))
report_diagnostic_event(msg, logger_func=LOG.info)
return
# It could take some time after rebind for the interface to be up.
# So poll for the status for some time before attempting to rebind
# again.
sleep_duration = 0.5
max_status_polls = 20
LOG.debug("Polling %d seconds for primary NIC link up after "
"rebind.", sleep_duration * max_status_polls)

for i in range(0, max_status_polls):
Comment thread
aswinrajamannar marked this conversation as resolved.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @aswinrajamannar for the quick fix, given that this polling on networking.is_up seems to be generally helpful, do you think it would make sense to either define a local function to wrap self.distro-networking.is_up and the polling logic into a simple function that we can also call above before the while True loop on the initial self.distro.networking.try_set_link_up check?

Copy link
Copy Markdown
Collaborator

@blackboxsw blackboxsw Aug 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But, maybe this retry logic doesn't really payoff on the first call to self.distro.networking.try_set_link_up in wait_for_link_up and only adds 10 seconds of delay to the whole process because we expect wait_for_link_up to succeed immediately. If first linkup attempt fails, then we do a deeper unbind/bind loop with retries

Copy link
Copy Markdown
Contributor Author

@aswinrajamannar aswinrajamannar Aug 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The polling is not needed before the loop. The polling is necessary here mainly because we wrote to hv_netvsc/unbind and hv_netvsc/bind, which is forcing the driver to refresh the state of the link. After doing the unbind & bind, it seems to take a few seconds for the link to be actually up that's why we wait for is_up before calling unbind & bind again.

In most of the cases, the link would be up after just one call to try_set_link_up so we don't need to waste time doing unbind and bind. That is why it is added before the while loop. If that first check doesn't bring the link up, simply waiting longer without doing unbind and bind is not going to help and it would be an unnecessary delay. Hope this clarifies a little bit..? Or let me know if I misunderstood your comment.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 this general polling looks like it could be reasonable in for more general use-cases down in https://github.com/canonical/cloud-init/blob/main/cloudinit/distros/networking.py#L226 if we were to add an optional "retries" parameter or something that if set, we'd retry the check on networking.is_up for however many retries are provided. I don't think this needs to be fixed in this branch though, but it might be something we request on a followup PR.

if self.distro.networking.is_up(ifname):
msg = ("After %d attempts to rebind, link is up after "
"polling the link status %d times" % (attempts, i))
report_diagnostic_event(msg, logger_func=LOG.info)
LOG.debug(msg)
return
else:
sleep(sleep_duration)

@azure_ds_telemetry_reporter
def _create_report_ready_marker(self):
Expand Down
20 changes: 15 additions & 5 deletions tests/unittests/test_datasource/test_azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2912,19 +2912,29 @@ def test_wait_for_link_up_returns_if_already_up(
@mock.patch('cloudinit.net.read_sys_net')
@mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up')
def test_wait_for_link_up_checks_link_after_sleep(
self, m_is_link_up, m_read_sys_net, m_writefile, m_is_up):
self, m_try_set_link_up, m_read_sys_net, m_writefile, m_is_up):
"""Waiting for link to be up should return immediately if the link is
already up."""

distro_cls = distros.fetch('ubuntu')
distro = distro_cls('ubuntu', {}, self.paths)
dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths)
m_is_link_up.return_value = False
m_is_up.return_value = True
m_try_set_link_up.return_value = False

callcount = 0

def is_up_mock(key):
nonlocal callcount
if callcount == 0:
callcount += 1
return False
return True

m_is_up.side_effect = is_up_mock

dsa.wait_for_link_up("eth0")
self.assertEqual(2, m_is_link_up.call_count)
self.assertEqual(1, m_is_up.call_count)
self.assertEqual(2, m_try_set_link_up.call_count)
self.assertEqual(2, m_is_up.call_count)

@mock.patch(MOCKPATH + 'util.write_file')
@mock.patch('cloudinit.net.read_sys_net')
Expand Down