Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 15 additions & 61 deletions cloudinit/sources/DataSourceAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -925,70 +925,24 @@ def _wait_for_nic_detach(self, nl_sock):
raise

@azure_ds_telemetry_reporter
def wait_for_link_up(self, ifname):
"""In cases where the link state is still showing down after a nic is
hot-attached, we can attempt to bring it up by forcing the hv_netvsc
drivers to query the link state by unbinding and then binding the
device. This function attempts infinitely until the link is up,
because we cannot proceed further until we have a stable link."""

if self.distro.networking.try_set_link_up(ifname):
report_diagnostic_event(
"The link %s is already up." % ifname, logger_func=LOG.info
)
return

LOG.debug("Attempting to bring %s up", ifname)

attempts = 0
LOG.info("Unbinding and binding the interface %s", ifname)
while True:
device_id = net.read_sys_net(ifname, "device/device_id")
if device_id is False or not isinstance(device_id, str):
raise RuntimeError("Unable to read device ID: %s" % device_id)
devicename = device_id.strip("{}")
util.write_file(
"/sys/bus/vmbus/drivers/hv_netvsc/unbind", devicename
)
util.write_file(
"/sys/bus/vmbus/drivers/hv_netvsc/bind", devicename
)

attempts = attempts + 1
def wait_for_link_up(
self, ifname: str, retries: int = 100, retry_sleep: float = 0.1
):
for i in range(retries):
if self.distro.networking.try_set_link_up(ifname):
msg = "The link %s is up after %s attempts" % (
ifname,
attempts,
report_diagnostic_event(
"The link %s is up." % ifname, logger_func=LOG.info
)
report_diagnostic_event(msg, logger_func=LOG.info)
return

if attempts % 10 == 0:
msg = "Link is not up after %d attempts to rebind" % attempts
report_diagnostic_event(msg, logger_func=LOG.info)
LOG.info(msg)

# It could take some time after rebind for the interface to be up.
# So poll for the status for some time before attempting to rebind
# again.
sleep_duration = 0.5
max_status_polls = 20
LOG.debug(
"Polling %d seconds for primary NIC link up after rebind.",
sleep_duration * max_status_polls,
)
break

for i in range(0, max_status_polls):
if self.distro.networking.is_up(ifname):
msg = (
"After %d attempts to rebind, link is up after "
"polling the link status %d times" % (attempts, i)
)
report_diagnostic_event(msg, logger_func=LOG.info)
LOG.debug(msg)
return
else:
sleep(sleep_duration)
if (i + 1) < retries:
sleep(retry_sleep)
else:
report_diagnostic_event(
"The link %s is not up after %f seconds, continuing anyways."
% (ifname, retries * retry_sleep),
logger_func=LOG.info,
)

@azure_ds_telemetry_reporter
def _create_report_ready_marker(self):
Expand Down
52 changes: 5 additions & 47 deletions tests/unittests/sources/test_azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3126,12 +3126,10 @@ def test_wait_for_link_up_returns_if_already_up(self, m_is_link_up):
dsa.wait_for_link_up("eth0")
self.assertEqual(1, m_is_link_up.call_count)

@mock.patch(MOCKPATH + "net.is_up", autospec=True)
@mock.patch(MOCKPATH + "util.write_file")
@mock.patch("cloudinit.net.read_sys_net", return_value="device-id")
@mock.patch("cloudinit.distros.networking.LinuxNetworking.try_set_link_up")
@mock.patch(MOCKPATH + "sleep")
def test_wait_for_link_up_checks_link_after_sleep(
self, m_try_set_link_up, m_read_sys_net, m_writefile, m_is_up
self, m_sleep, m_try_set_link_up
):
"""Waiting for link to be up should return immediately if the link is
already up."""
Expand All @@ -3141,50 +3139,10 @@ def test_wait_for_link_up_checks_link_after_sleep(
dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths)
m_try_set_link_up.return_value = False

callcount = 0

def is_up_mock(key):
nonlocal callcount
if callcount == 0:
callcount += 1
return False
return True

m_is_up.side_effect = is_up_mock

with mock.patch("cloudinit.sources.DataSourceAzure.sleep"):
dsa.wait_for_link_up("eth0")
self.assertEqual(2, m_try_set_link_up.call_count)
self.assertEqual(2, m_is_up.call_count)

@mock.patch(MOCKPATH + "util.write_file")
@mock.patch("cloudinit.net.read_sys_net", return_value="device-id")
@mock.patch("cloudinit.distros.networking.LinuxNetworking.try_set_link_up")
def test_wait_for_link_up_writes_to_device_file(
self, m_is_link_up, m_read_sys_net, m_writefile
):
"""Waiting for link to be up should return immediately if the link is
already up."""

distro_cls = distros.fetch("ubuntu")
distro = distro_cls("ubuntu", {}, self.paths)
dsa = dsaz.DataSourceAzure({}, distro=distro, paths=self.paths)

callcount = 0

def linkup(key):
nonlocal callcount
if callcount == 0:
callcount += 1
return False
return True

m_is_link_up.side_effect = linkup

dsa.wait_for_link_up("eth0")
self.assertEqual(2, m_is_link_up.call_count)
self.assertEqual(1, m_read_sys_net.call_count)
self.assertEqual(2, m_writefile.call_count)

self.assertEqual(100, m_try_set_link_up.call_count)
self.assertEqual(99 * [mock.call(0.1)], m_sleep.mock_calls)

@mock.patch(
"cloudinit.sources.helpers.netlink.create_bound_netlink_socket"
Expand Down