Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
230 changes: 111 additions & 119 deletions cloudinit/sources/DataSourceAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from cloudinit import net, sources, ssh_util, subp, util
from cloudinit.event import EventScope, EventType
from cloudinit.net import device_driver
from cloudinit.net.dhcp import NoDHCPLeaseError
from cloudinit.reporting import events
from cloudinit.sources.helpers import netlink
from cloudinit.sources.helpers.azure import (
Expand Down Expand Up @@ -890,36 +891,18 @@ def _create_report_ready_marker(self):
)

@azure_ds_telemetry_reporter
def _report_ready_if_needed(self):
"""Report ready to the platform if the marker file is not present,
and create the marker file.
def _report_ready_for_pps(self, lease: dict) -> None:
"""Report ready for PPS, creating the marker file upon completion.

:raises sources.InvalidMetaDataException: On error reporting ready.
"""
have_not_reported_ready = not os.path.isfile(
REPORTED_READY_MARKER_FILE
)
report_ready_succeeded = self._report_ready(lease=lease)
if not report_ready_succeeded:
msg = "Failed reporting ready while in the preprovisioning pool."
report_diagnostic_event(msg, logger_func=LOG.error)
raise sources.InvalidMetaDataException(msg)

if have_not_reported_ready:
report_diagnostic_event(
"Reporting ready before nic detach", logger_func=LOG.info
)
try:
with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease:
self._report_ready(lease=lease)
except Exception as e:
report_diagnostic_event(
"Exception reporting ready during "
"preprovisioning before nic detach: %s" % e,
logger_func=LOG.error,
)
raise
self._create_report_ready_marker()
else:
report_diagnostic_event(
"Already reported ready before nic detach."
" The marker file already exists: %s"
% REPORTED_READY_MARKER_FILE,
logger_func=LOG.error,
)
self._create_report_ready_marker()

@azure_ds_telemetry_reporter
def _check_if_nic_is_primary(self, ifname):
Expand Down Expand Up @@ -1113,7 +1096,20 @@ def _wait_for_all_nics_ready(self):
# Report ready if the marker file is not already present.
# The nic of the preprovisioned vm gets hot-detached as soon as
# we report ready. So no need to save the dhcp context.
self._report_ready_if_needed()
if not os.path.isfile(REPORTED_READY_MARKER_FILE):
try:
with EphemeralDHCPv4WithReporting(
azure_ds_reporter
) as lease:
self._report_ready_for_pps(lease)
except NoDHCPLeaseError as error:
report_diagnostic_event(
"DHCP failed while in provisioning pool",
logger_func=LOG.warning,
)
raise sources.InvalidMetaDataException(
"Failed to report ready while in provisioning pool."
) from error

has_nic_been_detached = bool(
os.path.isfile(REPROVISION_NIC_DETACHED_MARKER_FILE)
Expand Down Expand Up @@ -1159,8 +1155,7 @@ def _poll_imds(self):
self.imds_logging_threshold = 1
self.imds_poll_counter = 1
dhcp_attempts = 0
vnet_switched = False
return_val = None
reprovision_data = None

def exc_cb(msg, exception):
if isinstance(exception, UrlError):
Expand Down Expand Up @@ -1201,110 +1196,107 @@ def exc_cb(msg, exception):
# When the interface is hot-attached, we would have already
# done dhcp and set the dhcp context. In that case, skip
# the attempt to do dhcp.
is_ephemeral_ctx_present = self._ephemeral_dhcp_ctx is not None
msg = (
"Unexpected error. Dhcp context is not expected to be already "
"set when we need to wait for vnet switch"
)
if is_ephemeral_ctx_present and report_ready:
if self._ephemeral_dhcp_ctx is not None and report_ready:
report_diagnostic_event(msg, logger_func=LOG.error)
raise RuntimeError(msg)

while True:
if report_ready:
try:
# Since is_ephemeral_ctx_present is set only once, this ensures
# that with regular reprovisioning, dhcp is always done every
# time the loop runs.
if not is_ephemeral_ctx_present:
# Save our EphemeralDHCPv4 context to avoid repeated dhcp
# later when we report ready
self._ephemeral_dhcp_ctx = EphemeralDHCPv4WithReporting(
azure_ds_reporter
)
lease = self._ephemeral_dhcp_ctx.obtain_lease()

if vnet_switched:
dhcp_attempts += 1
if report_ready:
self._ephemeral_dhcp_ctx = EphemeralDHCPv4WithReporting(
azure_ds_reporter
)
lease = self._ephemeral_dhcp_ctx.obtain_lease()
nl_sock = netlink.create_bound_netlink_socket()
self._report_ready_for_pps(lease)

# Networking must remain up for netlink to detect
# media disconnect/connect.
LOG.debug(
"Wait for vnetswitch to happen on %s",
lease["interface"],
)
with events.ReportEventStack(
name="wait-for-media-disconnect-connect",
description="wait for vnet switch",
parent=azure_ds_reporter,
):
try:
nl_sock = netlink.create_bound_netlink_socket()
except netlink.NetlinkCreateSocketError as e:
report_diagnostic_event(
"Failed to create bound netlink socket: %s" % e,
logger_func=LOG.warning,
netlink.wait_for_media_disconnect_connect(
nl_sock, lease["interface"]
)
self._ephemeral_dhcp_ctx.clean_network()
break

report_ready_succeeded = self._report_ready(lease=lease)
if not report_ready_succeeded:
msg = (
"Failed reporting ready while in "
"the preprovisioning pool."
except AssertionError as e:
report_diagnostic_event(
"Error while waiting for vnet switch: %s" % e,
logger_func=LOG.error,
)
report_diagnostic_event(msg, logger_func=LOG.error)
self._ephemeral_dhcp_ctx.clean_network()
raise sources.InvalidMetaDataException(msg)

self._create_report_ready_marker()
report_ready = False

LOG.debug("Wait for vnetswitch to happen")
with events.ReportEventStack(
name="wait-for-media-disconnect-connect",
description="wait for vnet switch",
parent=azure_ds_reporter,
):
try:
netlink.wait_for_media_disconnect_connect(
nl_sock, lease["interface"]
)
except AssertionError as e:
report_diagnostic_event(
"Error while waiting for vnet switch: %s" % e,
logger_func=LOG.error,
)
break

vnet_switched = True
self._ephemeral_dhcp_ctx.clean_network()
else:
with events.ReportEventStack(
name="get-reprovision-data-from-imds",
description="get reprovision data from imds",
parent=azure_ds_reporter,
):
return_val = readurl(
url,
timeout=IMDS_TIMEOUT_IN_SECONDS,
headers=headers,
exception_cb=exc_cb,
infinite=True,
log_req_resp=False,
).contents
break
except UrlError:
# Teardown our EphemeralDHCPv4 context on failure as we retry
self._ephemeral_dhcp_ctx.clean_network()

# Also reset this flag which determines if we should do dhcp
# during retries.
is_ephemeral_ctx_present = False
except netlink.NetlinkCreateSocketError as e:
report_diagnostic_event(
"Failed to create bound netlink socket: %s" % e,
logger_func=LOG.warning,
)
raise sources.InvalidMetaDataException(
"Failed to report ready while in provisioning pool."
) from e
except NoDHCPLeaseError as e:
report_diagnostic_event(
"DHCP failed while in provisioning pool",
logger_func=LOG.warning,
)
raise sources.InvalidMetaDataException(
"Failed to report ready while in provisioning pool."
) from e
finally:
if nl_sock:
nl_sock.close()

if vnet_switched:
report_diagnostic_event(
"attempted dhcp %d times after reuse" % dhcp_attempts,
logger_func=LOG.debug,
)
report_diagnostic_event(
"polled imds %d times after reuse" % self.imds_poll_counter,
logger_func=LOG.debug,
)
# Teardown old network configuration.
self._ephemeral_dhcp_ctx.clean_network()
self._ephemeral_dhcp_ctx = None

while not reprovision_data:
if self._ephemeral_dhcp_ctx is None:
self._ephemeral_dhcp_ctx = EphemeralDHCPv4WithReporting(
azure_ds_reporter
)
dhcp_attempts += 1
try:
self._ephemeral_dhcp_ctx.obtain_lease()
except NoDHCPLeaseError:
continue

with events.ReportEventStack(
name="get-reprovision-data-from-imds",
description="get reprovision data from imds",
parent=azure_ds_reporter,
):
try:
reprovision_data = readurl(
url,
timeout=IMDS_TIMEOUT_IN_SECONDS,
headers=headers,
exception_cb=exc_cb,
infinite=True,
log_req_resp=False,
).contents
except UrlError:
self._ephemeral_dhcp_ctx.clean_network()
self._ephemeral_dhcp_ctx = None
continue

report_diagnostic_event(
"attempted dhcp %d times after reuse" % dhcp_attempts,
logger_func=LOG.debug,
)
report_diagnostic_event(
"polled imds %d times after reuse" % self.imds_poll_counter,
logger_func=LOG.debug,
)

return return_val
return reprovision_data

@azure_ds_telemetry_reporter
def _report_failure(self, description: Optional[str] = None) -> bool:
Expand Down
5 changes: 1 addition & 4 deletions tests/unittests/sources/test_azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3213,10 +3213,8 @@ def setUp(self):
dsaz.BUILTIN_DS_CONFIG["data_dir"] = self.waagent_d

@mock.patch("time.sleep", mock.MagicMock())
@mock.patch(MOCKPATH + "EphemeralDHCPv4WithReporting", autospec=True)
def test_poll_imds_re_dhcp_on_timeout(
self,
m_dhcpv4,
m_report_ready,
m_request,
m_media_switch,
Expand All @@ -3236,7 +3234,6 @@ def test_poll_imds_re_dhcp_on_timeout(
m_media_switch.return_value = None
dhcp_ctx = mock.MagicMock(lease=lease)
dhcp_ctx.obtain_lease.return_value = lease
m_dhcpv4.return_value = dhcp_ctx

self.tries = 0

Expand All @@ -3260,7 +3257,7 @@ def fake_timeout_once(**kwargs):
dsa._poll_imds()
self.assertEqual(m_report_ready.call_count, 1)
m_report_ready.assert_called_with(lease=lease)
self.assertEqual(3, m_dhcpv4.call_count, "Expected 3 DHCP calls")
self.assertEqual(3, m_dhcp.call_count, "Expected 3 DHCP calls")
self.assertEqual(4, self.tries, "Expected 4 total reads from IMDS")

@mock.patch("os.path.isfile")
Expand Down