From 706dcd055f71631f1a3b7889e4c69480a21d0b7f Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Thu, 21 Apr 2022 09:02:51 -0400 Subject: [PATCH] sources/azure: remove reprovisioning marker If we haven't reported ready for source PPS then we can treat the recovery boot like any other. The metadata on the OVF and IMDS will indicate the PPS type correctly as the state hasn't changed. If we have reported ready for source PPS, we continue to fall into _poll_imds() by way of setting pps_type to UNKNOWN if the REPORTED_READY_MARKER is present and will not attempt to report ready again. This fixes a potential issue when recovering on Savable PPS. If a recovery boot occurs after the recovery marker is created, and without reporting ready, the subsequent boot will assume pps type UNKNOWN and attempt to report ready in _poll_imds() using the Running PPS netlink operations. Add unit test coverage for complete recovery scenario. Signed-off-by: Chris Patterson --- cloudinit/sources/DataSourceAzure.py | 94 ++++++++++----------------- tests/unittests/sources/test_azure.py | 86 ++++++++++++++++++++---- 2 files changed, 109 insertions(+), 71 deletions(-) diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index fd846dd6d1c..d1bec85c84d 100755 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -60,7 +60,6 @@ DEFAULT_FS = "ext4" # DMI chassis-asset-tag is set static for all azure instances AZURE_CHASSIS_ASSET_TAG = "7783-7084-3265-9085-8269-3286-77" -REPROVISION_MARKER_FILE = "/var/lib/cloud/data/poll_imds" REPORTED_READY_MARKER_FILE = "/var/lib/cloud/data/reported_ready" AGENT_SEED_DIR = "/var/lib/waagent" DEFAULT_PROVISIONING_ISO_DEV = "/dev/sr0" @@ -487,50 +486,42 @@ def crawl_metadata(self): cfg = {} files = {} - if os.path.isfile(REPROVISION_MARKER_FILE): - metadata_source = "IMDS" - report_diagnostic_event( - "Reprovision marker file already present " - "before crawling Azure metadata: %s" % REPROVISION_MARKER_FILE, - logger_func=LOG.debug, - ) - else: - for src in list_possible_azure_ds(self.seed_dir, ddir): - try: - if src.startswith("/dev/"): - if util.is_FreeBSD(): - md, userdata_raw, cfg, files = util.mount_cb( - src, load_azure_ds_dir, mtype="udf" - ) - else: - md, userdata_raw, cfg, files = util.mount_cb( - src, load_azure_ds_dir - ) - # save the device for ejection later - self._iso_dev = src + for src in list_possible_azure_ds(self.seed_dir, ddir): + try: + if src.startswith("/dev/"): + if util.is_FreeBSD(): + md, userdata_raw, cfg, files = util.mount_cb( + src, load_azure_ds_dir, mtype="udf" + ) else: - md, userdata_raw, cfg, files = load_azure_ds_dir(src) - ovf_is_accessible = True - metadata_source = src - break - except NonAzureDataSource: - report_diagnostic_event( - "Did not find Azure data source in %s" % src, - logger_func=LOG.debug, - ) - continue - except util.MountFailedError: - report_diagnostic_event( - "%s was not mountable" % src, logger_func=LOG.debug - ) - md = {"local-hostname": ""} - cfg = {"system_info": {"default_user": {"name": ""}}} - metadata_source = "IMDS" - continue - except BrokenAzureDataSource as exc: - msg = "BrokenAzureDataSource: %s" % exc - report_diagnostic_event(msg, logger_func=LOG.error) - raise sources.InvalidMetaDataException(msg) + md, userdata_raw, cfg, files = util.mount_cb( + src, load_azure_ds_dir + ) + # save the device for ejection later + self._iso_dev = src + else: + md, userdata_raw, cfg, files = load_azure_ds_dir(src) + ovf_is_accessible = True + metadata_source = src + break + except NonAzureDataSource: + report_diagnostic_event( + "Did not find Azure data source in %s" % src, + logger_func=LOG.debug, + ) + continue + except util.MountFailedError: + report_diagnostic_event( + "%s was not mountable" % src, logger_func=LOG.debug + ) + md = {"local-hostname": ""} + cfg = {"system_info": {"default_user": {"name": ""}}} + metadata_source = "IMDS" + continue + except BrokenAzureDataSource as exc: + msg = "BrokenAzureDataSource: %s" % exc + report_diagnostic_event(msg, logger_func=LOG.error) + raise sources.InvalidMetaDataException(msg) report_diagnostic_event( "Found provisioning metadata in %s" % metadata_source, @@ -567,8 +558,6 @@ def crawl_metadata(self): report_diagnostic_event(msg, logger_func=LOG.error) raise sources.InvalidMetaDataException(msg) - self._write_reprovision_marker() - if pps_type == PPSType.SAVABLE: self._wait_for_all_nics_ready() @@ -1394,7 +1383,7 @@ def _ppstype_from_imds(self, imds_md: dict) -> Optional[str]: def _determine_pps_type(self, ovf_cfg: dict, imds_md: dict) -> PPSType: """Determine PPS type using OVF, IMDS data, and reprovision marker.""" - if os.path.isfile(REPROVISION_MARKER_FILE): + if os.path.isfile(REPORTED_READY_MARKER_FILE): pps_type = PPSType.UNKNOWN elif ( ovf_cfg.get("PreprovisionedVMType", None) == PPSType.SAVABLE.value @@ -1416,16 +1405,6 @@ def _determine_pps_type(self, ovf_cfg: dict, imds_md: dict) -> PPSType: ) return pps_type - def _write_reprovision_marker(self): - """Write reprovision marker file in case system is rebooted.""" - LOG.info( - "Creating a marker file to poll imds: %s", REPROVISION_MARKER_FILE - ) - util.write_file( - REPROVISION_MARKER_FILE, - "{pid}: {time}\n".format(pid=os.getpid(), time=time()), - ) - @azure_ds_telemetry_reporter def _reprovision(self): """Initiate the reprovisioning workflow. @@ -1463,7 +1442,6 @@ def _determine_wireserver_pubkey_info( def _cleanup_markers(self): """Cleanup any marker files.""" util.del_file(REPORTED_READY_MARKER_FILE) - util.del_file(REPROVISION_MARKER_FILE) @azure_ds_telemetry_reporter def activate(self, cfg, is_new_instance): diff --git a/tests/unittests/sources/test_azure.py b/tests/unittests/sources/test_azure.py index 005b16b675c..b7dae873f28 100644 --- a/tests/unittests/sources/test_azure.py +++ b/tests/unittests/sources/test_azure.py @@ -287,16 +287,6 @@ def patched_reported_ready_marker_path(patched_markers_dir_path): yield reported_ready_marker -@pytest.fixture -def patched_reprovision_marker_path(patched_markers_dir_path): - reprovision_marker = patched_markers_dir_path / "poll_imds" - with mock.patch( - MOCKPATH + "REPROVISION_MARKER_FILE", - str(patched_reprovision_marker_path), - ): - yield reprovision_marker - - def construct_valid_ovf_env( data=None, pubkeys=None, userdata=None, platform_settings=None ): @@ -3006,7 +2996,9 @@ def test_determine_pps_with_reprovision_marker( azure_ds._determine_pps_type(ovf_cfg, imds_md) == dsaz.PPSType.UNKNOWN ) - assert is_file.mock_calls == [mock.call(dsaz.REPROVISION_MARKER_FILE)] + assert is_file.mock_calls == [ + mock.call(dsaz.REPORTED_READY_MARKER_FILE) + ] @mock.patch("os.path.isfile", return_value=False) @@ -4104,7 +4096,6 @@ def provisioning_setup( mock_util_mount_cb, mock_wrapping_setup_ephemeral_networking, patched_reported_ready_marker_path, - patched_reprovision_marker_path, ): self.azure_ds = azure_ds self.mock_azure_get_metadata_from_fabric = ( @@ -4135,7 +4126,6 @@ def provisioning_setup( self.patched_reported_ready_marker_path = ( patched_reported_ready_marker_path ) - self.patched_reprovision_marker_path = patched_reprovision_marker_path self.imds_md = { "extended": {"compute": {"ppsType": "None"}}, @@ -4408,6 +4398,76 @@ def test_savable_pps(self): mock.call.create_bound_netlink_socket().close(), ] + @pytest.mark.parametrize("pps_type", ["Savable", "Running", "None"]) + def test_recovery_pps(self, pps_type): + self.patched_reported_ready_marker_path.write_text("") + self.imds_md["extended"]["compute"]["ppsType"] = pps_type + ovf_data = {"HostName": "myhost", "UserName": "myuser"} + + self.mock_readurl.side_effect = [ + mock.MagicMock(contents=json.dumps(self.imds_md).encode()), + mock.MagicMock( + contents=construct_valid_ovf_env(data=ovf_data).encode() + ), + mock.MagicMock(contents=json.dumps(self.imds_md).encode()), + ] + self.mock_azure_get_metadata_from_fabric.return_value = [] + + self.azure_ds._get_data() + + assert self.mock_readurl.mock_calls == [ + mock.call( + "http://169.254.169.254/metadata/instance?" + "api-version=2021-08-01&extended=true", + timeout=2, + headers={"Metadata": "true"}, + retries=10, + exception_cb=dsaz.imds_readurl_exception_callback, + infinite=False, + ), + mock.call( + "http://169.254.169.254/metadata/reprovisiondata?" + "api-version=2019-06-01", + timeout=2, + headers={"Metadata": "true"}, + exception_cb=mock.ANY, + infinite=True, + log_req_resp=False, + ), + mock.call( + "http://169.254.169.254/metadata/instance?" + "api-version=2021-08-01&extended=true", + timeout=2, + headers={"Metadata": "true"}, + retries=10, + exception_cb=dsaz.imds_readurl_exception_callback, + infinite=False, + ), + ] + + # Verify DHCP is setup once. + assert self.mock_wrapping_setup_ephemeral_networking.mock_calls == [ + mock.call(timeout_minutes=20), + ] + assert self.mock_net_dhcp_maybe_perform_dhcp_discovery.mock_calls == [ + mock.call(None, dsaz.dhcp_log_cb), + ] + + # Verify IMDS metadata. + assert self.azure_ds.metadata["imds"] == self.imds_md + + # Verify reports ready once. + assert self.mock_azure_get_metadata_from_fabric.mock_calls == [ + mock.call( + endpoint="10.11.12.13", + iso_dev="/dev/sr0", + pubkey_info=None, + ), + ] + + # Verify no netlink operations for recovering PPS. + assert self.mock_netlink.mock_calls == [] + class TestValidateIMDSMetadata: @pytest.mark.parametrize(