From 5800d1de7d17b05ea3c42c560ee68d5b7c044b69 Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Thu, 14 May 2026 12:39:53 +0000 Subject: [PATCH 1/2] fix(ds-identify): parse multi-line YAML datasource_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-line YAML lists are valid YAML. The prior code assumed that multi-line datasource_list values could not be parsed, but both multi-line flow sequences and block sequences are standard YAML: # multi-line flow sequence datasource_list: [ Azure ] # block sequence datasource_list: - Azure - None The existing read_datasource_list() only handled single-line flow sequences (e.g. 'datasource_list: [Azure, None]'). When a config file contains a multi-line flow sequence, the parser saw only '[' without the closing ']', produced an empty dslist, and fell back to the full default datasource list. On Azure VMs this meant ds-identify checked every datasource unnecessarily. Add read_datasource_list_multiline() to handle both multi-line flow sequences and block sequences as a fallback when single-line parsing produces no result. Also tighten get_single_line_flow_sequence() to reject values that lack a closing ']', so incomplete flow sequences fall through to the multi-line parser correctly. An alternative approach would be to shell out to python3 for YAML parsing (e.g. python3 -c 'import yaml; ...'), since python3 is a hard dependency of cloud-init and is guaranteed to be present. This would handle every valid YAML construct correctly with zero edge cases and much less code. However, ds-identify is designed as a pure POSIX shell script for speed — it runs as a systemd generator early in boot where startup latency matters, and on systems where cloud-init should be disabled it avoids the cost of a python3 interpreter entirely. The shell-based approach preserves that property while covering the formats seen in practice. A possible follow-up is to combine the two approaches: prefer one (e.g. the python3-based parse for full YAML coverage) and fall back to the other when it is unavailable or fails. 
When datasource_list is present in config but cannot be parsed by any method, ds-identify now emits a distinct error indicating the key was found but unparsable. I think maybe this error needs to go to console or otherwise be detected by cloud-init later and escalated as part of status, etc. --- tests/unittests/test_ds_identify.py | 360 +++++++++++++++++++++++++--- tools/ds-identify | 138 ++++++++++- 2 files changed, 468 insertions(+), 30 deletions(-) diff --git a/tests/unittests/test_ds_identify.py b/tests/unittests/test_ds_identify.py index ba5aae5ebb7..365d05a8cea 100644 --- a/tests/unittests/test_ds_identify.py +++ b/tests/unittests/test_ds_identify.py @@ -82,8 +82,8 @@ # This will cause the set+update hostname module to not operate (if true) preserve_hostname: false -# If you use datasource_list array, keep array items in a single line. -# If you use multi line array, ds-identify script won't read array items. +# datasource_list supports single-line flow sequences, multi-line flow +# sequences, and block sequences (one item per line with '- ' prefix). # Example datasource config # datasource: # Ec2: @@ -537,16 +537,9 @@ def test_wb_print_variables(self, tmp_path): pytest.param("LXD-kvm-not-MAAS-1", True, id="mass_not_detected_1"), pytest.param("LXD-kvm-not-MAAS-2", True, id="mass_not_detected_2"), pytest.param("LXD-kvm-not-MAAS-3", True, id="mass_not_detected_3"), - # Don't detect incorrect config when invalid datasource_list - # provided - # - # If unparsable list is provided we just ignore it. Some users - # might assume that since the rest of the configuration is yaml - # that multi-line yaml lists are valid (they aren't). When this - # happens, just run ds-identify and figure it out for ourselves - # which platform to run. 
+ # Block-style YAML datasource_list is parsed correctly pytest.param( - "Azure-parse-invalid", True, id="azure_invalid_configuration" + "Azure-block-sequence", True, id="azure_block_sequence" ), # Azure datasource is detected from DMI chassis-asset-tag pytest.param( @@ -639,19 +632,11 @@ def test_wb_print_variables(self, tmp_path): pytest.param("ConfigDriveUpper", True, id="config_drive_upper"), # Config Drive seed directory. pytest.param("ConfigDrive-seed", True, id="config_drive_seed"), - # Multi-line yaml is unsupported + # Block-style multi-line YAML datasource_list is respected pytest.param( - "LXD-kvm-not-azure", + "LXD-kvm-block-sequence-azure", True, - marks=[ - pytest.mark.xfail( - reason=( - "not supported: yaml parser implemented in POSIX" - " shell" - ) - ) - ], - id="multiline_yaml", + id="multiline_yaml_block", ), # Template provisioned with user-data first boot. # @@ -944,6 +929,327 @@ def test_flow_sequence(self, tmp_path): data, str(tmp_path), RC_FOUND, dslist=[data.get("ds")] ) + @pytest.mark.parametrize( + "cfg_content,expected_dslist,rc", + [ + # === Single-line flow sequences === + pytest.param( + "datasource_list: [Azure]\n", + ["Azure"], + RC_FOUND, + id="single_line_one_entry", + ), + pytest.param( + "datasource_list: [Azure, None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_two_entries", + ), + pytest.param( + "datasource_list: [ Azure , None ]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_extra_spaces", + ), + pytest.param( + "datasource_list: ['Azure', 'None']\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_single_quoted", + ), + pytest.param( + 'datasource_list: ["Azure"]\n', + ["Azure"], + RC_FOUND, + id="single_line_double_quoted", + ), + pytest.param( + "datasource_list: [Azure,None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_no_space_after_comma", + ), + pytest.param( + "datasource_list: [Azure ,None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_space_before_comma", + ), + 
pytest.param( + "datasource_list:[Azure]\n", + ["Azure"], + RC_FOUND, + id="single_line_no_space_after_colon", + ), + # === Multi-line flow sequences === + # dpkg-reconfigure cloud-init generates this format + pytest.param( + "datasource_list: [\n" + " Azure\n" + " ]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_dpkg_style", + ), + pytest.param( + "datasource_list: [\n" " Azure,\n" " None\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_two_entries", + ), + pytest.param( + "datasource_list: [\n" " Azure,\n" " None\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_with_none", + ), + pytest.param( + "datasource_list: [\n" " Azure,\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_trailing_comma", + ), + pytest.param( + "datasource_list: [\n" " Azure\n" " ]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_indented_bracket", + ), + # first entry on the same line as opening bracket + pytest.param( + "datasource_list: [Azure,\n" " None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_first_on_bracket_line", + ), + pytest.param( + "datasource_list: [\n" "Azure,\n" "None\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_no_indent", + ), + pytest.param( + "datasource_list: [\n" + " Azure , \n" + " None \n" + " ]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_irregular_spacing", + ), + pytest.param( + "datasource_list: [\n" " Azure, None\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_multiple_on_one_line", + ), + pytest.param( + "datasource_list: [\n" " Azure, None,\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_multiple_on_one_line_trailing_comma", + ), + # === Block sequences === + pytest.param( + "datasource_list:\n" " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_one_entry", + ), + pytest.param( + "datasource_list:\n" " - Azure\n" " - None\n", + ["Azure", DS_NONE], + RC_FOUND, + id="block_two_entries", + ), + pytest.param( + "datasource_list:\n" " - 
Azure\n", + ["Azure"], + RC_FOUND, + id="block_one_space_indent", + ), + pytest.param( + "datasource_list:\n" " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_four_space_indent", + ), + pytest.param( + "datasource_list:\n" "\t- Azure\n", + ["Azure"], + RC_FOUND, + id="block_tab_indent", + ), + pytest.param( + "datasource_list:\n" " - Azure\n" " - None\n", + ["Azure", DS_NONE], + RC_FOUND, + id="block_extra_spaces_after_dash", + ), + pytest.param( + "datasource_list:\n" " - 'Azure'\n", + ["Azure"], + RC_FOUND, + id="block_single_quoted", + ), + pytest.param( + "datasource_list:\n" ' - "Azure"\n', + ["Azure"], + RC_FOUND, + id="block_double_quoted", + ), + pytest.param( + "datasource_list:\n" " - Azure \n", + ["Azure"], + RC_FOUND, + id="block_trailing_whitespace", + ), + # === With surrounding config === + pytest.param( + "# a comment\n" + "manage_etc_hosts: true\n" + "datasource_list: [\n" + " Azure\n" + " ]\n" + "locale: en_US.UTF-8\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_with_surrounding_config", + ), + pytest.param( + "manage_etc_hosts: true\n" + "datasource_list:\n" + " - Azure\n" + "locale: en_US.UTF-8\n", + ["Azure"], + RC_FOUND, + id="block_with_surrounding_config", + ), + pytest.param( + "# datasource_list: [GCE]\n" + "datasource_list:\n" + " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_with_commented_out_line_above", + ), + # === Inline comments === + pytest.param( + "datasource_list: [\n" + " Azure, # cloud provider\n" + " None\n" + "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_inline_comment", + ), + pytest.param( + "datasource_list:\n" " - Azure # cloud provider\n", + ["Azure"], + RC_FOUND, + id="block_inline_comment", + ), + # === Quoted keys with multiline === + pytest.param( + '"datasource_list": [\n' " Azure\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_double_quoted_key", + ), + pytest.param( + "'datasource_list': [\n" " Azure\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_single_quoted_key", + ), 
+ pytest.param( + '"datasource_list":\n' " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_double_quoted_key", + ), + # === Quoted values in multiline flow === + pytest.param( + "datasource_list: [\n" " 'Azure'\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_single_quoted_value", + ), + pytest.param( + "datasource_list: [\n" ' "Azure"\n' "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_double_quoted_value", + ), + # === No trailing newline at EOF === + pytest.param( + "datasource_list:\n" " - Azure", + ["Azure"], + RC_FOUND, + id="block_no_trailing_newline", + ), + pytest.param( + "datasource_list: [\n" " Azure\n" "]", + ["Azure"], + RC_FOUND, + id="multiline_flow_no_trailing_newline", + ), + # === Blank lines === + pytest.param( + "datasource_list:\n" "\n" " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_blank_line_before_items", + ), + # blank line between block items is skipped; both are captured + pytest.param( + "datasource_list:\n" " - Azure\n" "\n" " - None\n", + ["Azure", DS_NONE], + RC_FOUND, + id="block_blank_line_between_items", + ), + pytest.param( + "datasource_list: [\n" "\n" " Azure\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_blank_line_inside", + ), + # === Block terminated by next key === + pytest.param( + "datasource_list:\n" + " - Azure\n" + " - None\n" + "manage_etc_hosts: true\n", + ["Azure", DS_NONE], + RC_FOUND, + id="block_terminated_by_next_key", + ), + # === Closing bracket on same line as last value === + pytest.param( + "datasource_list: [\n" " Azure]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_close_bracket_same_line", + ), + pytest.param( + "datasource_list: [\n" " Azure, None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_two_entries_close_bracket_same_line", + ), + ], + ) + def test_datasource_list_formats( + self, cfg_content, expected_dslist, rc, tmp_path + ): + """Verify ds-identify parses various datasource_list formats.""" + mydata = copy.deepcopy(VALID_CFG["Azure-dmi-detection"]) + 
cfgpath = "etc/cloud/cloud.cfg.d/90_dpkg.cfg" + mydata["files"][cfgpath] = cfg_content + self._check_via_dict( + mydata, str(tmp_path), rc=rc, dslist=expected_dslist + ) + def test_config_drive_interacts_with_ibmcloud_config_disk(self, tmp_path): """Verify ConfigDrive interaction with IBMCloud. @@ -1643,12 +1949,12 @@ def _print_run_output(rc, out, err, cfg, files): }, }, "CloudCIX": {"ds": "CloudCIX", "files": {P_PRODUCT_NAME: "CloudCIX\n"}}, - "Azure-parse-invalid": { + "Azure-block-sequence": { "ds": "Azure", "files": { P_CHASSIS_ASSET_TAG: "7783-7084-3265-9085-8269-3286-77\n", "etc/cloud/cloud.cfg.d/91-azure_datasource.cfg": ( - "datasource_list:\n - Azure" + "datasource_list:\n - Azure\n - None" ), }, }, @@ -1873,11 +2179,11 @@ def _print_run_output(rc, out, err, cfg, files): "no_mocks": ["dscheck_LXD"], # Don't default mock dscheck_LXD "files": {"etc/cloud/cloud.cfg": dedent("datasource_list: [None]")}, }, - "LXD-kvm-not-azure": { + "LXD-kvm-block-sequence-azure": { "ds": "Azure", "files": { - "etc/cloud/cloud.cfg.d/92-broken-azure.cfg": ( - "datasource_list:\n - Azure" + "etc/cloud/cloud.cfg.d/92-azure.cfg": ( + "datasource_list:\n - Azure\n - None" ), }, # /dev/lxd/sock does exist and KVM virt-type diff --git a/tools/ds-identify b/tools/ds-identify index 82f6b036bab..2cb5d5570cb 100755 --- a/tools/ds-identify +++ b/tools/ds-identify @@ -641,9 +641,118 @@ parse_yaml_array() { _RET="${ret# }" } +read_datasource_list_multiline() { + # Parse multi-line datasource_list from config files. + # Handles both multi-line flow sequences and block sequences: + # + # Multi-line flow sequence: + # datasource_list: [ + # Azure + # ] + # + # Block sequence: + # datasource_list: + # - NoCloud + # - ConfigDrive + # + # NOTE: A more reliable approach would be to invoke python3 for YAML + # parsing with a shell fallback, since python3 is a hard dependency of + # cloud-init. That would handle all valid YAML correctly with no edge + # cases. 
This shell implementation covers the formats seen in practice. + # + # Sets _RET to space-separated list of values. + # Sets _RET_fname to the file where it was found. + local files="" f="" line="" val="" mode="" dslist="" found_file="" + local flow_content="" + files="${PATH_ETC_CI_CFG} ${PATH_ETC_CI_CFG_D}/*.cfg" + # shellcheck disable=2086 + { set +f; set -- $files; set -f; } + [ "$1" = "$files" ] && [ ! -f "$1" ] && return 1 + + for f in "$@"; do + [ -f "$f" ] || continue + mode="" + while IFS= read -r line || [ -n "$line" ]; do + line="${line%%#*}" + if [ "$mode" = "flow" ]; then + case "$line" in + *"]"*) + # found closing bracket + val="${line%%"]"*}" + trim "$val" + [ -n "$_RET" ] && + flow_content="${flow_content},${_RET}" + parse_yaml_array "$flow_content" + dslist="$_RET" + mode="" + ;; + *) + trim "$line" + [ -n "$_RET" ] && + flow_content="${flow_content},${_RET}" + ;; + esac + continue + elif [ "$mode" = "block" ]; then + trim "$line" + case "$_RET" in + "- "*) + val="${_RET#- }" + trim "$val" + unquote "$_RET" + dslist="${dslist} $_RET" + continue + ;; + "") continue;; + *) mode="";; + esac + fi + case "$line" in + *datasource_list*) + trim "${line%%:*}" + unquote "$_RET" + case "$_RET" in + datasource_list) + val="${line#*:}" + trim "$val" + case "$_RET" in + "["*"]"*) + # single-line flow sequence + # already handled by caller + ;; + "["*) + # multi-line flow sequence + mode="flow" + found_file="$f" + dslist="" + flow_content="${_RET#"["}" + ;; + "") + # block sequence + mode="block" + found_file="$f" + dslist="" + ;; + esac + ;; + esac + ;; + esac + done < "$f" + done + + dslist="${dslist# }" + if [ -n "$dslist" ]; then + _RET="$dslist" + _RET_fname="$found_file" + return 0 + fi + return 1 +} + read_datasource_list() { cached "$DI_DSLIST" && return - local dslist="" key="datasource_list" + local dslist="" key="datasource_list" key_found=false # if DI_DSNAME is set directly, then avoid parsing config. 
if [ -n "${DI_DSNAME}" ]; then dslist="${DI_DSNAME}" @@ -666,9 +775,23 @@ read_datasource_list() { parse_yaml_array "$_RET" dslist=${_RET} fi + # Track whether the key was present in config at all + if [ -z "$dslist" ] && check_config "$key"; then + key_found=true + fi + # Try multi-line formats (flow sequence or block sequence) + if [ -z "$dslist" ] && read_datasource_list_multiline; then + debug 1 "$_RET_fname set datasource_list (multiline): $_RET" + dslist=${_RET} + fi if [ -z "$dslist" ]; then + if [ "$key_found" = "true" ]; then + error "datasource_list found in config but could not be parsed," \ + "using default: $DI_DSLIST_DEFAULT" + else + warn "no datasource_list found, using default: $DI_DSLIST_DEFAULT" + fi dslist=${DI_DSLIST_DEFAULT} - warn "no datasource_list found, using default: $dslist" fi DI_DSLIST=$dslist return 0 @@ -962,6 +1085,15 @@ get_single_line_flow_sequence() { ret="$_RET" tmp="$_RET" + # reject values that don't contain both [ and ] + # (multi-line flow sequences are handled elsewhere) + case "$tmp" in + *"["*"]"*) ;; + *) + debug 1 "key $1 has no single line flow sequence" + return 1;; + esac + # remove smallest ] suffix tmp="${tmp%]}" @@ -976,7 +1108,7 @@ get_single_line_flow_sequence() { if [ "${#_RET}" -ne 0 ]; then return 0 fi - debug 1 "key $key didn't have a valid single line flow sequence" + debug 1 "key $1 didn't have a valid single line flow sequence" return 1 } From 7fd93a5bb3bf9ba22efa63e4b5a0f3eaba66ede4 Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Thu, 14 May 2026 13:40:59 +0000 Subject: [PATCH 2/2] disable some string lints for readability --- tests/unittests/test_ds_identify.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unittests/test_ds_identify.py b/tests/unittests/test_ds_identify.py index 365d05a8cea..e018cbf06d9 100644 --- a/tests/unittests/test_ds_identify.py +++ b/tests/unittests/test_ds_identify.py @@ -1,4 +1,6 @@ # This file is part of cloud-init. See LICENSE file for license information. 
+# ruff: noqa: ISC001 +# pylint: disable=implicit-str-concat import copy import os