From 5800d1de7d17b05ea3c42c560ee68d5b7c044b69 Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Thu, 14 May 2026 12:39:53 +0000 Subject: [PATCH 1/2] fix(ds-identify): parse multi-line YAML datasource_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-line YAML lists are valid YAML. The prior code assumed that multi-line datasource_list values could not be parsed, but both multi-line flow sequences and block sequences are standard YAML: # multi-line flow sequence datasource_list: [ Azure ] # block sequence datasource_list: - Azure - None The existing read_datasource_list() only handled single-line flow sequences (e.g. 'datasource_list: [Azure, None]'). When a config file contains a multi-line flow sequence, the parser saw only '[' without the closing ']', produced an empty dslist, and fell back to the full default datasource list. On Azure VMs this meant ds-identify checked every datasource unnecessarily. Add read_datasource_list_multiline() to handle both multi-line flow sequences and block sequences as a fallback when single-line parsing produces no result. Also tighten get_single_line_flow_sequence() to reject values that lack a closing ']', so incomplete flow sequences fall through to the multi-line parser correctly. An alternative approach would be to shell out to python3 for YAML parsing (e.g. python3 -c 'import yaml; ...'), since python3 is a hard dependency of cloud-init and is guaranteed to be present. This would handle every valid YAML construct correctly with zero edge cases and much less code. However, ds-identify is designed as a pure POSIX shell script for speed — it runs as a systemd generator early in boot where startup latency matters, and on systems where cloud-init should be disabled it avoids the cost of a python3 interpreter entirely. The shell-based approach preserves that property while covering the formats seen in practice. A possible follow-up is to combine the two approaches: prefer one (e.g. the python3-based parse for full YAML coverage) and fall back to the other when it is unavailable or fails. 
When datasource_list is present in config but cannot be parsed by any method, ds-identify now emits a distinct error indicating the key was found but unparsable. I think maybe this error needs to go to console or otherwise be detected by cloud-init later and escalated as part of status, etc. --- tests/unittests/test_ds_identify.py | 360 +++++++++++++++++++++++++--- tools/ds-identify | 138 ++++++++++- 2 files changed, 468 insertions(+), 30 deletions(-) diff --git a/tests/unittests/test_ds_identify.py b/tests/unittests/test_ds_identify.py index ba5aae5ebb7..365d05a8cea 100644 --- a/tests/unittests/test_ds_identify.py +++ b/tests/unittests/test_ds_identify.py @@ -82,8 +82,8 @@ # This will cause the set+update hostname module to not operate (if true) preserve_hostname: false -# If you use datasource_list array, keep array items in a single line. -# If you use multi line array, ds-identify script won't read array items. +# datasource_list supports single-line flow sequences, multi-line flow +# sequences, and block sequences (one item per line with '- ' prefix). # Example datasource config # datasource: # Ec2: @@ -537,16 +537,9 @@ def test_wb_print_variables(self, tmp_path): pytest.param("LXD-kvm-not-MAAS-1", True, id="mass_not_detected_1"), pytest.param("LXD-kvm-not-MAAS-2", True, id="mass_not_detected_2"), pytest.param("LXD-kvm-not-MAAS-3", True, id="mass_not_detected_3"), - # Don't detect incorrect config when invalid datasource_list - # provided - # - # If unparsable list is provided we just ignore it. Some users - # might assume that since the rest of the configuration is yaml - # that multi-line yaml lists are valid (they aren't). When this - # happens, just run ds-identify and figure it out for ourselves - # which platform to run. 
+ # Block-style YAML datasource_list is parsed correctly pytest.param( - "Azure-parse-invalid", True, id="azure_invalid_configuration" + "Azure-block-sequence", True, id="azure_block_sequence" ), # Azure datasource is detected from DMI chassis-asset-tag pytest.param( @@ -639,19 +632,11 @@ def test_wb_print_variables(self, tmp_path): pytest.param("ConfigDriveUpper", True, id="config_drive_upper"), # Config Drive seed directory. pytest.param("ConfigDrive-seed", True, id="config_drive_seed"), - # Multi-line yaml is unsupported + # Block-style multi-line YAML datasource_list is respected pytest.param( - "LXD-kvm-not-azure", + "LXD-kvm-block-sequence-azure", True, - marks=[ - pytest.mark.xfail( - reason=( - "not supported: yaml parser implemented in POSIX" - " shell" - ) - ) - ], - id="multiline_yaml", + id="multiline_yaml_block", ), # Template provisioned with user-data first boot. # @@ -944,6 +929,327 @@ def test_flow_sequence(self, tmp_path): data, str(tmp_path), RC_FOUND, dslist=[data.get("ds")] ) + @pytest.mark.parametrize( + "cfg_content,expected_dslist,rc", + [ + # === Single-line flow sequences === + pytest.param( + "datasource_list: [Azure]\n", + ["Azure"], + RC_FOUND, + id="single_line_one_entry", + ), + pytest.param( + "datasource_list: [Azure, None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_two_entries", + ), + pytest.param( + "datasource_list: [ Azure , None ]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_extra_spaces", + ), + pytest.param( + "datasource_list: ['Azure', 'None']\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_single_quoted", + ), + pytest.param( + 'datasource_list: ["Azure"]\n', + ["Azure"], + RC_FOUND, + id="single_line_double_quoted", + ), + pytest.param( + "datasource_list: [Azure,None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_no_space_after_comma", + ), + pytest.param( + "datasource_list: [Azure ,None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="single_line_space_before_comma", + ), + 
pytest.param( + "datasource_list:[Azure]\n", + ["Azure"], + RC_FOUND, + id="single_line_no_space_after_colon", + ), + # === Multi-line flow sequences === + # dpkg-reconfigure cloud-init generates this format + pytest.param( + "datasource_list: [\n" + " Azure\n" + " ]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_dpkg_style", + ), + pytest.param( + "datasource_list: [\n" " Azure,\n" " None\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_two_entries", + ), + pytest.param( + "datasource_list: [\n" " Azure,\n" " None\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_with_none", + ), + pytest.param( + "datasource_list: [\n" " Azure,\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_trailing_comma", + ), + pytest.param( + "datasource_list: [\n" " Azure\n" " ]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_indented_bracket", + ), + # first entry on the same line as opening bracket + pytest.param( + "datasource_list: [Azure,\n" " None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_first_on_bracket_line", + ), + pytest.param( + "datasource_list: [\n" "Azure,\n" "None\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_no_indent", + ), + pytest.param( + "datasource_list: [\n" + " Azure , \n" + " None \n" + " ]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_irregular_spacing", + ), + pytest.param( + "datasource_list: [\n" " Azure, None\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_multiple_on_one_line", + ), + pytest.param( + "datasource_list: [\n" " Azure, None,\n" "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_multiple_on_one_line_trailing_comma", + ), + # === Block sequences === + pytest.param( + "datasource_list:\n" " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_one_entry", + ), + pytest.param( + "datasource_list:\n" " - Azure\n" " - None\n", + ["Azure", DS_NONE], + RC_FOUND, + id="block_two_entries", + ), + pytest.param( + "datasource_list:\n" " - 
Azure\n", + ["Azure"], + RC_FOUND, + id="block_one_space_indent", + ), + pytest.param( + "datasource_list:\n" " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_four_space_indent", + ), + pytest.param( + "datasource_list:\n" "\t- Azure\n", + ["Azure"], + RC_FOUND, + id="block_tab_indent", + ), + pytest.param( + "datasource_list:\n" " - Azure\n" " - None\n", + ["Azure", DS_NONE], + RC_FOUND, + id="block_extra_spaces_after_dash", + ), + pytest.param( + "datasource_list:\n" " - 'Azure'\n", + ["Azure"], + RC_FOUND, + id="block_single_quoted", + ), + pytest.param( + "datasource_list:\n" ' - "Azure"\n', + ["Azure"], + RC_FOUND, + id="block_double_quoted", + ), + pytest.param( + "datasource_list:\n" " - Azure \n", + ["Azure"], + RC_FOUND, + id="block_trailing_whitespace", + ), + # === With surrounding config === + pytest.param( + "# a comment\n" + "manage_etc_hosts: true\n" + "datasource_list: [\n" + " Azure\n" + " ]\n" + "locale: en_US.UTF-8\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_with_surrounding_config", + ), + pytest.param( + "manage_etc_hosts: true\n" + "datasource_list:\n" + " - Azure\n" + "locale: en_US.UTF-8\n", + ["Azure"], + RC_FOUND, + id="block_with_surrounding_config", + ), + pytest.param( + "# datasource_list: [GCE]\n" + "datasource_list:\n" + " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_with_commented_out_line_above", + ), + # === Inline comments === + pytest.param( + "datasource_list: [\n" + " Azure, # cloud provider\n" + " None\n" + "]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_inline_comment", + ), + pytest.param( + "datasource_list:\n" " - Azure # cloud provider\n", + ["Azure"], + RC_FOUND, + id="block_inline_comment", + ), + # === Quoted keys with multiline === + pytest.param( + '"datasource_list": [\n' " Azure\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_double_quoted_key", + ), + pytest.param( + "'datasource_list': [\n" " Azure\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_single_quoted_key", + ), 
+ pytest.param( + '"datasource_list":\n' " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_double_quoted_key", + ), + # === Quoted values in multiline flow === + pytest.param( + "datasource_list: [\n" " 'Azure'\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_single_quoted_value", + ), + pytest.param( + "datasource_list: [\n" ' "Azure"\n' "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_double_quoted_value", + ), + # === No trailing newline at EOF === + pytest.param( + "datasource_list:\n" " - Azure", + ["Azure"], + RC_FOUND, + id="block_no_trailing_newline", + ), + pytest.param( + "datasource_list: [\n" " Azure\n" "]", + ["Azure"], + RC_FOUND, + id="multiline_flow_no_trailing_newline", + ), + # === Blank lines === + pytest.param( + "datasource_list:\n" "\n" " - Azure\n", + ["Azure"], + RC_FOUND, + id="block_blank_line_before_items", + ), + # blank line between block items is skipped; both are captured + pytest.param( + "datasource_list:\n" " - Azure\n" "\n" " - None\n", + ["Azure", DS_NONE], + RC_FOUND, + id="block_blank_line_between_items", + ), + pytest.param( + "datasource_list: [\n" "\n" " Azure\n" "]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_blank_line_inside", + ), + # === Block terminated by next key === + pytest.param( + "datasource_list:\n" + " - Azure\n" + " - None\n" + "manage_etc_hosts: true\n", + ["Azure", DS_NONE], + RC_FOUND, + id="block_terminated_by_next_key", + ), + # === Closing bracket on same line as last value === + pytest.param( + "datasource_list: [\n" " Azure]\n", + ["Azure"], + RC_FOUND, + id="multiline_flow_close_bracket_same_line", + ), + pytest.param( + "datasource_list: [\n" " Azure, None]\n", + ["Azure", DS_NONE], + RC_FOUND, + id="multiline_flow_two_entries_close_bracket_same_line", + ), + ], + ) + def test_datasource_list_formats( + self, cfg_content, expected_dslist, rc, tmp_path + ): + """Verify ds-identify parses various datasource_list formats.""" + mydata = copy.deepcopy(VALID_CFG["Azure-dmi-detection"]) + 
cfgpath = "etc/cloud/cloud.cfg.d/90_dpkg.cfg" + mydata["files"][cfgpath] = cfg_content + self._check_via_dict( + mydata, str(tmp_path), rc=rc, dslist=expected_dslist + ) + def test_config_drive_interacts_with_ibmcloud_config_disk(self, tmp_path): """Verify ConfigDrive interaction with IBMCloud. @@ -1643,12 +1949,12 @@ def _print_run_output(rc, out, err, cfg, files): }, }, "CloudCIX": {"ds": "CloudCIX", "files": {P_PRODUCT_NAME: "CloudCIX\n"}}, - "Azure-parse-invalid": { + "Azure-block-sequence": { "ds": "Azure", "files": { P_CHASSIS_ASSET_TAG: "7783-7084-3265-9085-8269-3286-77\n", "etc/cloud/cloud.cfg.d/91-azure_datasource.cfg": ( - "datasource_list:\n - Azure" + "datasource_list:\n - Azure\n - None" ), }, }, @@ -1873,11 +2179,11 @@ def _print_run_output(rc, out, err, cfg, files): "no_mocks": ["dscheck_LXD"], # Don't default mock dscheck_LXD "files": {"etc/cloud/cloud.cfg": dedent("datasource_list: [None]")}, }, - "LXD-kvm-not-azure": { + "LXD-kvm-block-sequence-azure": { "ds": "Azure", "files": { - "etc/cloud/cloud.cfg.d/92-broken-azure.cfg": ( - "datasource_list:\n - Azure" + "etc/cloud/cloud.cfg.d/92-azure.cfg": ( + "datasource_list:\n - Azure\n - None" ), }, # /dev/lxd/sock does exist and KVM virt-type diff --git a/tools/ds-identify b/tools/ds-identify index 82f6b036bab..2cb5d5570cb 100755 --- a/tools/ds-identify +++ b/tools/ds-identify @@ -641,9 +641,118 @@ parse_yaml_array() { _RET="${ret# }" } +read_datasource_list_multiline() { + # Parse multi-line datasource_list from config files. + # Handles both multi-line flow sequences and block sequences: + # + # Multi-line flow sequence: + # datasource_list: [ + # Azure + # ] + # + # Block sequence: + # datasource_list: + # - NoCloud + # - ConfigDrive + # + # NOTE: A more reliable approach would be to invoke python3 for YAML + # parsing with a shell fallback, since python3 is a hard dependency of + # cloud-init. That would handle all valid YAML correctly with no edge + # cases. 
This shell implementation covers the formats seen in practice. + # + # Sets _RET to space-separated list of values. + # Sets _RET_fname to the file where it was found. + local files="" f="" line="" val="" mode="" dslist="" found_file="" + local flow_content="" + files="${PATH_ETC_CI_CFG} ${PATH_ETC_CI_CFG_D}/*.cfg" + # shellcheck disable=2086 + { set +f; set -- $files; set -f; } + [ "$1" = "$files" ] && [ ! -f "$1" ] && return 1 + + for f in "$@"; do + [ -f "$f" ] || continue + mode="" + while IFS= read -r line || [ -n "$line" ]; do + line="${line%%#*}" + if [ "$mode" = "flow" ]; then + case "$line" in + *"]"*) + # found closing bracket + val="${line%%"]"*}" + trim "$val" + [ -n "$_RET" ] && + flow_content="${flow_content},${_RET}" + parse_yaml_array "$flow_content" + dslist="$_RET" + mode="" + ;; + *) + trim "$line" + [ -n "$_RET" ] && + flow_content="${flow_content},${_RET}" + ;; + esac + continue + elif [ "$mode" = "block" ]; then + trim "$line" + case "$_RET" in + "- "*) + val="${_RET#- }" + trim "$val" + unquote "$_RET" + dslist="${dslist} $_RET" + continue + ;; + "") continue;; + *) mode="";; + esac + fi + case "$line" in + *datasource_list*) + trim "${line%%:*}" + unquote "$_RET" + case "$_RET" in + datasource_list) + val="${line#*:}" + trim "$val" + case "$_RET" in + "["*"]"*) + # single-line flow sequence + # already handled by caller + ;; + "["*) + # multi-line flow sequence + mode="flow" + found_file="$f" + dslist="" + flow_content="${_RET#"["}" + ;; + "") + # block sequence + mode="block" + found_file="$f" + dslist="" + ;; + esac + ;; + esac + ;; + esac + done < "$f" + done + + dslist="${dslist# }" + if [ -n "$dslist" ]; then + _RET="$dslist" + _RET_fname="$found_file" + return 0 + fi + return 1 +} + read_datasource_list() { cached "$DI_DSLIST" && return - local dslist="" key="datasource_list" + local dslist="" key="datasource_list" key_found=false # if DI_DSNAME is set directly, then avoid parsing config. 
if [ -n "${DI_DSNAME}" ]; then dslist="${DI_DSNAME}" @@ -666,9 +775,23 @@ read_datasource_list() { parse_yaml_array "$_RET" dslist=${_RET} fi + # Track whether the key was present in config at all + if [ -z "$dslist" ] && check_config "$key"; then + key_found=true + fi + # Try multi-line formats (flow sequence or block sequence) + if [ -z "$dslist" ] && read_datasource_list_multiline; then + debug 1 "$_RET_fname set datasource_list (multiline): $_RET" + dslist=${_RET} + fi if [ -z "$dslist" ]; then + if [ "$key_found" = "true" ]; then + error "datasource_list found in config but could not be parsed," \ + "using default: $DI_DSLIST_DEFAULT" + else + warn "no datasource_list found, using default: $DI_DSLIST_DEFAULT" + fi dslist=${DI_DSLIST_DEFAULT} - warn "no datasource_list found, using default: $dslist" fi DI_DSLIST=$dslist return 0 @@ -962,6 +1085,15 @@ get_single_line_flow_sequence() { ret="$_RET" tmp="$_RET" + # reject values that don't contain both [ and ] + # (multi-line flow sequences are handled elsewhere) + case "$tmp" in + *"["*"]"*) ;; + *) + debug 1 "key $1 has no single line flow sequence" + return 1;; + esac + # remove smallest ] suffix tmp="${tmp%]}" @@ -976,7 +1108,7 @@ get_single_line_flow_sequence() { if [ "${#_RET}" -ne 0 ]; then return 0 fi - debug 1 "key $key didn't have a valid single line flow sequence" + debug 1 "key $1 didn't have a valid single line flow sequence" return 1 } From 7fd93a5bb3bf9ba22efa63e4b5a0f3eaba66ede4 Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Thu, 14 May 2026 13:40:59 +0000 Subject: [PATCH 2/2] disable some string lints for readability --- tests/unittests/test_ds_identify.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unittests/test_ds_identify.py b/tests/unittests/test_ds_identify.py index 365d05a8cea..e018cbf06d9 100644 --- a/tests/unittests/test_ds_identify.py +++ b/tests/unittests/test_ds_identify.py @@ -1,4 +1,6 @@ # This file is part of cloud-init. See LICENSE file for license information. 
+# ruff: noqa: ISC001 +# pylint: disable=implicit-str-concat import copy import os