From e50df317986ee18e04781dcf2cdaffb27f2a94a3 Mon Sep 17 00:00:00 2001 From: Enrique Vallespi Gil Date: Wed, 15 Apr 2026 11:28:07 +0200 Subject: [PATCH 1/2] [os_must_gather] Add exit rc to os-must-gather main task This allows the rescue block to run. Previously, any timeout in the main task would return the exit code of the echo, which was always success. Also, we're removing the always block from the rescue block. And we've created an always section for finding existing os-must-gather directories and creating the symlink. Also, we've changed the dest-dir for the generic fallback command to match the folder of the symlink. Removed oc inspect, as we're not getting many errors from oc adm must-gather, so it probably wouldn't be useful. Finally, we've parameterized SOS_EDPM as cifmw_os_must_gather_sos_edpm and set its default value to "all". Signed-off-by: Enrique Vallespi Gil --- roles/os_must_gather/README.md | 1 + roles/os_must_gather/defaults/main.yml | 1 + roles/os_must_gather/tasks/main.yml | 78 ++++++++++---------------- 3 files changed, 32 insertions(+), 48 deletions(-) diff --git a/roles/os_must_gather/README.md b/roles/os_must_gather/README.md index 92cc182c5..0e891c1cd 100644 --- a/roles/os_must_gather/README.md +++ b/roles/os_must_gather/README.md @@ -12,6 +12,7 @@ testing the new changes. * `cifmw_os_must_gather_output_dir`: (String) Directory to store logs generated by must-gather tool * `cifmw_os_must_gather_repo_path`: (string) Path to local clone of openstack-must-gather git repo * `cifmw_os_must_gather_timeout`: (String) Timeout for must-gather command +* `cifmw_os_must_gather_sos_edpm`: (String) Indicates where to run the SOS report. Defaults to `all`. * `cifmw_os_must_gather_host_network`: (Bool) Flag to gather host network data * `cifmw_os_must_gather_namespaces`: (List) List of namespaces required by the gather task in case of failure * `cifmw_os_must_gather_additional_namespaces`: (String) List of comma separated additional namespaces. 
Defaults to `kuttl,openshift-storage,sushy-emulator` diff --git a/roles/os_must_gather/defaults/main.yml b/roles/os_must_gather/defaults/main.yml index 6bd0d8167..70211fa0e 100644 --- a/roles/os_must_gather/defaults/main.yml +++ b/roles/os_must_gather/defaults/main.yml @@ -23,6 +23,7 @@ cifmw_os_must_gather_image_registry: "quay.rdoproject.org/openstack-k8s-operator cifmw_os_must_gather_output_dir: "{{ cifmw_basedir }}" cifmw_os_must_gather_output_log_dir: "{{ cifmw_os_must_gather_output_dir }}/logs/openstack-must-gather" cifmw_os_must_gather_repo_path: "{{ ansible_user_dir }}/src/github.com/openstack-k8s-operators/openstack-must-gather" +cifmw_os_must_gather_sos_edpm: "all" cifmw_os_must_gather_timeout: "30m" cifmw_os_must_gather_volume_percentage: 80 cifmw_os_must_gather_additional_namespaces: "kuttl,openshift-storage,openshift-marketplace,openshift-operators,sushy-emulator,tobiko" diff --git a/roles/os_must_gather/tasks/main.yml b/roles/os_must_gather/tasks/main.yml index 72fce2b80..d08fc02ae 100644 --- a/roles/os_must_gather/tasks/main.yml +++ b/roles/os_must_gather/tasks/main.yml @@ -61,7 +61,7 @@ environment: KUBECONFIG: "{{ cifmw_openshift_kubeconfig | default(cifmw_os_must_gather_kubeconfig) }}" PATH: "{{ cifmw_path }}" - SOS_EDPM: "all" + SOS_EDPM: "{{ cifmw_os_must_gather_sos_edpm }}" SOS_DECOMPRESS: "0" OPENSTACK_DATABASES: "{{ cifmw_os_must_gather_dump_db }}" OMC: "{{ cifmw_os_must_gather_omc }}" @@ -86,8 +86,31 @@ echo "The must gather command did not finish on time!" echo "{{ shell_cmd_timeout }} seconds was not enough to finish the task." 
fi + exit $rc } + register: _must_gather_result + rescue: + - name: Log openstack-must-gather failure + ansible.builtin.debug: + msg: "OpenStack must-gather failed, running fallback generic must-gather if timeout" + + - name: Run fallback generic must-gather command without SOS report when timed out + when: + - _must_gather_result is defined + - _must_gather_result.rc == 124 + environment: + KUBECONFIG: "{{ cifmw_openshift_kubeconfig | default(cifmw_os_must_gather_kubeconfig) }}" + PATH: "{{ cifmw_path }}" + ansible.builtin.command: + cmd: >- + timeout {{ (cifmw_os_must_gather_timeout | community.general.to_seconds) + 120 }} + oc adm must-gather + --dest-dir {{ cifmw_os_must_gather_output_log_dir }} + --timeout {{ cifmw_os_must_gather_timeout }} + --volume-percentage={{ cifmw_os_must_gather_volume_percentage }} + + always: - name: Find existing os-must-gather directories ansible.builtin.find: paths: "{{ cifmw_os_must_gather_output_log_dir }}" @@ -95,52 +118,11 @@ depth: 1 register: _os_gather_latest_dir - - name: Create a symlink to newest os-must-gather directory - ansible.builtin.file: - src: "{{ (_os_gather_latest_dir.files | sort(attribute='mtime', reverse=True) | first).path | basename }}" - dest: "{{ cifmw_os_must_gather_output_log_dir }}/latest" - state: link - - rescue: - - name: Openstack-must-gather failure + - name: Symlink to newest log folder and run top commands + when: _os_gather_latest_dir.files | length > 0 block: - - name: Log openstack-must-gather failure - ansible.builtin.debug: - msg: "OpenStack must-gather failed, running fallback generic must-gather" - - - name: Run fallback generic must-gather command - environment: - KUBECONFIG: "{{ cifmw_openshift_kubeconfig | default(cifmw_os_must_gather_kubeconfig) }}" - PATH: "{{ cifmw_path }}" - ansible.builtin.command: - cmd: >- - timeout {{ (cifmw_os_must_gather_timeout | community.general.to_seconds) + 120 }} - oc adm must-gather - --dest-dir {{ ansible_user_dir }}/ci-framework-data/must-gather - 
--timeout {{ cifmw_os_must_gather_timeout }} - --volume-percentage={{ cifmw_os_must_gather_volume_percentage }} - always: - - name: Create oc_inspect log directory + - name: Create a symlink to newest os-must-gather directory ansible.builtin.file: - path: "{{ cifmw_os_must_gather_output_dir }}/logs/oc_inspect" - state: directory - mode: "0755" - - - name: Inspect the cluster after must-gather failure - ignore_errors: true # noqa: ignore-errors - environment: - KUBECONFIG: "{{ cifmw_openshift_kubeconfig | default(cifmw_os_must_gather_kubeconfig) }}" - PATH: "{{ cifmw_path }}" - cifmw.general.ci_script: - output_dir: "{{ cifmw_os_must_gather_output_dir }}/artifacts" - script: | - oc adm inspect namespace/{{ item }} --dest-dir={{ cifmw_os_must_gather_output_dir }}/logs/oc_inspect - loop: >- - {{ - ( - cifmw_os_must_gather_namespaces | default([]) + - ( - cifmw_os_must_gather_additional_namespaces | split(',') | list - ) - ) | unique - }} + src: "{{ (_os_gather_latest_dir.files | sort(attribute='mtime', reverse=True) | first).path | basename }}" + dest: "{{ cifmw_os_must_gather_output_log_dir }}/latest" + state: link From 865c66222b0e23894e4af83d6a2837f64016269b Mon Sep 17 00:00:00 2001 From: Enrique Vallespi Gil Date: Fri, 17 Apr 2026 10:37:04 +0200 Subject: [PATCH 2/2] DNM Signed-off-by: Enrique Vallespi Gil --- roles/os_must_gather/tasks/main.yml | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/roles/os_must_gather/tasks/main.yml b/roles/os_must_gather/tasks/main.yml index d08fc02ae..0da44477c 100644 --- a/roles/os_must_gather/tasks/main.yml +++ b/roles/os_must_gather/tasks/main.yml @@ -68,19 +68,7 @@ cifmw.general.ci_script: output_dir: "{{ cifmw_os_must_gather_output_dir }}/artifacts" script: >- - timeout {{ shell_cmd_timeout }} - oc adm must-gather --image {{ cifmw_os_must_gather_image }} - --timeout {{ cifmw_os_must_gather_timeout }} - --host-network={{ cifmw_os_must_gather_host_network }} - --dest-dir {{ 
cifmw_os_must_gather_output_log_dir }} - --volume-percentage={{ cifmw_os_must_gather_volume_percentage }} - -- ADDITIONAL_NAMESPACES={{ cifmw_os_must_gather_additional_namespaces }} - OPENSTACK_DATABASES=$OPENSTACK_DATABASES - SOS_EDPM=$SOS_EDPM - OMC=$OMC - SOS_DECOMPRESS=$SOS_DECOMPRESS - gather - 2>&1 || { + timeout 5 sleep 30 || { rc=$? if [ $rc -eq 124 ]; then echo "The must gather command did not finish on time!" @@ -90,14 +78,19 @@ } register: _must_gather_result - rescue: + always: - name: Log openstack-must-gather failure + when: + - _must_gather_result is defined + - _must_gather_result.rc is defined + - _must_gather_result.rc != 0 ansible.builtin.debug: - msg: "OpenStack must-gather failed, running fallback generic must-gather if timeout" + msg: "OpenStack must-gather failed with rc={{ _must_gather_result.rc }}, running fallback generic must-gather if timeout" - name: Run fallback generic must-gather command without SOS report when timed out when: - _must_gather_result is defined + - _must_gather_result.rc is defined - _must_gather_result.rc == 124 environment: KUBECONFIG: "{{ cifmw_openshift_kubeconfig | default(cifmw_os_must_gather_kubeconfig) }}" @@ -110,7 +103,6 @@ --timeout {{ cifmw_os_must_gather_timeout }} --volume-percentage={{ cifmw_os_must_gather_volume_percentage }} - always: - name: Find existing os-must-gather directories ansible.builtin.find: paths: "{{ cifmw_os_must_gather_output_log_dir }}"