From 2e7e87d5d260d6bc539f4e0b0b615931dd660b7a Mon Sep 17 00:00:00 2001 From: Marc Herbert Date: Wed, 9 Sep 2020 17:15:11 -0700 Subject: [PATCH 1/2] sof-kernel-log-check: add platform variable So we can start ignoring known issues on a per-platform basis. Signed-off-by: Marc Herbert --- tools/sof-kernel-log-check.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/sof-kernel-log-check.sh b/tools/sof-kernel-log-check.sh index a9be70ee..282c10ea 100755 --- a/tools/sof-kernel-log-check.sh +++ b/tools/sof-kernel-log-check.sh @@ -2,6 +2,9 @@ begin_line=${1:-1} declare err_str ignore_str project_key + +platform=$(sof-dump-status.py -p) + err_str="error|failed|timed out|panic|oops" # TODO explain From 952b8cd91813a2854844d2fdede717bf8a6f3003 Mon Sep 17 00:00:00 2001 From: Marc Herbert Date: Wed, 9 Sep 2020 17:15:58 -0700 Subject: [PATCH 2/2] sof-kernel-log-check: narrower ignore_str for ICL/CML FW boot retries Fixes ac415de03cb2 ("tools: ignore a false error message with 'panic'") which was ignoring way too many errors. See previous discussion in corresponding PR and long source code comment. Generally speaking, ignoring errors ("green failures") is extremely dangerous and should be as narrow as possible. More specifically here, not even knowing which platforms experience the issue and what code they print is really not going to help fix it. Signed-off-by: Marc Herbert --- tools/sof-kernel-log-check.sh | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/tools/sof-kernel-log-check.sh b/tools/sof-kernel-log-check.sh index 282c10ea..52d35d53 100755 --- a/tools/sof-kernel-log-check.sh +++ b/tools/sof-kernel-log-check.sh @@ -53,12 +53,29 @@ ignore_str="$ignore_str"'|thermal thermal_zone.*: failed to read out thermal zon ignore_str="$ignore_str"'|iwlwifi 0000:00:14\.3: Microcode SW error detected\. Restarting 0x0\.' 
ignore_str="$ignore_str"'|wlo1: authentication with f4:f5:e8:6b:45:bb timed out' -# Test cases on some platforms fail because the false error message: +# Test cases on some platforms fail because the boot retry message: +# # sof-audio-pci 0000:00:1f.3: status = 0x00000000 panic = 0x00000000 -# Note that different platform may have different PCI ID, and the panic code -# may not be 0x00000000. +# ... +# Attempting iteration 1 of Core En/ROM load... +# +# Despite the real boot failure the retry message is not at the error +# level until after the last try. However we still use kern.log for now +# and it has no log levels, so this may unfortunately hide this same +# message at the 'error' level until we switch to journalctl +# --priority. Hopefully other issues will cause the test to fail in that +# case. +# +# For now the codes seem to be 0x00000000 and affected platforms have +# PCI ID 1f.3. Before adding other values make sure you update the list +# of affected systems in bug 3395 below. +# # Buglink: https://github.com/thesofproject/sof/issues/3395 -ignore_str="$ignore_str"'|sof-audio-pci 0000:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f]: status = 0x[0-9a-f]{8} panic = 0x[0-9a-f]{8}' +case "$platform" in + icl|cml) + ignore_str="$ignore_str"'|sof-audio-pci 0000:00:1f\.3: status = 0x[0]{8} panic = 0x[0]{8}' + ;; +esac [[ ! "$err_str" ]] && echo "Missing error keyword list" && exit 0 # dmesg KB size buffer size