From 02947e7d2b5776d515b3a58335f66b362752c31b Mon Sep 17 00:00:00 2001 From: David Eads Date: Wed, 24 Jan 2024 12:53:18 -0500 Subject: [PATCH] enforce termination message policy on all platform pods --- pkg/defaultmonitortests/types.go | 24 ++- .../terminationmessagepolicy/monitortest.go | 149 ++++++++++++++++++ 2 files changed, 160 insertions(+), 13 deletions(-) create mode 100644 pkg/monitortests/clusterversionoperator/terminationmessagepolicy/monitortest.go diff --git a/pkg/defaultmonitortests/types.go b/pkg/defaultmonitortests/types.go index 87b9678c161f..ecbab70b1261 100644 --- a/pkg/defaultmonitortests/types.go +++ b/pkg/defaultmonitortests/types.go @@ -3,30 +3,20 @@ package defaultmonitortests import ( "fmt" - "github.com/openshift/origin/pkg/monitortests/testframework/disruptionexternalawscloudservicemonitoring" - "github.com/openshift/origin/pkg/monitortests/testframework/disruptionexternalazurecloudservicemonitoring" - "github.com/openshift/origin/pkg/monitortests/testframework/disruptionexternalgcpcloudservicemonitoring" - "github.com/openshift/origin/pkg/monitortests/testframework/watchrequestcountscollector" - "github.com/sirupsen/logrus" - - "github.com/openshift/origin/pkg/monitortests/kubeapiserver/disruptionnewapiserver" - - "github.com/openshift/origin/pkg/monitortests/testframework/alertanalyzer" - "github.com/openshift/origin/pkg/monitortests/testframework/pathologicaleventanalyzer" - - "github.com/openshift/origin/pkg/monitortests/kubeapiserver/apiservergracefulrestart" - "github.com/openshift/origin/pkg/monitortestframework" "github.com/openshift/origin/pkg/monitortests/authentication/legacyauthenticationmonitortests" "github.com/openshift/origin/pkg/monitortests/authentication/requiredsccmonitortests" azuremetrics "github.com/openshift/origin/pkg/monitortests/cloud/azure/metrics" "github.com/openshift/origin/pkg/monitortests/clusterversionoperator/legacycvomonitortests" 
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer" + "github.com/openshift/origin/pkg/monitortests/clusterversionoperator/terminationmessagepolicy" "github.com/openshift/origin/pkg/monitortests/etcd/etcdloganalyzer" "github.com/openshift/origin/pkg/monitortests/etcd/legacyetcdmonitortests" "github.com/openshift/origin/pkg/monitortests/imageregistry/disruptionimageregistry" + "github.com/openshift/origin/pkg/monitortests/kubeapiserver/apiservergracefulrestart" "github.com/openshift/origin/pkg/monitortests/kubeapiserver/auditloganalyzer" "github.com/openshift/origin/pkg/monitortests/kubeapiserver/disruptionlegacyapiservers" + "github.com/openshift/origin/pkg/monitortests/kubeapiserver/disruptionnewapiserver" "github.com/openshift/origin/pkg/monitortests/kubeapiserver/legacykubeapiservermonitortests" "github.com/openshift/origin/pkg/monitortests/monitoring/statefulsetsrecreation" "github.com/openshift/origin/pkg/monitortests/network/disruptioningress" @@ -40,17 +30,24 @@ import ( "github.com/openshift/origin/pkg/monitortests/node/watchpods" "github.com/openshift/origin/pkg/monitortests/storage/legacystoragemonitortests" "github.com/openshift/origin/pkg/monitortests/testframework/additionaleventscollector" + "github.com/openshift/origin/pkg/monitortests/testframework/alertanalyzer" "github.com/openshift/origin/pkg/monitortests/testframework/clusterinfoserializer" + "github.com/openshift/origin/pkg/monitortests/testframework/disruptionexternalawscloudservicemonitoring" + "github.com/openshift/origin/pkg/monitortests/testframework/disruptionexternalazurecloudservicemonitoring" + "github.com/openshift/origin/pkg/monitortests/testframework/disruptionexternalgcpcloudservicemonitoring" "github.com/openshift/origin/pkg/monitortests/testframework/disruptionexternalservicemonitoring" "github.com/openshift/origin/pkg/monitortests/testframework/disruptionserializer" 
"github.com/openshift/origin/pkg/monitortests/testframework/e2etestanalyzer" "github.com/openshift/origin/pkg/monitortests/testframework/intervalserializer" "github.com/openshift/origin/pkg/monitortests/testframework/knownimagechecker" "github.com/openshift/origin/pkg/monitortests/testframework/legacytestframeworkmonitortests" + "github.com/openshift/origin/pkg/monitortests/testframework/pathologicaleventanalyzer" "github.com/openshift/origin/pkg/monitortests/testframework/timelineserializer" "github.com/openshift/origin/pkg/monitortests/testframework/trackedresourcesserializer" "github.com/openshift/origin/pkg/monitortests/testframework/watchclusteroperators" "github.com/openshift/origin/pkg/monitortests/testframework/watchevents" + "github.com/openshift/origin/pkg/monitortests/testframework/watchrequestcountscollector" + "github.com/sirupsen/logrus" ) // ListAllMonitorTests is a helper that returns a simple list of @@ -146,6 +143,7 @@ func newUniversalMonitorTests(info monitortestframework.MonitorTestInitializatio monitorTestRegistry.AddMonitorTestOrDie("legacy-authentication-invariants", "apiserver-auth", legacyauthenticationmonitortests.NewLegacyTests()) monitorTestRegistry.AddMonitorTestOrDie("legacy-cvo-invariants", "Cluster Version Operator", legacycvomonitortests.NewLegacyTests()) + monitorTestRegistry.AddMonitorTestOrDie("termination-message-policy", "Cluster Version Operator", terminationmessagepolicy.NewAnalyzer()) monitorTestRegistry.AddMonitorTestOrDie("operator-state-analyzer", "Cluster Version Operator", operatorstateanalyzer.NewAnalyzer()) monitorTestRegistry.AddMonitorTestOrDie("required-scc-annotation-checker", "Cluster Version Operator", requiredsccmonitortests.NewAnalyzer()) diff --git a/pkg/monitortests/clusterversionoperator/terminationmessagepolicy/monitortest.go b/pkg/monitortests/clusterversionoperator/terminationmessagepolicy/monitortest.go new file mode 100644 index 000000000000..04369897c446 --- /dev/null +++ 
b/pkg/monitortests/clusterversionoperator/terminationmessagepolicy/monitortest.go
@@ -0,0 +1,149 @@
+package terminationmessagepolicy
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/openshift/origin/pkg/monitor/monitorapi"
+	"github.com/openshift/origin/pkg/monitortestframework"
+	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+)
+
+// terminationMessagePolicyChecker is a monitor test that reports containers in
+// openshift-prefixed namespaces whose terminationMessagePolicy is not
+// FallbackToLogsOnError.
+type terminationMessagePolicyChecker struct {
+	// kubeClient is set by a successful StartCollection; CollectData returns nothing while it is nil.
+	kubeClient kubernetes.Interface
+}
+
+// NewAnalyzer returns the monitor test that checks terminationMessagePolicy on pods.
+func NewAnalyzer() monitortestframework.MonitorTest {
+	return &terminationMessagePolicyChecker{}
+}
+
+// StartCollection creates the kube client that CollectData uses to list pods.
+func (w *terminationMessagePolicyChecker) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
+	kubeClient, err := kubernetes.NewForConfig(adminRESTConfig)
+	if err != nil {
+		// Leave w.kubeClient untouched: storing the nil *Clientset would make the interface
+		// field non-nil and defeat the nil check in CollectData.
+		return fmt.Errorf("creating kube client: %w", err)
+	}
+	w.kubeClient = kubeClient
+	return nil
+}
+
+// CollectData lists all pods and emits one junit per openshift-prefixed namespace.
+// A namespace with new violations gets a failure plus a success under the same
+// test name. No intervals are returned.
+func (w *terminationMessagePolicyChecker) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
+	if w.kubeClient == nil {
+		return nil, nil, nil
+	}
+	allPods, err := w.kubeClient.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return nil, nil, fmt.Errorf("listing pods: %w", err)
+	}
+
+	failuresByNamespace := map[string][]string{}
+	for i := range allPods.Items {
+		pod := &allPods.Items[i]
+		// only namespaces whose name starts with "openshift" are checked
+		if !strings.HasPrefix(pod.Namespace, "openshift") {
+			continue
+		}
+		// Assigning even when nothing is appended registers the namespace, so it still gets a passing junit.
+		failuresByNamespace[pod.Namespace] = append(failuresByNamespace[pod.Namespace], podViolations(pod)...)
+	}
+
+	// existingViolations is the list of violations already present, don't add to it once we start enforcing
+	existingViolations := map[string]sets.String{
+		"namespace": sets.NewString("pods/ []"),
+	}
+
+	junits := []*junitapi.JUnitTestCase{}
+	for _, namespace := range sets.StringKeySet(failuresByNamespace).List() {
+		testName := fmt.Sprintf("[sig-arch] all containers in ns/%v must have terminationMessagePolicy=%v", namespace, corev1.TerminationMessageFallbackToLogsOnError)
+		failingContainers := sets.NewString(failuresByNamespace[namespace]...)
+		if existingViolationForNamespace, ok := existingViolations[namespace]; ok {
+			// Only violations that are not already listed count against the namespace.
+			failingContainers = failingContainers.Difference(existingViolationForNamespace)
+		}
+		// nothing new to report: the namespace passes
+		if failingContainers.Len() == 0 {
+			junits = append(junits, &junitapi.JUnitTestCase{Name: testName})
+			continue
+		}
+
+		failureMessages := make([]string, 0, failingContainers.Len())
+		for _, container := range failingContainers.List() {
+			failureMessages = append(failureMessages,
+				fmt.Sprintf("%v must have terminationMessagePolicy=%q",
+					container, corev1.TerminationMessageFallbackToLogsOnError))
+		}
+		output := strings.Join(failureMessages, "\n")
+
+		junits = append(junits,
+			&junitapi.JUnitTestCase{
+				Name:      testName,
+				SystemOut: output,
+				FailureOutput: &junitapi.FailureOutput{
+					Output: output,
+				},
+			},
+			// start as flake (failure plus success) while the list of existing violations is built
+			&junitapi.JUnitTestCase{Name: testName},
+		)
+	}
+
+	return nil, junits, nil
+}
+
+// podViolations returns one description per init, regular, and ephemeral container of pod
+// (in that order) whose terminationMessagePolicy is not FallbackToLogsOnError.
+func podViolations(pod *corev1.Pod) []string {
+	var violations []string
+	check := func(field, containerName string, policy corev1.TerminationMessagePolicy) {
+		if policy != corev1.TerminationMessageFallbackToLogsOnError {
+			violations = append(violations, fmt.Sprintf("pods/%s %s[%v]", pod.Name, field, containerName))
+		}
+	}
+	for _, container := range pod.Spec.InitContainers {
+		check("initContainers", container.Name, container.TerminationMessagePolicy)
+	}
+	for _, container := range pod.Spec.Containers {
+		check("containers", container.Name, container.TerminationMessagePolicy)
+	}
+	for _, container := range pod.Spec.EphemeralContainers {
+		check("ephemeralContainers", container.Name, container.TerminationMessagePolicy)
+	}
+	return violations
+}
+
+// ConstructComputedIntervals is a no-op: it returns no intervals and no error.
+func (*terminationMessagePolicyChecker) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
+	return nil, nil
+}
+
+// EvaluateTestsFromConstructedIntervals is a no-op: this test's junits come from CollectData.
+func (*terminationMessagePolicyChecker) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
+	return nil, nil
+}
+
+// WriteContentToStorage is a no-op: it writes nothing and returns nil.
+func (*terminationMessagePolicyChecker) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
+	return nil
+}
+
+// Cleanup is a no-op: it returns nil.
+func (*terminationMessagePolicyChecker) Cleanup(ctx context.Context) error {
+	return nil
+}