From 0a3b02b2209720d6bb2b25b07a3475e0c0289f1d Mon Sep 17 00:00:00 2001 From: Shash Reddy Date: Thu, 30 May 2019 11:40:28 -0700 Subject: [PATCH] Bubble up pod schedule errors to revision status --- .../serving/v1alpha1/revision_lifecycle.go | 6 + .../v1alpha1/revision_lifecycle_test.go | 19 +++ .../revision/reconcile_resources.go | 9 ++ pkg/reconciler/revision/table_test.go | 18 +++ pkg/reconciler/testing/functional.go | 20 +++ test/e2e/pod_schedule_error_test.go | 117 ++++++++++++++++++ 6 files changed, 189 insertions(+) create mode 100644 test/e2e/pod_schedule_error_test.go diff --git a/pkg/apis/serving/v1alpha1/revision_lifecycle.go b/pkg/apis/serving/v1alpha1/revision_lifecycle.go index ccb36d1b8c0c..36a176a824e6 100644 --- a/pkg/apis/serving/v1alpha1/revision_lifecycle.go +++ b/pkg/apis/serving/v1alpha1/revision_lifecycle.go @@ -181,6 +181,12 @@ func (rs *RevisionStatus) MarkResourcesAvailable() { revCondSet.Manage(rs).MarkTrue(RevisionConditionResourcesAvailable) } +// MarkResourcesUnavailable changes "ResourcesAvailable" condition to false to reflect that the +// resources of the given kind and name cannot be created. +func (rs *RevisionStatus) MarkResourcesUnavailable(reason, message string) { + revCondSet.Manage(rs).MarkFalse(RevisionConditionResourcesAvailable, reason, message) +} + func (rs *RevisionStatus) MarkActive() { revCondSet.Manage(rs).MarkTrue(RevisionConditionActive) } diff --git a/pkg/apis/serving/v1alpha1/revision_lifecycle_test.go b/pkg/apis/serving/v1alpha1/revision_lifecycle_test.go index 121d379f8cfa..3288ba73a252 100644 --- a/pkg/apis/serving/v1alpha1/revision_lifecycle_test.go +++ b/pkg/apis/serving/v1alpha1/revision_lifecycle_test.go @@ -374,6 +374,25 @@ func TestRevisionNotOwnedStuff(t *testing.T) { } } +func TestRevisionResourcesUnavailable(t *testing.T) { + r := &RevisionStatus{} + r.InitializeConditions() + apitest.CheckConditionOngoing(r.duck(), RevisionConditionResourcesAvailable, t) + apitest.CheckConditionOngoing(r.duck(), RevisionConditionContainerHealthy, t) + apitest.CheckConditionOngoing(r.duck(), RevisionConditionReady, t) + + const wantReason, wantMessage = "unschedulable", "insufficient energy" + r.MarkResourcesUnavailable(wantReason, wantMessage) + apitest.CheckConditionFailed(r.duck(), RevisionConditionResourcesAvailable, t) + apitest.CheckConditionFailed(r.duck(), RevisionConditionReady, t) + if got := r.GetCondition(RevisionConditionResourcesAvailable); got == nil || got.Reason != wantReason { + t.Errorf("RevisionConditionResourcesAvailable = %v, want %v", got, wantReason) + } + if got := r.GetCondition(RevisionConditionResourcesAvailable); got == nil || got.Message != wantMessage { + t.Errorf("RevisionConditionResourcesAvailable = %v, want %v", got, wantMessage) + } +} + func TestRevisionGetGroupVersionKind(t *testing.T) { r := &Revision{} want := schema.GroupVersionKind{ diff --git a/pkg/reconciler/revision/reconcile_resources.go b/pkg/reconciler/revision/reconcile_resources.go index e09129d6bcbe..1fb4a2d4ee82 100644 --- a/pkg/reconciler/revision/reconcile_resources.go +++ b/pkg/reconciler/revision/reconcile_resources.go @@ -82,6 +82,15 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1alpha1.Revi // Arbitrarily grab the very first pod, as they all should be crashing pod := pods.Items[0] + // Update the revision status if pod cannot be scheduled(possibly resource constraints) + // If pod cannot be scheduled then we expect the container status to be empty. + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse { + rev.Status.MarkResourcesUnavailable(cond.Reason, cond.Message) + break + } + } + for _, status := range pod.Status.ContainerStatuses { if status.Name == resources.UserContainerName { if t := status.LastTerminationState.Terminated; t != nil { diff --git a/pkg/reconciler/revision/table_test.go b/pkg/reconciler/revision/table_test.go index 6b8c927ab326..8de5c6d7d4b8 100644 --- a/pkg/reconciler/revision/table_test.go +++ b/pkg/reconciler/revision/table_test.go @@ -456,6 +456,24 @@ func TestReconcile(t *testing.T) { WithLogURL, AllUnknownConditions, MarkContainerExiting(5, "I failed man!")), }}, Key: "foo/pod-error", + }, { + Name: "surface pod schedule errors", + // Test the propagation of the scheduling errors of Pod into the revision. + // This initializes the world to unschedule pod. It then verifies + // that Reconcile propagates this into the status of the Revision. + Objects: []runtime.Object{ + rev("foo", "pod-schedule-error", + withK8sServiceName("a-pod-schedule-error"), WithLogURL, AllUnknownConditions, MarkActive), + kpa("foo", "pod-schedule-error"), // PA can't be ready, since no traffic. + pod("foo", "pod-schedule-error", WithUnschedulableContainer("Insufficient energy", "Unschedulable")), + deploy("foo", "pod-schedule-error"), + image("foo", "pod-schedule-error"), + }, + WantStatusUpdates: []clientgotesting.UpdateActionImpl{{ + Object: rev("foo", "pod-schedule-error", + WithLogURL, AllUnknownConditions, MarkResourcesUnavailable("Insufficient energy", "Unschedulable")), + }}, + Key: "foo/pod-schedule-error", }, { Name: "ready steady state", // Test the transition that Reconcile makes when Endpoints become ready on the diff --git a/pkg/reconciler/testing/functional.go b/pkg/reconciler/testing/functional.go index 1e057877e3b4..58e901ca7c0f 100644 --- a/pkg/reconciler/testing/functional.go +++ b/pkg/reconciler/testing/functional.go @@ -903,6 +903,13 @@ func MarkContainerExiting(exitCode int32, message string) RevisionOption { } } +// MarkResourcesUnavailable calls .Status.MarkResourcesUnavailable on the Revision. +func MarkResourcesUnavailable(reason, message string) RevisionOption { + return func(r *v1alpha1.Revision) { + r.Status.MarkResourcesUnavailable(reason, message) + } +} + // MarkRevisionReady calls the necessary helpers to make the Revision Ready=True. func MarkRevisionReady(r *v1alpha1.Revision) { WithInitRevConditions(r) @@ -1109,6 +1116,19 @@ func WithFailingContainer(name string, exitCode int, message string) PodOption { } } +// WithUnschedulableContainer sets the .Status.Conditionss on the pod to +// include `PodScheduled` status to `False` with the given message and reason. +func WithUnschedulableContainer(reason, message string) PodOption { + return func(pod *corev1.Pod) { + pod.Status.Conditions = []corev1.PodCondition{{ + Type: corev1.PodScheduled, + Reason: reason, + Message: message, + Status: corev1.ConditionFalse, + }} + } +} + // ClusterIngressOption enables further configuration of the Cluster Ingress. type ClusterIngressOption func(*netv1alpha1.ClusterIngress) diff --git a/test/e2e/pod_schedule_error_test.go b/test/e2e/pod_schedule_error_test.go new file mode 100644 index 000000000000..193f4cef4cb0 --- /dev/null +++ b/test/e2e/pod_schedule_error_test.go @@ -0,0 +1,117 @@ +// +build e2e + +/* +Copyright 2019 The Knative Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package e2e + +import ( + "fmt" + "strings" + "testing" + + "github.com/knative/serving/pkg/apis/serving/v1alpha1" + serviceresourcenames "github.com/knative/serving/pkg/reconciler/service/resources/names" + "github.com/knative/serving/test" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestPodScheduleError(t *testing.T) { + clients := Setup(t) + const ( + errorReason = "RevisionFailed" + errorMsg = "nodes are available" + revisionReason = "Unschedulable" + ) + names := test.ResourceNames{ + Service: test.ObjectNameForTest(t), + Image: "helloworld", + } + + defer test.TearDown(clients, names) + test.CleanupOnInterrupt(func() { test.TearDown(clients, names) }) + + t.Logf("Creating a new Service %s", names.Image) + resources := corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50000"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceMemory: resource.MustParse("350Mi"), + }, + } + var ( + svc *v1alpha1.Service + err error + ) + if svc, err = test.CreateLatestService(t, clients, names, &test.Options{ContainerResources: resources}); err != nil { + t.Fatalf("Failed to create Service %s: %v", names.Service, err) + } + + names.Config = serviceresourcenames.Configuration(svc) + + err = test.WaitForServiceState(clients.ServingClient, names.Service, func(r *v1alpha1.Service) (bool, error) { + cond := r.Status.GetCondition(v1alpha1.ConfigurationConditionReady) + if cond != nil && !cond.IsUnknown() { + if strings.Contains(cond.Message, errorMsg) && cond.IsFalse() { + return true, nil + } + t.Logf("Reason: %s ; Message: %s ; Status: %s", cond.Reason, cond.Message, cond.Status) + return true, fmt.Errorf("the service %s was not marked with expected error condition (Reason=\"%s\", Message=\"%s\", Status=\"%s\"), but with (Reason=\"%s\", Message=\"%s\", Status=\"%s\")", + names.Config, errorReason, errorMsg, "False", cond.Reason, cond.Message, cond.Status) + } + return false, nil + }, "ContainerUnscheduleable") + + if err != nil { + t.Fatalf("Failed to validate service state: %s", err) + } + + revisionName, err := revisionFromConfiguration(clients, names.Config) + if err != nil { + t.Fatalf("Failed to get revision from configuration %s: %v", names.Config, err) + } + + t.Log("When the containers are not scheduled, the revision should have error status.") + err = test.WaitForRevisionState(clients.ServingClient, revisionName, func(r *v1alpha1.Revision) (bool, error) { + cond := r.Status.GetCondition(v1alpha1.RevisionConditionReady) + if cond != nil { + if cond.Reason == revisionReason && strings.Contains(cond.Message, errorMsg) { + return true, nil + } + return true, fmt.Errorf("the revision %s was not marked with expected error condition (Reason=%q, Message=%q), but with (Reason=%q, Message=%q)", + revisionName, revisionReason, errorMsg, cond.Reason, cond.Message) + } + return false, nil + }, errorReason) + + if err != nil { + t.Fatalf("Failed to validate revision state: %s", err) + } +} + +// Get revision name from configuration. +func revisionFromConfiguration(clients *test.Clients, configName string) (string, error) { + config, err := clients.ServingClient.Configs.Get(configName, metav1.GetOptions{}) + if err != nil { + return "", err + } + if config.Status.LatestCreatedRevisionName != "" { + return config.Status.LatestCreatedRevisionName, nil + } + return "", fmt.Errorf("no valid revision name found in configuration %s", configName) +}