diff --git a/pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go b/pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go index 3911231adbab..2a22eed49cae 100644 --- a/pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go +++ b/pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go @@ -22,9 +22,10 @@ import ( "strconv" "time" - "github.com/knative/serving/pkg/apis/autoscaling" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/knative/serving/pkg/apis/autoscaling" "knative.dev/pkg/apis" duckv1beta1 "knative.dev/pkg/apis/duck/v1beta1" ) @@ -202,6 +203,12 @@ func (pas *PodAutoscalerStatus) CanMarkInactive(idlePeriod time.Duration) bool { return pas.inStatusFor(corev1.ConditionTrue, idlePeriod) } +// CanFailActivation checks whether the pod autoscaler has been activating +// for at least the specified idle period. +func (pas *PodAutoscalerStatus) CanFailActivation(idlePeriod time.Duration) bool { + return pas.inStatusFor(corev1.ConditionUnknown, idlePeriod) +} + // inStatusFor returns true if the PodAutoscalerStatus's Active condition has stayed in // the specified status for at least the specified duration. Otherwise it returns false, // including when the status is undetermined (Active condition is not found.) diff --git a/pkg/apis/autoscaling/v1alpha1/pa_lifecycle_test.go b/pkg/apis/autoscaling/v1alpha1/pa_lifecycle_test.go index bc7cfaf8e944..e02d0426203c 100644 --- a/pkg/apis/autoscaling/v1alpha1/pa_lifecycle_test.go +++ b/pkg/apis/autoscaling/v1alpha1/pa_lifecycle_test.go @@ -19,9 +19,10 @@ import ( "testing" "time" - "github.com/knative/serving/pkg/apis/autoscaling" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/knative/serving/pkg/apis/autoscaling" "knative.dev/pkg/apis" "knative.dev/pkg/apis/duck" duckv1beta1 "knative.dev/pkg/apis/duck/v1beta1" @@ -216,6 +217,85 @@ func TestCanMarkInactive(t *testing.T) { } } +func TestCanFailActivation(t *testing.T) { + cases := []struct { + name string + status PodAutoscalerStatus + result bool + grace time.Duration + }{{ + name: "empty status", + status: PodAutoscalerStatus{}, + result: false, + grace: 10 * time.Second, + }, { + name: "active condition", + status: PodAutoscalerStatus{ + Status: duckv1beta1.Status{ + Conditions: duckv1beta1.Conditions{{ + Type: PodAutoscalerConditionActive, + Status: corev1.ConditionTrue, + }}, + }, + }, + result: false, + grace: 10 * time.Second, + }, { + name: "activating condition (no LTT)", + status: PodAutoscalerStatus{ + Status: duckv1beta1.Status{ + Conditions: duckv1beta1.Conditions{{ + Type: PodAutoscalerConditionActive, + Status: corev1.ConditionUnknown, + // No LTT = beginning of time, so for sure we can. + }}, + }, + }, + result: true, + grace: 10 * time.Second, + }, { + name: "activating condition (LTT longer than grace period ago)", + status: PodAutoscalerStatus{ + Status: duckv1beta1.Status{ + Conditions: duckv1beta1.Conditions{{ + Type: PodAutoscalerConditionActive, + Status: corev1.ConditionUnknown, + LastTransitionTime: apis.VolatileTime{ + Inner: metav1.NewTime(time.Now().Add(-30 * time.Second)), + }, + // LTT = 30 seconds ago. + }}, + }, + }, + result: true, + grace: 10 * time.Second, + }, { + name: "activating condition (LTT less than grace period ago)", + status: PodAutoscalerStatus{ + Status: duckv1beta1.Status{ + Conditions: duckv1beta1.Conditions{{ + Type: PodAutoscalerConditionActive, + Status: corev1.ConditionUnknown, + LastTransitionTime: apis.VolatileTime{ + Inner: metav1.NewTime(time.Now().Add(-10 * time.Second)), + }, + // LTT = 10 seconds ago. + }}, + }, + }, + result: false, + grace: 30 * time.Second, + }} + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if e, a := tc.result, tc.status.CanFailActivation(tc.grace); e != a { + t.Errorf("%q expected: %v got: %v", tc.name, e, a) + } + }) + } +} + func TestIsActivating(t *testing.T) { cases := []struct { name string diff --git a/pkg/reconciler/autoscaling/kpa/kpa.go b/pkg/reconciler/autoscaling/kpa/kpa.go index 5dd95b66ef26..11e447b144ad 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa.go +++ b/pkg/reconciler/autoscaling/kpa/kpa.go @@ -233,7 +233,12 @@ func computeActiveCondition(pa *pav1alpha1.PodAutoscaler, want int32, got int) ( switch { case want == 0: ret = !pa.Status.IsInactive() // Any state but inactive should change SKS. - pa.Status.MarkInactive("NoTraffic", "The target is not receiving traffic.") + if pa.Status.IsActivating() { + // We only ever scale to zero while activating if we fail to activate within the progress deadline. + pa.Status.MarkInactive("TimedOut", "The target could not be activated.") + } else { + pa.Status.MarkInactive("NoTraffic", "The target is not receiving traffic.") + } case got < minReady && want > 0: ret = pa.Status.IsInactive() // If we were inactive and became activating. diff --git a/pkg/reconciler/autoscaling/kpa/kpa_test.go b/pkg/reconciler/autoscaling/kpa/kpa_test.go index 5a9e45f010db..67577738bf29 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa_test.go +++ b/pkg/reconciler/autoscaling/kpa/kpa_test.go @@ -291,10 +291,10 @@ func TestReconcileAndScaleToZero(t *testing.T) { kpa(testNamespace, testRevision, WithNoTraffic("NoTraffic", "The target is not receiving traffic."), markOld, WithPAStatusService(testRevision), - withMSvcStatus("rocking-in-the-free-world")), + withMSvcStatus("my-my-hey-hey")), sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady), metricsSvc(testNamespace, testRevision, withSvcSelector(usualSelector), - withMSvcName("rocking-in-the-free-world")), + withMSvcName("my-my-hey-hey")), deploy(testNamespace, testRevision, func(d *appsv1.Deployment) { d.Spec.Replicas = ptr.Int32(0) }), @@ -350,10 +350,10 @@ func TestReconcileAndScaleToZero(t *testing.T) { Key: key, Objects: []runtime.Object{ kpa(testNamespace, testRevision, markActive, markOld, - WithPAStatusService(testRevision), withMSvcStatus("you-ask-for-this")), + WithPAStatusService(testRevision), withMSvcStatus("they-give-you-this")), sks(testNamespace, testRevision, WithDeployRef(deployName), WithSKSReady), metricsSvc(testNamespace, testRevision, withSvcSelector(usualSelector), - withMSvcName("you-ask-for-this")), + withMSvcName("they-give-you-this")), deploy(testNamespace, testRevision), makeSKSPrivateEndpoints(1, testNamespace, testRevision), }, @@ -368,7 +368,7 @@ func TestReconcileAndScaleToZero(t *testing.T) { WantStatusUpdates: []clientgotesting.UpdateActionImpl{{ Object: kpa(testNamespace, testRevision, WithNoTraffic("NoTraffic", "The target is not receiving traffic."), - WithPAStatusService(testRevision), withMSvcStatus("you-ask-for-this")), + WithPAStatusService(testRevision), withMSvcStatus("they-give-you-this")), }}, WantUpdates: []clientgotesting.UpdateActionImpl{{ Object: sks(testNamespace, testRevision, WithSKSReady, @@ -379,13 +379,42 @@ func TestReconcileAndScaleToZero(t *testing.T) { Key: key, Objects: []runtime.Object{ kpa(testNamespace, testRevision, markActive, - WithPAStatusService(testRevision), withMSvcStatus("but-we-give-you-that")), + WithPAStatusService(testRevision), withMSvcStatus("but-you-pay-for-that")), sks(testNamespace, testRevision, WithDeployRef(deployName), WithSKSReady), metricsSvc(testNamespace, testRevision, withSvcSelector(usualSelector), - withMSvcName("but-we-give-you-that")), + withMSvcName("but-you-pay-for-that")), deploy(testNamespace, testRevision), makeSKSPrivateEndpoints(1, testNamespace, testRevision), }, + }, { + Name: "activation failure", + Key: key, + Objects: []runtime.Object{ + kpa(testNamespace, testRevision, markActivating, markOld, + WithPAStatusService(testRevision), + withMSvcStatus("once-you're-gone")), + sks(testNamespace, testRevision, WithDeployRef(deployName), WithSKSReady), + metricsSvc(testNamespace, testRevision, withSvcSelector(usualSelector), + withMSvcName("once-you're-gone")), + deploy(testNamespace, testRevision), + makeSKSPrivateEndpoints(1, testNamespace, testRevision), + }, + WantStatusUpdates: []clientgotesting.UpdateActionImpl{{ + Object: kpa(testNamespace, testRevision, + WithNoTraffic("TimedOut", "The target could not be activated."), + WithPAStatusService(testRevision), withMSvcStatus("once-you're-gone")), + }}, + WantUpdates: []clientgotesting.UpdateActionImpl{{ + Object: sks(testNamespace, testRevision, WithSKSReady, + WithDeployRef(deployName), WithProxyMode), + }}, + WantPatches: []clientgotesting.PatchActionImpl{{ + ActionImpl: clientgotesting.ActionImpl{ + Namespace: testNamespace, + }, + Name: deployName, + Patch: []byte(`[{"op":"add","path":"/spec/replicas","value":0}]`), + }}, }} defer logtesting.ClearAll() diff --git a/pkg/reconciler/autoscaling/kpa/scaler.go b/pkg/reconciler/autoscaling/kpa/scaler.go index 9663763f71b2..63055f6b094b 100644 --- a/pkg/reconciler/autoscaling/kpa/scaler.go +++ b/pkg/reconciler/autoscaling/kpa/scaler.go @@ -36,6 +36,7 @@ import ( "github.com/knative/serving/pkg/network/prober" "github.com/knative/serving/pkg/reconciler/autoscaling/config" aresources "github.com/knative/serving/pkg/reconciler/autoscaling/resources" + rresources "github.com/knative/serving/pkg/reconciler/revision/resources" "github.com/knative/serving/pkg/resources" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -51,6 +52,16 @@ const ( // This number is small, since `handleScaleToZero` below will // re-enque for the configured grace period. reenqeuePeriod = 1 * time.Second + + // TODO(#3456): Remove this buffer once KPA does pod failure diagnostics. + // + // KPA will scale the Deployment down to zero if it fails to activate after ProgressDeadlineSeconds, + // however, after ProgressDeadlineSeconds, the Deployment itself updates its status, which causes + // the Revision to re-reconcile and diagnose pod failures. If we use the same timeout here, we will + // race the Revision reconciler and scale down the pods before it can actually surface the pod errors. + // We should instead do pod failure diagnostics here immediately before scaling down the Deployment. + activationTimeoutBuffer = 10 * time.Second + activationTimeout = time.Duration(rresources.ProgressDeadlineSeconds)*time.Second + activationTimeoutBuffer ) var probeOptions = []interface{}{ @@ -146,6 +157,12 @@ func (ks *scaler) handleScaleToZero(pa *pav1alpha1.PodAutoscaler, desiredScale i } if pa.Status.IsActivating() { // Active=Unknown + // If we are stuck activating for longer than our progress deadline, presume we cannot succeed and scale to 0. + if pa.Status.CanFailActivation(activationTimeout) { + ks.logger.Infof("%s activation has timed out after %v.", pa.Name, activationTimeout) + return 0, true + } + ks.enqueueCB(pa, activationTimeout) return scaleUnknown, false } else if pa.Status.IsReady() { // Active=True // Don't scale-to-zero if the PA is active @@ -221,7 +238,7 @@ func (ks *scaler) applyScale(ctx context.Context, pa *pav1alpha1.PodAutoscaler, func (ks *scaler) Scale(ctx context.Context, pa *pav1alpha1.PodAutoscaler, desiredScale int32) (int32, error) { logger := logging.FromContext(ctx) - if desiredScale < 0 { + if desiredScale < 0 && !pa.Status.IsActivating() { logger.Debug("Metrics are not yet being collected.") return desiredScale, nil } diff --git a/pkg/reconciler/autoscaling/kpa/scaler_test.go b/pkg/reconciler/autoscaling/kpa/scaler_test.go index 57632e203e25..8945be315cfa 100644 --- a/pkg/reconciler/autoscaling/kpa/scaler_test.go +++ b/pkg/reconciler/autoscaling/kpa/scaler_test.go @@ -185,13 +185,23 @@ func TestScaler(t *testing.T) { proberfunc: func(*pav1alpha1.PodAutoscaler, http.RoundTripper) (bool, error) { return false, nil }, wantAsyncProbeCount: 1, }, { - label: "does not scale while activating", + label: "waits to scale to zero while activating until after deadline exceeded", startReplicas: 1, scaleTo: 0, wantReplicas: -1, wantScaling: false, paMutation: func(k *pav1alpha1.PodAutoscaler) { - k.Status.MarkActivating("", "") + paMarkActivating(k, time.Now().Add(-activationTimeout/2)) + }, + wantCBCount: 1, + }, { + label: "scale to zero while activating after deadline exceeded", + startReplicas: 1, + scaleTo: 0, + wantReplicas: 0, + wantScaling: true, + paMutation: func(k *pav1alpha1.PodAutoscaler) { + paMarkActivating(k, time.Now().Add(-(activationTimeout + time.Second))) }, }, { label: "scale down to minScale before grace period", @@ -241,6 +251,9 @@ func TestScaler(t *testing.T) { scaleTo: -1, // no metrics wantReplicas: -1, wantScaling: false, + paMutation: func(k *pav1alpha1.PodAutoscaler) { + paMarkInactive(k, time.Now()) + }, }, { label: "scales up from zero to desired one", startReplicas: 0, @@ -259,6 +272,9 @@ func TestScaler(t *testing.T) { scaleTo: -1, wantReplicas: -1, wantScaling: false, + paMutation: func(k *pav1alpha1.PodAutoscaler) { + paMarkActive(k, time.Now()) + }, }} for _, test := range tests { @@ -381,6 +397,7 @@ func TestDisableScaleToZero(t *testing.T) { psInformerFactory: presources.NewPodScalableInformerFactory(ctx), } pa := newKPA(t, fakeservingclient.Get(ctx), revision) + paMarkActive(pa, time.Now()) conf := defaultConfig() conf.Autoscaler.EnableScaleToZero = false @@ -495,6 +512,13 @@ func paMarkInactive(pa *pav1alpha1.PodAutoscaler, ltt time.Time) { pa.Status.Conditions[0].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(ltt)} } +func paMarkActivating(pa *pav1alpha1.PodAutoscaler, ltt time.Time) { + pa.Status.MarkActivating("", "") + + // This works because the conditions are sorted alphabetically + pa.Status.Conditions[0].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(ltt)} +} + func checkReplicas(t *testing.T, dynamicClient *fakedynamic.FakeDynamicClient, deployment *v1.Deployment, expectedScale int32) { t.Helper() diff --git a/pkg/reconciler/revision/resources/constants.go b/pkg/reconciler/revision/resources/constants.go index d471c0b4c13e..5fdf2bf9ab4d 100644 --- a/pkg/reconciler/revision/resources/constants.go +++ b/pkg/reconciler/revision/resources/constants.go @@ -31,13 +31,13 @@ const ( // AppLabelKey is the label defining the application's name. AppLabelKey = "app" -) -var ( // ProgressDeadlineSeconds is the time in seconds we wait for the deployment to // be ready before considering it failed. ProgressDeadlineSeconds = int32(120) +) +var ( // See https://github.com/knative/serving/pull/1124#issuecomment-397120430 // for how CPU and memory values were calculated. queueContainerCPU = resource.MustParse("25m") diff --git a/pkg/reconciler/revision/resources/deploy.go b/pkg/reconciler/revision/resources/deploy.go index 586981ad3b78..b1fe15b27dfa 100644 --- a/pkg/reconciler/revision/resources/deploy.go +++ b/pkg/reconciler/revision/resources/deploy.go @@ -228,7 +228,7 @@ func MakeDeployment(rev *v1alpha1.Revision, Spec: appsv1.DeploymentSpec{ Replicas: ptr.Int32(1), Selector: makeSelector(rev), - ProgressDeadlineSeconds: &ProgressDeadlineSeconds, + ProgressDeadlineSeconds: ptr.Int32(ProgressDeadlineSeconds), Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: makeLabels(rev), diff --git a/pkg/reconciler/revision/resources/deploy_test.go b/pkg/reconciler/revision/resources/deploy_test.go index 95c2b7e870e1..be9aac8cd141 100644 --- a/pkg/reconciler/revision/resources/deploy_test.go +++ b/pkg/reconciler/revision/resources/deploy_test.go @@ -172,7 +172,7 @@ var ( serving.RevisionUID: "1234", }, }, - ProgressDeadlineSeconds: &ProgressDeadlineSeconds, + ProgressDeadlineSeconds: ptr.Int32(ProgressDeadlineSeconds), Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{