Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ import (
"strconv"
"time"

"github.com/knative/serving/pkg/apis/autoscaling"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime/schema"

"github.com/knative/serving/pkg/apis/autoscaling"
"knative.dev/pkg/apis"
duckv1beta1 "knative.dev/pkg/apis/duck/v1beta1"
)
Expand Down Expand Up @@ -202,6 +203,12 @@ func (pas *PodAutoscalerStatus) CanMarkInactive(idlePeriod time.Duration) bool {
return pas.inStatusFor(corev1.ConditionTrue, idlePeriod)
}

// CanFailActivation checks whether the pod autoscaler has been activating
// for at least the specified idle period.
func (pas *PodAutoscalerStatus) CanFailActivation(idlePeriod time.Duration) bool {
return pas.inStatusFor(corev1.ConditionUnknown, idlePeriod)
}

// inStatusFor returns true if the PodAutoscalerStatus's Active condition has stayed in
// the specified status for at least the specified duration. Otherwise it returns false,
// including when the status is undetermined (Active condition is not found.)
Expand Down
82 changes: 81 additions & 1 deletion pkg/apis/autoscaling/v1alpha1/pa_lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ import (
"testing"
"time"

"github.com/knative/serving/pkg/apis/autoscaling"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/knative/serving/pkg/apis/autoscaling"
"knative.dev/pkg/apis"
"knative.dev/pkg/apis/duck"
duckv1beta1 "knative.dev/pkg/apis/duck/v1beta1"
Expand Down Expand Up @@ -216,6 +217,85 @@ func TestCanMarkInactive(t *testing.T) {
}
}

func TestCanFailActivation(t *testing.T) {
cases := []struct {
name string
status PodAutoscalerStatus
result bool
grace time.Duration
}{{
name: "empty status",
status: PodAutoscalerStatus{},
result: false,
grace: 10 * time.Second,
}, {
name: "active condition",
status: PodAutoscalerStatus{
Status: duckv1beta1.Status{
Conditions: duckv1beta1.Conditions{{
Type: PodAutoscalerConditionActive,
Status: corev1.ConditionTrue,
}},
},
},
result: false,
grace: 10 * time.Second,
}, {
name: "activating condition (no LTT)",
status: PodAutoscalerStatus{
Status: duckv1beta1.Status{
Conditions: duckv1beta1.Conditions{{
Type: PodAutoscalerConditionActive,
Status: corev1.ConditionUnknown,
// No LTT = beginning of time, so for sure we can.
}},
},
},
result: true,
grace: 10 * time.Second,
}, {
name: "activating condition (LTT longer than grace period ago)",
status: PodAutoscalerStatus{
Status: duckv1beta1.Status{
Conditions: duckv1beta1.Conditions{{
Type: PodAutoscalerConditionActive,
Status: corev1.ConditionUnknown,
LastTransitionTime: apis.VolatileTime{
Inner: metav1.NewTime(time.Now().Add(-30 * time.Second)),
},
// LTT = 30 seconds ago.
}},
},
},
result: true,
grace: 10 * time.Second,
}, {
name: "activating condition (LTT less than grace period ago)",
status: PodAutoscalerStatus{
Status: duckv1beta1.Status{
Conditions: duckv1beta1.Conditions{{
Type: PodAutoscalerConditionActive,
Status: corev1.ConditionUnknown,
LastTransitionTime: apis.VolatileTime{
Inner: metav1.NewTime(time.Now().Add(-10 * time.Second)),
},
// LTT = 10 seconds ago.
}},
},
},
result: false,
grace: 30 * time.Second,
}}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if e, a := tc.result, tc.status.CanFailActivation(tc.grace); e != a {
t.Errorf("%q expected: %v got: %v", tc.name, e, a)
}
})
}
}

func TestIsActivating(t *testing.T) {
cases := []struct {
name string
Expand Down
7 changes: 6 additions & 1 deletion pkg/reconciler/autoscaling/kpa/kpa.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,12 @@ func computeActiveCondition(pa *pav1alpha1.PodAutoscaler, want int32, got int) (
switch {
case want == 0:
ret = !pa.Status.IsInactive() // Any state but inactive should change SKS.
pa.Status.MarkInactive("NoTraffic", "The target is not receiving traffic.")
if pa.Status.IsActivating() {
// We only ever scale to zero while activating if we fail to activate within the progress deadline.
pa.Status.MarkInactive("TimedOut", "The target could not be activated.")
Comment thread
jonjohnsonjr marked this conversation as resolved.
} else {
pa.Status.MarkInactive("NoTraffic", "The target is not receiving traffic.")
}

case got < minReady && want > 0:
ret = pa.Status.IsInactive() // If we were inactive and became activating.
Expand Down
43 changes: 36 additions & 7 deletions pkg/reconciler/autoscaling/kpa/kpa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,10 +291,10 @@ func TestReconcileAndScaleToZero(t *testing.T) {
kpa(testNamespace, testRevision,
WithNoTraffic("NoTraffic", "The target is not receiving traffic."),
markOld, WithPAStatusService(testRevision),
withMSvcStatus("rocking-in-the-free-world")),
withMSvcStatus("my-my-hey-hey")),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady),
metricsSvc(testNamespace, testRevision, withSvcSelector(usualSelector),
withMSvcName("rocking-in-the-free-world")),
withMSvcName("my-my-hey-hey")),
deploy(testNamespace, testRevision, func(d *appsv1.Deployment) {
d.Spec.Replicas = ptr.Int32(0)
}),
Expand Down Expand Up @@ -350,10 +350,10 @@ func TestReconcileAndScaleToZero(t *testing.T) {
Key: key,
Objects: []runtime.Object{
kpa(testNamespace, testRevision, markActive, markOld,
WithPAStatusService(testRevision), withMSvcStatus("you-ask-for-this")),
WithPAStatusService(testRevision), withMSvcStatus("they-give-you-this")),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithSKSReady),
metricsSvc(testNamespace, testRevision, withSvcSelector(usualSelector),
withMSvcName("you-ask-for-this")),
withMSvcName("they-give-you-this")),
deploy(testNamespace, testRevision),
makeSKSPrivateEndpoints(1, testNamespace, testRevision),
},
Expand All @@ -368,7 +368,7 @@ func TestReconcileAndScaleToZero(t *testing.T) {
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision,
WithNoTraffic("NoTraffic", "The target is not receiving traffic."),
WithPAStatusService(testRevision), withMSvcStatus("you-ask-for-this")),
WithPAStatusService(testRevision), withMSvcStatus("they-give-you-this")),
}},
WantUpdates: []clientgotesting.UpdateActionImpl{{
Object: sks(testNamespace, testRevision, WithSKSReady,
Expand All @@ -379,13 +379,42 @@ func TestReconcileAndScaleToZero(t *testing.T) {
Key: key,
Objects: []runtime.Object{
kpa(testNamespace, testRevision, markActive,
WithPAStatusService(testRevision), withMSvcStatus("but-we-give-you-that")),
WithPAStatusService(testRevision), withMSvcStatus("but-you-pay-for-that")),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithSKSReady),
metricsSvc(testNamespace, testRevision, withSvcSelector(usualSelector),
withMSvcName("but-we-give-you-that")),
withMSvcName("but-you-pay-for-that")),
deploy(testNamespace, testRevision),
makeSKSPrivateEndpoints(1, testNamespace, testRevision),
},
}, {
Name: "activation failure",
Key: key,
Objects: []runtime.Object{
kpa(testNamespace, testRevision, markActivating, markOld,
WithPAStatusService(testRevision),
withMSvcStatus("once-you're-gone")),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithSKSReady),
metricsSvc(testNamespace, testRevision, withSvcSelector(usualSelector),
withMSvcName("once-you're-gone")),
deploy(testNamespace, testRevision),
makeSKSPrivateEndpoints(1, testNamespace, testRevision),
},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision,
WithNoTraffic("TimedOut", "The target could not be activated."),
WithPAStatusService(testRevision), withMSvcStatus("once-you're-gone")),
}},
WantUpdates: []clientgotesting.UpdateActionImpl{{
Object: sks(testNamespace, testRevision, WithSKSReady,
WithDeployRef(deployName), WithProxyMode),
}},
WantPatches: []clientgotesting.PatchActionImpl{{
ActionImpl: clientgotesting.ActionImpl{
Namespace: testNamespace,
},
Name: deployName,
Patch: []byte(`[{"op":"add","path":"/spec/replicas","value":0}]`),
}},
}}

defer logtesting.ClearAll()
Expand Down
19 changes: 18 additions & 1 deletion pkg/reconciler/autoscaling/kpa/scaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"github.com/knative/serving/pkg/network/prober"
"github.com/knative/serving/pkg/reconciler/autoscaling/config"
aresources "github.com/knative/serving/pkg/reconciler/autoscaling/resources"
rresources "github.com/knative/serving/pkg/reconciler/revision/resources"
"github.com/knative/serving/pkg/resources"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -51,6 +52,16 @@ const (
// This number is small, since `handleScaleToZero` below will
// re-enque for the configured grace period.
reenqeuePeriod = 1 * time.Second

// TODO(#3456): Remove this buffer once KPA does pod failure diagnostics.
//
// KPA will scale the Deployment down to zero if it fails to activate after ProgressDeadlineSeconds,
// however, after ProgressDeadlineSeconds, the Deployment itself updates its status, which causes
// the Revision to re-reconcile and diagnose pod failures. If we use the same timeout here, we will
// race the Revision reconciler and scale down the pods before it can actually surface the pod errors.
// We should instead do pod failure diagnostics here immediately before scaling down the Deployment.
activationTimeoutBuffer = 10 * time.Second
activationTimeout = time.Duration(rresources.ProgressDeadlineSeconds)*time.Second + activationTimeoutBuffer
)

var probeOptions = []interface{}{
Expand Down Expand Up @@ -146,6 +157,12 @@ func (ks *scaler) handleScaleToZero(pa *pav1alpha1.PodAutoscaler, desiredScale i
}

if pa.Status.IsActivating() { // Active=Unknown
// If we are stuck activating for longer than our progress deadline, presume we cannot succeed and scale to 0.
if pa.Status.CanFailActivation(activationTimeout) {
ks.logger.Infof("%s activation has timed out after %v.", pa.Name, activationTimeout)
return 0, true
}
ks.enqueueCB(pa, activationTimeout)
return scaleUnknown, false
} else if pa.Status.IsReady() { // Active=True
// Don't scale-to-zero if the PA is active
Expand Down Expand Up @@ -221,7 +238,7 @@ func (ks *scaler) applyScale(ctx context.Context, pa *pav1alpha1.PodAutoscaler,
func (ks *scaler) Scale(ctx context.Context, pa *pav1alpha1.PodAutoscaler, desiredScale int32) (int32, error) {
logger := logging.FromContext(ctx)

if desiredScale < 0 {
if desiredScale < 0 && !pa.Status.IsActivating() {
logger.Debug("Metrics are not yet being collected.")
return desiredScale, nil
}
Expand Down
28 changes: 26 additions & 2 deletions pkg/reconciler/autoscaling/kpa/scaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,13 +185,23 @@ func TestScaler(t *testing.T) {
proberfunc: func(*pav1alpha1.PodAutoscaler, http.RoundTripper) (bool, error) { return false, nil },
wantAsyncProbeCount: 1,
}, {
label: "does not scale while activating",
label: "waits to scale to zero while activating until after deadline exceeded",
startReplicas: 1,
scaleTo: 0,
wantReplicas: -1,
wantScaling: false,
paMutation: func(k *pav1alpha1.PodAutoscaler) {
k.Status.MarkActivating("", "")
paMarkActivating(k, time.Now().Add(-activationTimeout/2))
},
wantCBCount: 1,
}, {
label: "scale to zero while activating after deadline exceeded",
startReplicas: 1,
scaleTo: 0,
wantReplicas: 0,
wantScaling: true,
paMutation: func(k *pav1alpha1.PodAutoscaler) {
paMarkActivating(k, time.Now().Add(-(activationTimeout + time.Second)))
},
}, {
label: "scale down to minScale before grace period",
Expand Down Expand Up @@ -241,6 +251,9 @@ func TestScaler(t *testing.T) {
scaleTo: -1, // no metrics
wantReplicas: -1,
wantScaling: false,
paMutation: func(k *pav1alpha1.PodAutoscaler) {
paMarkInactive(k, time.Now())
},
}, {
label: "scales up from zero to desired one",
startReplicas: 0,
Expand All @@ -259,6 +272,9 @@ func TestScaler(t *testing.T) {
scaleTo: -1,
wantReplicas: -1,
wantScaling: false,
paMutation: func(k *pav1alpha1.PodAutoscaler) {
paMarkActive(k, time.Now())
},
}}

for _, test := range tests {
Expand Down Expand Up @@ -381,6 +397,7 @@ func TestDisableScaleToZero(t *testing.T) {
psInformerFactory: presources.NewPodScalableInformerFactory(ctx),
}
pa := newKPA(t, fakeservingclient.Get(ctx), revision)
paMarkActive(pa, time.Now())

conf := defaultConfig()
conf.Autoscaler.EnableScaleToZero = false
Expand Down Expand Up @@ -495,6 +512,13 @@ func paMarkInactive(pa *pav1alpha1.PodAutoscaler, ltt time.Time) {
pa.Status.Conditions[0].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(ltt)}
}

func paMarkActivating(pa *pav1alpha1.PodAutoscaler, ltt time.Time) {
pa.Status.MarkActivating("", "")

// This works because the conditions are sorted alphabetically
pa.Status.Conditions[0].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(ltt)}
}

func checkReplicas(t *testing.T, dynamicClient *fakedynamic.FakeDynamicClient, deployment *v1.Deployment, expectedScale int32) {
t.Helper()

Expand Down
4 changes: 2 additions & 2 deletions pkg/reconciler/revision/resources/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@ const (

// AppLabelKey is the label defining the application's name.
AppLabelKey = "app"
)

var (
// ProgressDeadlineSeconds is the time in seconds we wait for the deployment to
// be ready before considering it failed.
ProgressDeadlineSeconds = int32(120)
)

var (
// See https://github.com/knative/serving/pull/1124#issuecomment-397120430
// for how CPU and memory values were calculated.
queueContainerCPU = resource.MustParse("25m")
Expand Down
2 changes: 1 addition & 1 deletion pkg/reconciler/revision/resources/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ func MakeDeployment(rev *v1alpha1.Revision,
Spec: appsv1.DeploymentSpec{
Replicas: ptr.Int32(1),
Selector: makeSelector(rev),
ProgressDeadlineSeconds: &ProgressDeadlineSeconds,
Comment thread
jonjohnsonjr marked this conversation as resolved.
ProgressDeadlineSeconds: ptr.Int32(ProgressDeadlineSeconds),
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: makeLabels(rev),
Expand Down
2 changes: 1 addition & 1 deletion pkg/reconciler/revision/resources/deploy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ var (
serving.RevisionUID: "1234",
},
},
ProgressDeadlineSeconds: &ProgressDeadlineSeconds,
ProgressDeadlineSeconds: ptr.Int32(ProgressDeadlineSeconds),
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
Expand Down