-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Initial PR for avoiding http retries by activator #1665
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,16 +17,17 @@ limitations under the License. | |
| package main | ||
|
|
||
| import ( | ||
| "bytes" | ||
| "flag" | ||
| "fmt" | ||
| "io" | ||
| "io/ioutil" | ||
| "log" | ||
| "net/http" | ||
| "time" | ||
|
|
||
| "bytes" | ||
| "io" | ||
| "io/ioutil" | ||
| "net/http/httputil" | ||
| "net/url" | ||
| "time" | ||
|
|
||
| "github.com/knative/serving/pkg/logging/logkey" | ||
|
|
||
|
|
@@ -135,14 +136,26 @@ func (a *activationHandler) handler(w http.ResponseWriter, r *http.Request) { | |
| http.Error(w, msg, int(status)) | ||
| return | ||
| } | ||
|
|
||
| var transport http.RoundTripper | ||
| if endpoint.IsVerified() { | ||
| transport = http.DefaultTransport | ||
| if r.ProtoMajor == 2 { | ||
| transport = h2cutil.NewTransport() | ||
| } | ||
| } else { | ||
| transport = retryRoundTripper{ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The PR is meant to avoid retrying HTTP POSTs, but a retry is still possible here if the endpoint isn't verified.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The PR comment refers to this fact. This issue is being resolved with a multi-PR approach. This initial PR avoids HTTP retries by the activator when the user has defined an HTTPGet readinessProbe. The follow-up PR will default to the queue-proxy health check when the user has not defined a readinessProbe.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To clarify my comment if the endpoint |
||
| logger: a.logger, | ||
| } | ||
| } | ||
|
|
||
| target := &url.URL{ | ||
| Scheme: "http", | ||
| Host: fmt.Sprintf("%s:%d", endpoint.FQDN, endpoint.Port), | ||
| } | ||
|
|
||
| proxy := httputil.NewSingleHostReverseProxy(target) | ||
| proxy.Transport = retryRoundTripper{ | ||
| logger: a.logger, | ||
| } | ||
| proxy.Transport = transport | ||
|
|
||
| // TODO: Clear the host to avoid 404's. | ||
| // https://github.com/knative/serving/issues/964 | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,6 +37,19 @@ type revisionID struct { | |
|
|
||
// Endpoint is a fully-qualified domain name / port pair for an active revision.
type Endpoint struct {
	// FQDN is the fully-qualified domain name of the revision's service.
	FQDN string
	// Port is the port on which the revision accepts requests.
	Port int32
	// Verified records the outcome of the readiness verification performed
	// against this endpoint.
	Verified VerificationStatus
}

// VerificationStatus describes the result of verifying that an Endpoint can
// be reached.
type VerificationStatus string

const (
	// Unknown means no verification result is available for the endpoint.
	Unknown VerificationStatus = "Unknown"
	// Pass means the endpoint was probed successfully.
	Pass VerificationStatus = "Pass"
	// Fail means the endpoint was probed and never answered successfully.
	Fail VerificationStatus = "Fail"
)

// IsVerified reports whether the endpoint's verification status is Pass.
// Unknown, Fail and the zero value all count as not verified.
func (e *Endpoint) IsVerified() bool {
	return e.Verified == Pass
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,123 @@ | ||
| /* | ||
| Copyright 2018 The Knative Authors | ||
|
|
||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||
| you may not use this file except in compliance with the License. | ||
| You may obtain a copy of the License at | ||
|
|
||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
||
| Unless required by applicable law or agreed to in writing, software | ||
| distributed under the License is distributed on an "AS IS" BASIS, | ||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| See the License for the specific language governing permissions and | ||
| limitations under the License. | ||
| */ | ||
| package activator | ||
|
|
||
| import ( | ||
| "errors" | ||
| "fmt" | ||
| "net/http" | ||
| "net/url" | ||
| "time" | ||
|
|
||
| "github.com/knative/serving/pkg/apis/serving/v1alpha1" | ||
| "go.uber.org/zap" | ||
| "k8s.io/api/core/v1" | ||
| "k8s.io/apimachinery/pkg/util/intstr" | ||
| ) | ||
|
|
||
const (
	// maxRetry bounds how many readiness probe attempts are made for one
	// endpoint before verification is reported as failed.
	maxRetry = 60

	// defaultPeriodSeconds and defaultTimeoutSeconds are expressed in whole
	// seconds, like the v1.Probe fields they are assigned to. Callers multiply
	// them by time.Second, so they must not be pre-multiplied by a
	// time.Duration here: int32(1 * time.Second) is 1,000,000,000 "seconds".
	//
	// NOTE(review): Kubernetes applies its own probe defaults in
	// pkg/apis/core/v1/defaults.go; consider reusing those instead.
	defaultPeriodSeconds  = int32(1)
	defaultTimeoutSeconds = int32(1)
)
|
|
||
| func verifyRevisionRoutability(revision *v1alpha1.Revision, endpoint *Endpoint, logger *zap.SugaredLogger) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not clear that you're using a reference to an
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. well, could change the function name to |
||
| // Proceed only if user has a HTTPGet readinessProbe defined | ||
| if revision.Spec.Container.ReadinessProbe == nil || | ||
| revision.Spec.Container.ReadinessProbe.HTTPGet == nil { | ||
| endpoint.Verified = Unknown | ||
| return | ||
| } | ||
|
|
||
| endpoint.Verified = Fail | ||
| probe := createHttpGetProbe(revision, *endpoint) | ||
|
|
||
| // Number of seconds after the readiness probes are initiated | ||
| time.Sleep(time.Second * int32ToDuration(probe.InitialDelaySeconds)) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We essentially know the endpoint is ready, but not necessarily from all nodes. I'm thinking we can skip the initial delay and hope that it'll optimistically work.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We did have some hesitation in using it. But the thinking was that a user directive to apply an initial delay should be respected. The default initial delay is 0 seconds. As an example, assume for an app, the actual initial delay is longer than 60 seconds; then the endpoint verification would fail if the default retry interval of 1 second and max retry limit of 60 is used. |
||
|
|
||
| retryCount := 1 | ||
| retryInterval := time.Second * int32ToDuration(probe.PeriodSeconds) | ||
|
|
||
| for retryCount = 1; retryCount < maxRetry; retryCount++ { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer to go with an exponential backoff. E.g. #1814. Let's see if you can reuse @markusthoemmes' retry mechanism. Or if his mechanism can be modified for you to use it. |
||
| ready, err := checkHttpGetProbe(probe, logger) | ||
| if err != nil { | ||
| logger.Errorf("error checking probe", zap.String("error", err.Error())) | ||
| } | ||
|
|
||
| if ready { | ||
| endpoint.Verified = Pass | ||
| break | ||
| } | ||
|
|
||
| // How often (in seconds) to perform the probe | ||
| time.Sleep(retryInterval) | ||
| } | ||
|
|
||
| logger.Infof("took %d probe retries for readiness", retryCount, zap.Any("endpoint", endpoint)) | ||
| } | ||
|
|
||
| // Function creates HTTP readiness probe for revision | ||
| func createHttpGetProbe(revision *v1alpha1.Revision, endpoint Endpoint) *v1.Probe { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're transforming a (revision, endpoint) to a probe. Then in the subsequent steps you're converting the probe to an http get call/request. You can go straight to a (revision, endpoint) -> http.Request/call
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair. This approach is a remnant of the initial effort to tackle both HttpGet and TCPSocket based readiness probes. Support for TCPSocket probe is currently blocked by issue #1241. More details in this issue comment. Will update. |
||
| probe := revision.Spec.Container.ReadinessProbe.DeepCopy() | ||
| probe.HTTPGet.Scheme = "http" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're making assumptions on the scheme - just confirm it's always http You can use the defaulter function to set the default value for this and the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, will make the change. I did consider using the defaulter functions from the kubernetes library. Can't remember why I backed out of it :-/ |
||
| probe.HTTPGet.Host = endpoint.FQDN | ||
| probe.HTTPGet.Port.Type = intstr.Int | ||
| probe.HTTPGet.Port.IntVal = endpoint.Port | ||
|
|
||
| if probe.TimeoutSeconds == 0 { | ||
| probe.TimeoutSeconds = defaultTimeoutSeconds | ||
| } | ||
| if probe.PeriodSeconds == 0 { | ||
| probe.PeriodSeconds = defaultPeriodSeconds | ||
| } | ||
|
|
||
| return probe | ||
| } | ||
|
|
||
| func checkHttpGetProbe(probe *v1.Probe, logger *zap.SugaredLogger) (ready bool, err error) { | ||
| if probe == nil { | ||
| return false, errors.New("probe cannot be nil") | ||
| } | ||
|
|
||
| if probe.HTTPGet == nil { | ||
| return false, errors.New("probe HTTPGet cannot be nil") | ||
| } | ||
|
|
||
| host := fmt.Sprintf("%s:%d", probe.HTTPGet.Host, probe.HTTPGet.Port.IntVal) | ||
| if err != nil { | ||
| return false, err | ||
| } | ||
|
|
||
| probeUrl := url.URL{ | ||
| Scheme: string(probe.HTTPGet.Scheme), | ||
| Host: host, | ||
| Path: probe.HTTPGet.Path, | ||
| } | ||
|
|
||
| logger.Debug("checking probe url: %s", probeUrl.String()) | ||
|
|
||
| client := http.Client{Timeout: time.Second * int32ToDuration(probe.TimeoutSeconds)} | ||
| res, err := client.Get(probeUrl.String()) | ||
| if err != nil { | ||
| return false, err | ||
| } | ||
|
|
||
| return res.StatusCode == http.StatusOK, nil | ||
| } | ||
|
|
||
// int32ToDuration converts i into a time.Duration holding the same count.
// The result is unit-less (i nanoseconds); callers scale it, e.g. by
// multiplying with time.Second.
func int32ToDuration(i int32) time.Duration {
	d := time.Duration(i)
	return d
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's probably still worth using the retryRoundTripper for HTTP GETs
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not clear as to why the retry approach is better when the readinessProbe is defined by the user. The readinessProbe is a clear indicator of when the app is ready to receive requests.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For GETs a request could still fail for whatever reason even if readiness probe succeeds
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Once the user's application says it's ready, it will be put into service. That's true whether it's the first pod or the 100th. The only reason we're retrying at this level (in the activator) is because we know the network programming is eventually consistent, which matters more for the first pod.
Once we've verified we can reach the service with readiness probing, we should just forward the request. If it fails because of something in the user's application, we should just rely on a higher level retry (or not).