From cc4985af58c923e48c443323d5f3c36eb00f1850 Mon Sep 17 00:00:00 2001
From: Damien Grisonnet
Date: Wed, 12 Jan 2022 16:27:05 +0100
Subject: [PATCH 1/4] test/extended: expose apiserver request latency

Signed-off-by: Damien Grisonnet
---
 test/extended/apiserver/latency.go | 75 ++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 test/extended/apiserver/latency.go

diff --git a/test/extended/apiserver/latency.go b/test/extended/apiserver/latency.go
new file mode 100644
index 000000000000..0b56fe7436b4
--- /dev/null
+++ b/test/extended/apiserver/latency.go
@@ -0,0 +1,75 @@
+package apiserver
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	g "github.com/onsi/ginkgo"
+	o "github.com/onsi/gomega"
+
+	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
+
+	exutil "github.com/openshift/origin/test/extended/util"
+	promhelper "github.com/openshift/origin/test/extended/util/prometheus"
+)
+
+var _ = g.Describe("[sig-api-machinery][Late] API Server latency", func() {
+	defer g.GinkgoRecover()
+
+	oc := exutil.NewCLIWithoutNamespace("apiserver-latency")
+
+	// TODO: refine latency thresholds based on actual CI run results
+	//
+	// Latency thresholds should be calculated based on the following queries:
+	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",scope=~"resource|"}[$testDuration])) by (le,verb))
+	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",scope="namespace"}[$testDuration])) by (le,verb))
+	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",scope="cluster"}[$testDuration])) by (le,verb))
+	g.It("should verify that the apiserver requests latency are within expected thresholds", func() {
+		promClient := oc.NewPrometheusClient(context.TODO())
+
+		// 99th percentile of resource-scoped requests latency
+		scope := "resource|"
+		expectRequestLatency(promClient, "GET", scope, parseDuration("100ms"))
+		expectRequestLatency(promClient, "POST", scope, parseDuration("100ms"))
+		expectRequestLatency(promClient, "PUT", scope, parseDuration("100ms"))
+		expectRequestLatency(promClient, "PATCH", scope, parseDuration("100ms"))
+		expectRequestLatency(promClient, "DELETE", scope, parseDuration("100ms"))
+
+		// 99th percentile of namespace-scoped requests latency
+		scope = "namespace"
+		expectRequestLatency(promClient, "LIST", scope, parseDuration("500ms"))
+		expectRequestLatency(promClient, "POST", scope, parseDuration("500ms"))
+		expectRequestLatency(promClient, "DELETE", scope, parseDuration("500ms"))
+
+		// 99th percentile of cluster-scoped requests latency
+		scope = "cluster"
+		expectRequestLatency(promClient, "LIST", scope, parseDuration("5s"))
+		expectRequestLatency(promClient, "POST", scope, parseDuration("5s"))
+	})
+})
+
+func parseDuration(s string) time.Duration {
+	d, err := time.ParseDuration(s)
+	o.Expect(err).NotTo(o.HaveOccurred())
+	return d
+}
+
+func expectRequestLatency(prometheusClient prometheusv1.API, verb string, scope string, threshold time.Duration) {
+	resp, err := promhelper.RunQuery(
+		context.TODO(),
+		prometheusClient,
+		fmt.Sprintf(
+			`histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb="%s",scope=~"%s"}[%s])) by (le))`,
+			verb, scope, exutil.DurationSinceStartInSeconds().String(),
+		),
+	)
+
+	o.Expect(err).NotTo(o.HaveOccurred())
+	o.Expect(resp.Data.Result).NotTo(o.BeEmpty())
+
+	latency := time.Duration(float64(resp.Data.Result[0].Value) * float64(time.Second))
+	o.Expect(latency.Seconds()).Should(o.BeNumerically("<", threshold.Seconds()),
+		fmt.Sprintf("expected 99th percentile of apiserver request latency with verb=%q and scope=%q to be less than %s, got %s", verb, scope, threshold, latency),
+	)
+}
From f7dfe18f8d8a63e6f43efe629d9c0ee562b91fbf Mon Sep 17 00:00:00 2001
From: Damien Grisonnet
Date: Wed, 12 Jan 2022 22:49:19 +0100
Subject: [PATCH 2/4] test/extended: regenerate

Signed-off-by: Damien Grisonnet
---
 .../util/annotate/generated/zz_generated.annotations.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go
index a05a5becb7e7..6efad9c90476 100644
--- a/test/extended/util/annotate/generated/zz_generated.annotations.go
+++ b/test/extended/util/annotate/generated/zz_generated.annotations.go
@@ -333,6 +333,8 @@ var annotations = map[string]string{
 
 	"[Top Level] [sig-api-machinery][Feature:ServerSideApply] Server-Side Apply should work for user.openshift.io/v1, Resource=users": "should work for user.openshift.io/v1, Resource=users [Suite:openshift/conformance/parallel]",
 
+	"[Top Level] [sig-api-machinery][Late] API Server latency should verify that the apiserver requests latency are within expected thresholds": "should verify that the apiserver requests latency are within expected thresholds [Suite:openshift/conformance/parallel]",
+
 	"[Top Level] [sig-apps] CronJob should be able to schedule after more than 100 missed schedule": "should be able to schedule after more than 100 missed schedule [Suite:openshift/conformance/parallel] [Suite:k8s]",
 
 	"[Top Level] [sig-apps] CronJob should delete failed finished jobs with limit of one job": "should delete failed finished jobs with limit of one job [Suite:openshift/conformance/parallel] [Suite:k8s]",

From a882ff31b9d54dd7b14c59f3affc325c44793b7e Mon Sep 17 00:00:00 2001
From: Damien Grisonnet
Date: Fri, 14 Jan 2022 11:50:33 +0100
Subject: [PATCH 3/4] test/extended: refine apiserver latency thresholds

Signed-off-by: Damien Grisonnet
---
 test/extended/apiserver/latency.go | 35 ++++++++++++++++--------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/test/extended/apiserver/latency.go b/test/extended/apiserver/latency.go
index 0b56fe7436b4..4071b694526d 100644
--- a/test/extended/apiserver/latency.go
+++ b/test/extended/apiserver/latency.go
@@ -14,38 +14,41 @@ import (
 	promhelper "github.com/openshift/origin/test/extended/util/prometheus"
 )
 
+const (
+	apiserverReadVerbs  = "GET|LIST"
+	apiserverWriteVerbs = "POST|PUT|PATCH|DELETE"
+)
+
 var _ = g.Describe("[sig-api-machinery][Late] API Server latency", func() {
 	defer g.GinkgoRecover()
 
 	oc := exutil.NewCLIWithoutNamespace("apiserver-latency")
 
-	// TODO: refine latency thresholds based on actual CI run results
-	//
+	// Latency thresholds defined below are based on the following CI jobs:
+	// - https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/26739/pull-ci-openshift-origin-master-e2e-gcp/1481905976666755072
+	// - https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/26739/pull-ci-openshift-origin-master-e2e-gcp/1481879797410828288
+	// - https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/26739/pull-ci-openshift-origin-master-e2e-aws-fips/1481879797377273856
+	// - https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/26739/pull-ci-openshift-origin-master-e2e-aws-fips/1481905976616423424
 	// Latency thresholds should be calculated based on the following queries:
-	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",scope=~"resource|"}[$testDuration])) by (le,verb))
-	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",scope="namespace"}[$testDuration])) by (le,verb))
-	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",scope="cluster"}[$testDuration])) by (le,verb))
+	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"GET|LIST"}[$testDuration])) by (le,scope))
+	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[$testDuration])) by (le,scope))
 	g.It("should verify that the apiserver requests latency are within expected thresholds", func() {
 		promClient := oc.NewPrometheusClient(context.TODO())
 
 		// 99th percentile of resource-scoped requests latency
 		scope := "resource|"
-		expectRequestLatency(promClient, "GET", scope, parseDuration("100ms"))
-		expectRequestLatency(promClient, "POST", scope, parseDuration("100ms"))
-		expectRequestLatency(promClient, "PUT", scope, parseDuration("100ms"))
-		expectRequestLatency(promClient, "PATCH", scope, parseDuration("100ms"))
-		expectRequestLatency(promClient, "DELETE", scope, parseDuration("100ms"))
+		expectRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("200ms"))    // got 130ms
+		expectRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("1s250ms")) // got 850ms
 
 		// 99th percentile of namespace-scoped requests latency
 		scope = "namespace"
-		expectRequestLatency(promClient, "LIST", scope, parseDuration("500ms"))
-		expectRequestLatency(promClient, "POST", scope, parseDuration("500ms"))
-		expectRequestLatency(promClient, "DELETE", scope, parseDuration("500ms"))
+		expectRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("150ms"))    // got 100ms
+		expectRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("1s350ms")) // got 900ms
 
 		// 99th percentile of cluster-scoped requests latency
 		scope = "cluster"
-		expectRequestLatency(promClient, "LIST", scope, parseDuration("5s"))
-		expectRequestLatency(promClient, "POST", scope, parseDuration("5s"))
+		expectRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("1s350ms")) // got 900ms
+		expectRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("300ms"))  // got 200ms
 	})
 })
 
@@ -60,7 +63,7 @@ func expectRequestLatency(prometheusClient prometheusv1.API, verb string, scope
 		context.TODO(),
 		prometheusClient,
 		fmt.Sprintf(
-			`histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb="%s",scope=~"%s"}[%s])) by (le))`,
+			`histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"%s",scope=~"%s"}[%s])) by (le))`,
 			verb, scope, exutil.DurationSinceStartInSeconds().String(),
 		),
 	)
From d67f738b82fac7f7a533dee7eb4351f296f46275 Mon Sep 17 00:00:00 2001
From: Damien Grisonnet
Date: Thu, 27 Jan 2022 19:21:02 +0100
Subject: [PATCH 4/4] test/extended: check apiserver average latency

Signed-off-by: Damien Grisonnet
---
 test/extended/apiserver/latency.go | 64 +++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/test/extended/apiserver/latency.go b/test/extended/apiserver/latency.go
index 4071b694526d..fc511b0e8ab0 100644
--- a/test/extended/apiserver/latency.go
+++ b/test/extended/apiserver/latency.go
@@ -10,6 +10,7 @@ import (
 
 	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
 
+	testresult "github.com/openshift/origin/pkg/test/ginkgo/result"
 	exutil "github.com/openshift/origin/test/extended/util"
 	promhelper "github.com/openshift/origin/test/extended/util/prometheus"
 )
@@ -24,31 +25,37 @@ var _ = g.Describe("[sig-api-machinery][Late] API Server latency", func() {
 
 	oc := exutil.NewCLIWithoutNamespace("apiserver-latency")
 
-	// Latency thresholds defined below are based on the following CI jobs:
-	// - https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/26739/pull-ci-openshift-origin-master-e2e-gcp/1481905976666755072
-	// - https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/26739/pull-ci-openshift-origin-master-e2e-gcp/1481879797410828288
-	// - https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/26739/pull-ci-openshift-origin-master-e2e-aws-fips/1481879797377273856
-	// - https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/26739/pull-ci-openshift-origin-master-e2e-aws-fips/1481905976616423424
+	// Latency thresholds defined below are based on the following search.ci graphs:
+	// - https://search.ci.openshift.org/graph/metrics?metric=cluster%3Aapi%3Aread%3Arequests%3Alatency%3Atotal%3Aavg&job=periodic-ci-openshift-release-master-ci-4.10-e2e-aws&job=periodic-ci-openshift-release-master-ci-4.10-e2e-azure&job=periodic-ci-openshift-release-master-ci-4.10-e2e-gcp&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-aws-upgrade&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-azure-upgrade&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-gcp-upgrade
+	// - https://search.ci.openshift.org/graph/metrics?metric=cluster%3Aapi%3Aread%3Arequests%3Alatency%3Atotal%3Aquantile&job=periodic-ci-openshift-release-master-ci-4.10-e2e-aws&job=periodic-ci-openshift-release-master-ci-4.10-e2e-azure&job=periodic-ci-openshift-release-master-ci-4.10-e2e-gcp&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-aws-upgrade&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-azure-upgrade&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-gcp-upgrade
+	// - https://search.ci.openshift.org/graph/metrics?metric=cluster%3Aapi%3Awrite%3Arequests%3Alatency%3Atotal%3Aavg&job=periodic-ci-openshift-release-master-ci-4.10-e2e-aws&job=periodic-ci-openshift-release-master-ci-4.10-e2e-azure&job=periodic-ci-openshift-release-master-ci-4.10-e2e-gcp&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-aws-upgrade&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-azure-upgrade&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-gcp-upgrade
+	// - https://search.ci.openshift.org/graph/metrics?metric=cluster%3Aapi%3Awrite%3Arequests%3Alatency%3Atotal%3Aquantile&job=periodic-ci-openshift-release-master-ci-4.10-e2e-aws&job=periodic-ci-openshift-release-master-ci-4.10-e2e-azure&job=periodic-ci-openshift-release-master-ci-4.10-e2e-gcp&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-aws-upgrade&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-azure-upgrade&job=periodic-ci-openshift-release-master-ci-4.10-upgrade-from-stable-4.9-e2e-gcp-upgrade
 	// Latency thresholds should be calculated based on the following queries:
 	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"GET|LIST"}[$testDuration])) by (le,scope))
 	// - histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[$testDuration])) by (le,scope))
 	g.It("should verify that the apiserver requests latency are within expected thresholds", func() {
 		promClient := oc.NewPrometheusClient(context.TODO())
 
-		// 99th percentile of resource-scoped requests latency
+		// resource-scoped requests latency
 		scope := "resource|"
-		expectRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("200ms"))    // got 130ms
-		expectRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("1s250ms")) // got 850ms
+		expectAverageRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("275ms"))         // got 200ms in gcp
+		expectAverageRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("120ms"))        // got 90ms in gcp
+		expect99PercentileRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("1s"))       // got 750ms in azure
+		expect99PercentileRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("1s600ms")) // got 1s200ms in azure
 
-		// 99th percentile of namespace-scoped requests latency
+		// namespace-scoped requests latency
 		scope = "namespace"
-		expectRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("150ms"))    // got 100ms
-		expectRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("1s350ms")) // got 900ms
+		expectAverageRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("50ms"))          // got 35ms in azure
+		expectAverageRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("100ms"))        // got 75ms in azure
+		expect99PercentileRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("1s200ms"))  // got 900ms in azure
+		expect99PercentileRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("1s300ms")) // got 1s in azure
 
-		// 99th percentile of cluster-scoped requests latency
+		// cluster-scoped requests latency
 		scope = "cluster"
-		expectRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("1s350ms")) // got 900ms
-		expectRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("300ms"))  // got 200ms
+		expectAverageRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("250ms"))         // got 180ms in azure
+		expectAverageRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("120ms"))        // got 90ms in azure
+		expect99PercentileRequestLatency(promClient, apiserverReadVerbs, scope, parseDuration("6s"))       // got 4s in gcp
+		expect99PercentileRequestLatency(promClient, apiserverWriteVerbs, scope, parseDuration("1s300ms")) // got 1s in gcp
 	})
 })
 
@@ -58,7 +65,26 @@ func parseDuration(s string) time.Duration {
 	return d
 }
 
-func expectRequestLatency(prometheusClient prometheusv1.API, verb string, scope string, threshold time.Duration) {
+func expectAverageRequestLatency(prometheusClient prometheusv1.API, verb string, scope string, threshold time.Duration) {
+	resp, err := promhelper.RunQuery(
+		context.TODO(),
+		prometheusClient,
+		fmt.Sprintf(
+			`sum(rate(apiserver_request_duration_seconds_sum{job="apiserver",verb=~"%s",scope=~"%s"}[%s])) / sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"%s",scope=~"%s"}[%s]))`,
+			verb, scope, exutil.DurationSinceStartInSeconds().String(), verb, scope, exutil.DurationSinceStartInSeconds(),
+		),
+	)
+	o.Expect(err).NotTo(o.HaveOccurred())
+	o.Expect(resp.Data.Result).NotTo(o.BeEmpty())
+
+	latency := time.Duration(float64(resp.Data.Result[0].Value) * float64(time.Second))
+	if latency.Seconds() > threshold.Seconds() {
+		// Flake to gather more insights on the impact of the test.
+		testresult.Flakef("expected average apiserver request latency with verb=%q and scope=%q to be less than %s, got %s", verb, scope, threshold, latency)
+	}
+}
+
+func expect99PercentileRequestLatency(prometheusClient prometheusv1.API, verb string, scope string, threshold time.Duration) {
 	resp, err := promhelper.RunQuery(
 		context.TODO(),
 		prometheusClient,
@@ -67,12 +93,12 @@ func expectRequestLatency(prometheusClient prometheusv1.API, verb string, scope
 			verb, scope, exutil.DurationSinceStartInSeconds().String(),
 		),
 	)
-
 	o.Expect(err).NotTo(o.HaveOccurred())
 	o.Expect(resp.Data.Result).NotTo(o.BeEmpty())
 
 	latency := time.Duration(float64(resp.Data.Result[0].Value) * float64(time.Second))
-	o.Expect(latency.Seconds()).Should(o.BeNumerically("<", threshold.Seconds()),
-		fmt.Sprintf("expected 99th percentile of apiserver request latency with verb=%q and scope=%q to be less than %s, got %s", verb, scope, threshold, latency),
-	)
+	if latency.Seconds() > threshold.Seconds() {
+		// Flake to gather more insights on the impact of the test.
+		testresult.Flakef("expected 99th percentile of apiserver request latency with verb=%q and scope=%q to be less than %s, got %s", verb, scope, threshold, latency)
+	}
 }
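
For reference, the threshold-derivation queries quoted in the comments above can also be run ad hoc against a cluster's Prometheus with the same client library the test imports (prometheus/client_golang). The sketch below is illustrative only and is not part of the patches: the Prometheus address is a placeholder, authentication/TLS setup is omitted, and a fixed 1h range stands in for $testDuration; inside the test suite the query instead goes through oc.NewPrometheusClient and promhelper.RunQuery using the elapsed test duration.

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
)

func main() {
	// Placeholder address; a real cluster endpoint additionally needs a bearer token and TLS config.
	client, err := api.NewClient(api.Config{Address: "https://prometheus-k8s.example:9091"})
	if err != nil {
		panic(err)
	}
	v1api := prometheusv1.NewAPI(client)

	// 99th percentile of read (GET|LIST) request latency per scope over the last hour,
	// mirroring the query shape used in the comments above.
	query := `histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"GET|LIST"}[1h])) by (le, scope))`

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	result, warnings, err := v1api.Query(ctx, query, time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("query warnings:", warnings)
	}

	// An instant vector query yields one sample per scope; convert the value
	// (in seconds) to a time.Duration the same way the test does.
	for _, sample := range result.(model.Vector) {
		latency := time.Duration(float64(sample.Value) * float64(time.Second))
		fmt.Printf("scope=%q p99=%s\n", sample.Metric["scope"], latency)
	}
}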