Skip to content
This repository was archived by the owner on Mar 3, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 28 additions & 18 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/metrics"

"github.com/spf13/pflag"

Expand All @@ -41,6 +42,7 @@ import (
"github.com/operator-framework/catalogd/internal/version"
corecontrollers "github.com/operator-framework/catalogd/pkg/controllers/core"
"github.com/operator-framework/catalogd/pkg/features"
catalogdmetrics "github.com/operator-framework/catalogd/pkg/metrics"
"github.com/operator-framework/catalogd/pkg/profile"
"github.com/operator-framework/catalogd/pkg/storage"

Expand Down Expand Up @@ -124,24 +126,32 @@ func main() {
os.Exit(1)
}

if err := os.MkdirAll(storageDir, 0700); err != nil {
setupLog.Error(err, "unable to create storage directory for catalogs")
}
localStorage := storage.LocalDir{RootDir: storageDir}
shutdownTimeout := 30 * time.Second
catalogServer := server.Server{
Kind: "catalogs",
Server: &http.Server{
Addr: catalogServerAddr,
Handler: localStorage.StorageServerHandler(),
ReadTimeout: 5 * time.Second,
WriteTimeout: 10 * time.Second,
},
ShutdownTimeout: &shutdownTimeout,
}
if err := mgr.Add(&catalogServer); err != nil {
setupLog.Error(err, "unable to start catalog server")
os.Exit(1)
var localStorage storage.Instance
if features.CatalogdFeatureGate.Enabled(features.HTTPServer) {
metrics.Registry.MustRegister(catalogdmetrics.RequestDurationMetric)

if err := os.MkdirAll(storageDir, 0700); err != nil {
setupLog.Error(err, "unable to create storage directory for catalogs")
Comment thread
everettraven marked this conversation as resolved.
os.Exit(1)
}

localStorage = storage.LocalDir{RootDir: storageDir}
shutdownTimeout := 30 * time.Second
catalogServer := server.Server{
Kind: "catalogs",
Server: &http.Server{
Addr: catalogServerAddr,
Handler: catalogdmetrics.AddMetricsToHandler(localStorage.StorageServerHandler()),
ReadTimeout: 5 * time.Second,
WriteTimeout: 10 * time.Second,
},
ShutdownTimeout: &shutdownTimeout,
}

if err := mgr.Add(&catalogServer); err != nil {
setupLog.Error(err, "unable to start catalog server")
os.Exit(1)
}
}

if err = (&corecontrollers.CatalogReconciler{
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/onsi/ginkgo/v2 v2.9.7
github.com/onsi/gomega v1.27.7
github.com/operator-framework/operator-registry v1.27.1
github.com/prometheus/client_golang v1.14.0
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.8.1
k8s.io/api v0.26.1
Expand Down Expand Up @@ -59,7 +60,6 @@ require (
github.com/operator-framework/api v0.17.4-0.20230223191600-0131a6301e42 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
Expand Down
40 changes: 40 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package metrics

import (
"net/http"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

// RequestDurationMetricName is the name given to the HTTP request duration
// histogram defined in this package.
const RequestDurationMetricName = "catalogd_http_request_duration_seconds"

// RequestDurationMetric records the request durations needed to calculate the Apdex Score.
// If you visualize these metrics in Grafana, connected to a Prometheus data
// source that scrapes them, you can create a panel that uses the following
// queries and expression to calculate the Apdex Score for T = 0.5 (so 4T = 2):
// Query A: sum(catalogd_http_request_duration_seconds_bucket{code!~"5..",le="0.5"})
// Query B: sum(catalogd_http_request_duration_seconds_bucket{code!~"5..",le="2"})
// Query C: sum(catalogd_http_request_duration_seconds_count)
// Expression for Apdex Score: ($A + (($B - $A) / 2)) / $C
var (
RequestDurationMetric = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: RequestDurationMetricName,
Help: "Histogram of request duration in seconds",
// Create a bucket for every 100 ms up to 1 s, and ensure that each of those values multiplied by 4
// also exists as a bucket. Include a 10 s bucket to capture very long-running requests. This lets us
// easily calculate Apdex Scores for T up to 1 second, and by using various mathematical formulas we
// should be able to estimate Apdex Scores for T up to 2.5. Having a larger range of buckets
// will allow us to more easily calculate health indicators other than the Apdex Score.
Buckets: []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.2, 1.6, 2, 2.4, 2.8, 3.2, 3.6, 4, 10},
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it matter that our write timeout is 10s and the max bucket duration is 10s?

Seems like we'll only ever get whatever error code maps to that timeout in the 10s bucket.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. If anything, I think it means that we now have buckets that capture all possible response times, which allows us to calculate more metrics on the fly. This is all based on #156 (comment): if no requests take more than 10s, we will never have anything in the "Inf" bucket.

That being said, I could be wrong - I don't have enough experience in this area to truly know and am making an assumption with what I currently know

Copy link
Copy Markdown
Collaborator Author

@everettraven everettraven Sep 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like we'll only ever get whatever error code maps to that timeout in the 10s bucket.

Any response time > 4s and <= 10s will fall in that 10s bucket

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, got it. sgtm.

},
[]string{"code"},
)
)

// AddMetricsToHandler returns handler wrapped by
// promhttp.InstrumentHandlerDuration, with RequestDurationMetric supplied as
// the observer for the wrapped handler's requests.
func AddMetricsToHandler(handler http.Handler) http.Handler {
	var instrumented http.Handler = promhttp.InstrumentHandlerDuration(RequestDurationMetric, handler)
	return instrumented
}