diff --git a/deploy/infrastructure/dependencies/terraform-commons-dss/helm.tf b/deploy/infrastructure/dependencies/terraform-commons-dss/helm.tf index a39aeecc1..2e247483d 100644 --- a/deploy/infrastructure/dependencies/terraform-commons-dss/helm.tf +++ b/deploy/infrastructure/dependencies/terraform-commons-dss/helm.tf @@ -91,6 +91,32 @@ resource "local_file" "helm_chart_values" { global = { cloudProvider = var.kubernetes_cloud_provider_name } + + monitoring = { + enabled = false + }, + + prometheus = { + server = { + persistentVolume = { + storageClass = var.kubernetes_storage_class + } + + global = { + external_labels = { + environment = "dev" + k8s_cluster = var.cluster_name + } + } + } + } + + grafana = { + persistence = { + storageClassName = var.kubernetes_storage_class + } + } + }) : yamlencode({ cockroachdb = { enabled = false @@ -261,6 +287,31 @@ resource "local_file" "helm_chart_values" { global = { cloudProvider = var.kubernetes_cloud_provider_name } + + monitoring = { + enabled = false + }, + + prometheus = { + server = { + persistentVolume = { + storageClass = var.kubernetes_storage_class + } + + global = { + external_labels = { + environment = "dev" + k8s_cluster = var.cluster_name + } + } + } + } + + grafana = { + persistence = { + storageClassName = var.kubernetes_storage_class + } + } }) } diff --git a/deploy/infrastructure/dependencies/terraform-commons-dss/variables.gen.tf b/deploy/infrastructure/dependencies/terraform-commons-dss/variables.gen.tf index fdf117642..8ddb417b8 100644 --- a/deploy/infrastructure/dependencies/terraform-commons-dss/variables.gen.tf +++ b/deploy/infrastructure/dependencies/terraform-commons-dss/variables.gen.tf @@ -55,6 +55,15 @@ variable "node_count" { } +variable "cluster_name" { + type = string + description = <<-EOT + Name of the kubernetes cluster that will host this DSS instance (should generally describe the DSS instance being hosted) + + Example: `dss-che-1` + EOT +} + variable "image" { type = string 
description = <<-EOT diff --git a/deploy/infrastructure/modules/terraform-aws-dss/main.tf b/deploy/infrastructure/modules/terraform-aws-dss/main.tf index b328590b1..85581914c 100644 --- a/deploy/infrastructure/modules/terraform-aws-dss/main.tf +++ b/deploy/infrastructure/modules/terraform-aws-dss/main.tf @@ -16,6 +16,7 @@ module "terraform-aws-kubernetes" { module "terraform-commons-dss" { # See variables.tf for variables description. + cluster_name = var.cluster_name image = var.image image_pull_secret = var.image_pull_secret kubernetes_namespace = var.kubernetes_namespace diff --git a/deploy/infrastructure/modules/terraform-google-dss/main.tf b/deploy/infrastructure/modules/terraform-google-dss/main.tf index 90754b390..88969912c 100644 --- a/deploy/infrastructure/modules/terraform-google-dss/main.tf +++ b/deploy/infrastructure/modules/terraform-google-dss/main.tf @@ -16,6 +16,7 @@ module "terraform-google-kubernetes" { module "terraform-commons-dss" { # See variables.tf for variables description. 
+ cluster_name = var.cluster_name image = var.image kubernetes_namespace = var.kubernetes_namespace kubernetes_storage_class = var.google_kubernetes_storage_class diff --git a/deploy/infrastructure/utils/variables.py b/deploy/infrastructure/utils/variables.py index d28009670..476ef6b48 100755 --- a/deploy/infrastructure/utils/variables.py +++ b/deploy/infrastructure/utils/variables.py @@ -23,7 +23,7 @@ # Variables per project # For all */terraform-* -GLOBAL_VARIABLES = ["app_hostname", "db_hostname_suffix", "datastore_type", "node_count"] +GLOBAL_VARIABLES = ["app_hostname", "db_hostname_suffix", "datastore_type", "node_count", "cluster_name"] # dependencies/terraform-commons-dss COMMONS_DSS_VARIABLES = GLOBAL_VARIABLES + [ @@ -59,7 +59,6 @@ # dependencies/terraform-*-kubernetes COMMON_KUBERNETES_VARIABLES = GLOBAL_VARIABLES + [ - "cluster_name", "kubernetes_version", ] diff --git a/deploy/services/helm-charts/dss/Chart.yaml b/deploy/services/helm-charts/dss/Chart.yaml index cea6edfe3..a0b0dd2fa 100644 --- a/deploy/services/helm-charts/dss/Chart.yaml +++ b/deploy/services/helm-charts/dss/Chart.yaml @@ -13,3 +13,11 @@ dependencies: repository: https://interuss.github.io/yugabyte-charts/ version: 2025.1.0 condition: yugabyte.enabled + - name: prometheus + repository: https://prometheus-community.github.io/helm-charts + version: 27.51.0 + condition: monitoring.enabled + - name: grafana + repository: https://grafana.github.io/helm-charts + version: 10.3.1 + condition: monitoring.enabled diff --git a/deploy/services/helm-charts/dss/grafana_dashboards b/deploy/services/helm-charts/dss/grafana_dashboards new file mode 120000 index 000000000..dbf1d20b1 --- /dev/null +++ b/deploy/services/helm-charts/dss/grafana_dashboards @@ -0,0 +1 @@ +../../tanka/grafana_dashboards \ No newline at end of file diff --git a/deploy/services/helm-charts/dss/templates/grafana-dashboards.yaml b/deploy/services/helm-charts/dss/templates/grafana-dashboards.yaml new file mode 100644 index 
000000000..9739db4b6 --- /dev/null +++ b/deploy/services/helm-charts/dss/templates/grafana-dashboards.yaml @@ -0,0 +1,17 @@ +{{- if $.Values.monitoring.enabled }} + +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: dss-grafana-dashboards-default + labels: + grafana_dashboard: "1" +data: +{{- range $path, $bytes := .Files.Glob "grafana_dashboards/*.json" }} + {{ base $path }}: |- +{{ $.Files.Get $path | indent 4 }} + {{- end }} + +{{- end }} diff --git a/deploy/services/helm-charts/dss/values.example.yaml b/deploy/services/helm-charts/dss/values.example.yaml index 3c96e137a..283c2f5f6 100644 --- a/deploy/services/helm-charts/dss/values.example.yaml +++ b/deploy/services/helm-charts/dss/values.example.yaml @@ -77,3 +77,15 @@ loadBalancers: global: cloudProvider: google + +monitoring: # Set to true to enable monitoring stack + enabled: true + +prometheus: + server: + persistentVolume: + storageClass: standard # If you need a specific storage class, set it here + +grafana: + persistence: + storageClassName: standard # If you need a specific storage class, set it here diff --git a/deploy/services/helm-charts/dss/values.schema.json b/deploy/services/helm-charts/dss/values.schema.json index 74a330d67..35c9f5a3d 100644 --- a/deploy/services/helm-charts/dss/values.schema.json +++ b/deploy/services/helm-charts/dss/values.schema.json @@ -342,12 +342,23 @@ } }, "required": ["cloudProvider"] + }, + "monitoring": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable monitoring stack with prometheus and grafana."
+ } + }, + "required": ["enabled"] } }, "required": [ "loadBalancers", "dss", - "global" + "global", + "monitoring" ], "title": "Values", "type": "object" diff --git a/deploy/services/helm-charts/dss/values.yaml b/deploy/services/helm-charts/dss/values.yaml index 8d842254e..40897d127 100644 --- a/deploy/services/helm-charts/dss/values.yaml +++ b/deploy/services/helm-charts/dss/values.yaml @@ -64,3 +64,209 @@ yugabyte: placement_cloud: "cloud-1" placement_region: "uss-1" placement_zone: "zone-1" + +monitoring: + enabled: false + +prometheus: + server: + global: + scrape_interval: 5s + scrape_timeout: 5s + evaluation_interval: 5s + + service: + annotations: + 'prometheus.io/scrape': 'true' + 'prometheus.io/port': '9090' + + prometheus-pushgateway: + enabled: false + + prometheus-node-exporter: + enabled: false + + kube-state-metrics: + enabled: false + + alertmanager: + enabled: false + + serverFiles: + prometheus.yml: + scrape_configs: + - job_name: K8s-Endpoints + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + separator: ; + regex: "true" + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + separator: ; + regex: (https?) 
+ target_label: __scheme__ + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + separator: ; + regex: (.+) + target_label: __metrics_path__ + replacement: $1 + action: replace + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + separator: ; + regex: ([^:]+)(?::\d+)?;(\d+) + target_label: __address__ + replacement: $1:$2 + action: replace + - separator: ; + regex: __meta_kubernetes_service_label_([^_]+)(_kubernetes_io_name)?$ + replacement: $1 + action: labelmap + - source_labels: [__meta_kubernetes_namespace] + separator: ; + target_label: kubernetes_namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + target_label: kubernetes_name + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_pod_label_statefulset_kubernetes_io_pod_name] + separator: ; + regex: (dss-cockroachdb-\d+) + target_label: pod_name + replacement: $1 + action: replace + kubernetes_sd_configs: + - role: endpoints + + recording_rules.yml: + "groups": + - "name": "rules/aggregation.rules" + "rules": + - "expr": "sum without(store) (capacity{app=\"cockroachdb\"})" + "record": "node:capacity" + - "expr": "sum without(instance) (node:capacity{app=\"cockroachdb\"})" + "record": "cluster:capacity" + - "expr": "sum without(store) (capacity_available{app=\"cockroachdb\"})" + "record": "node:capacity_available" + - "expr": "sum without(instance) (node:capacity_available{app=\"cockroachdb\"})" + "record": "cluster:capacity_available" + - "expr": "capacity_available{app=\"cockroachdb\"} / capacity{app=\"cockroachdb\"}" + "record": "capacity_available:ratio" + - "expr": "node:capacity_available{app=\"cockroachdb\"} / node:capacity{app=\"cockroachdb\"}" + "record": "node:capacity_available:ratio" + - "expr": "cluster:capacity_available{app=\"cockroachdb\"} / cluster:capacity{app=\"cockroachdb\"}" + "record": 
"cluster:capacity_available:ratio" + - "expr": "rate(txn_durations_bucket{app=\"cockroachdb\"}[1m])" + "record": "txn_durations_bucket:rate1m" + - "expr": "histogram_quantile(0.5, txn_durations_bucket:rate1m)" + "record": "txn_durations:rate1m:quantile_50" + - "expr": "histogram_quantile(0.75, txn_durations_bucket:rate1m)" + "record": "txn_durations:rate1m:quantile_75" + - "expr": "histogram_quantile(0.9, txn_durations_bucket:rate1m)" + "record": "txn_durations:rate1m:quantile_90" + - "expr": "histogram_quantile(0.95, txn_durations_bucket:rate1m)" + "record": "txn_durations:rate1m:quantile_95" + - "expr": "histogram_quantile(0.99, txn_durations_bucket:rate1m)" + "record": "txn_durations:rate1m:quantile_99" + - "expr": "rate(exec_latency_bucket{app=\"cockroachdb\"}[1m])" + "record": "exec_latency_bucket:rate1m" + - "expr": "histogram_quantile(0.5, exec_latency_bucket:rate1m)" + "record": "exec_latency:rate1m:quantile_50" + - "expr": "histogram_quantile(0.75, exec_latency_bucket:rate1m)" + "record": "exec_latency:rate1m:quantile_75" + - "expr": "histogram_quantile(0.9, exec_latency_bucket:rate1m)" + "record": "exec_latency:rate1m:quantile_90" + - "expr": "histogram_quantile(0.95, exec_latency_bucket:rate1m)" + "record": "exec_latency:rate1m:quantile_95" + - "expr": "histogram_quantile(0.99, exec_latency_bucket:rate1m)" + "record": "exec_latency:rate1m:quantile_99" + - "expr": "rate(round_trip_latency_bucket{app=\"cockroachdb\"}[1m])" + "record": "round_trip_latency_bucket:rate1m" + - "expr": "histogram_quantile(0.5, round_trip_latency_bucket:rate1m)" + "record": "round_trip_latency:rate1m:quantile_50" + - "expr": "histogram_quantile(0.75, round_trip_latency_bucket:rate1m)" + "record": "round_trip_latency:rate1m:quantile_75" + - "expr": "histogram_quantile(0.9, round_trip_latency_bucket:rate1m)" + "record": "round_trip_latency:rate1m:quantile_90" + - "expr": "histogram_quantile(0.95, round_trip_latency_bucket:rate1m)" + "record": 
"round_trip_latency:rate1m:quantile_95" + - "expr": "histogram_quantile(0.99, round_trip_latency_bucket:rate1m)" + "record": "round_trip_latency:rate1m:quantile_99" + - "expr": "rate(sql_exec_latency_bucket{app=\"cockroachdb\"}[1m])" + "record": "sql_exec_latency_bucket:rate1m" + - "expr": "histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)" + "record": "sql_exec_latency:rate1m:quantile_50" + - "expr": "histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)" + "record": "sql_exec_latency:rate1m:quantile_75" + - "expr": "histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)" + "record": "sql_exec_latency:rate1m:quantile_90" + - "expr": "histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)" + "record": "sql_exec_latency:rate1m:quantile_95" + - "expr": "histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)" + "record": "sql_exec_latency:rate1m:quantile_99" + - "expr": "rate(raft_process_logcommit_latency_bucket{app=\"cockroachdb\"}[1m])" + "record": "raft_process_logcommit_latency_bucket:rate1m" + - "expr": "histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)" + "record": "raft_process_logcommit_latency:rate1m:quantile_50" + - "expr": "histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)" + "record": "raft_process_logcommit_latency:rate1m:quantile_75" + - "expr": "histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)" + "record": "raft_process_logcommit_latency:rate1m:quantile_90" + - "expr": "histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)" + "record": "raft_process_logcommit_latency:rate1m:quantile_95" + - "expr": "histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)" + "record": "raft_process_logcommit_latency:rate1m:quantile_99" + - "expr": "rate(raft_process_commandcommit_latency_bucket{app=\"cockroachdb\"}[1m])" + "record": "raft_process_commandcommit_latency_bucket:rate1m" + - "expr": "histogram_quantile(0.5, 
raft_process_commandcommit_latency_bucket:rate1m)" + "record": "raft_process_commandcommit_latency:rate1m:quantile_50" + - "expr": "histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)" + "record": "raft_process_commandcommit_latency:rate1m:quantile_75" + - "expr": "histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)" + "record": "raft_process_commandcommit_latency:rate1m:quantile_90" + - "expr": "histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)" + "record": "raft_process_commandcommit_latency:rate1m:quantile_95" + - "expr": "histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)" + "record": "raft_process_commandcommit_latency:rate1m:quantile_99" + +grafana: + + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: prometheus + type: prometheus + access: proxy + orgId: 1 + url: http://dss-prometheus-server:80 + version: 1 + editable: true + + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + + dashboardsConfigMaps: + default: "dss-grafana-dashboards-default" + + persistence: + type: pvc + enabled: true diff --git a/docs/operations/monitoring.md b/docs/operations/monitoring.md index 04e321a9c..b0287b399 100644 --- a/docs/operations/monitoring.md +++ b/docs/operations/monitoring.md @@ -6,22 +6,28 @@ Some of the tools from [the manual deployment documentation](../build.md#prerequ ## Grafana / Prometheus -Note: this monitoring stack is only currently brought up when deploying [services](../index.md#deployment-layers) with [tanka](../services/tanka.md).
- By default, an instance of Grafana and Prometheus are deployed along with the core DSS services; this combination allows you to view (Grafana) CRDB metrics (collected by Prometheus). To view Grafana, first ensure that the appropriate cluster context is selected (`kubectl config current-context`). Then, run the following command: -```shell script +```shell kubectl get pod | grep grafana | awk '{print $1}' | xargs -I {} kubectl port-forward {} 3000 ``` While that command is running, open a browser and navigate to -[http://localhost:3000](http://localhost:3000). The default username is `admin` -with a default password of `admin`. Click the magnifying glass on the left side -to select a dashboard to view. +[http://localhost:3000](http://localhost:3000). + +The default username is `admin` with a default password of `admin` if using tanka, or a random value in a kubernetes secret named `<release-name>-grafana` if using helm charts. + +Example to retrieve the secret in a default 'dss' release: + +```shell +kubectl get secrets/dss-grafana -o jsonpath="{.data.admin-password}" | base64 -d +``` + +Click the magnifying glass on the left side to select a dashboard to view. ## Prometheus Federation (Multi Cluster Monitoring)