Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions deploy/infrastructure/dependencies/terraform-commons-dss/helm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,32 @@ resource "local_file" "helm_chart_values" {
global = {
cloudProvider = var.kubernetes_cloud_provider_name
}

monitoring = {
enabled = false
},

prometheus = {
server = {
persistentVolume = {
storageClass = var.kubernetes_storage_class
}

global = {
external_labels = {
environment = "dev"
k8s_cluster = var.cluster_name
}
}
}
}

grafana = {
persistence = {
storageClassName = var.kubernetes_storage_class
}
}

}) : yamlencode({
cockroachdb = {
enabled = false
Expand Down Expand Up @@ -261,6 +287,31 @@ resource "local_file" "helm_chart_values" {
global = {
cloudProvider = var.kubernetes_cloud_provider_name
}

monitoring = {
enabled = false
},

prometheus = {
server = {
persistentVolume = {
storageClass = var.kubernetes_storage_class
}

global = {
external_labels = {
environment = "dev"
k8s_cluster = var.cluster_name
}
}
}
}

grafana = {
persistence = {
storageClassName = var.kubernetes_storage_class
}
}
})

}
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,15 @@ variable "node_count" {
}


# Name of the Kubernetes cluster hosting the DSS instance.
# Within this change set it is passed to the terraform-commons-dss module by the AWS and
# Google DSS modules, and referenced as var.cluster_name for the Prometheus `k8s_cluster`
# external label in the generated Helm values.
variable "cluster_name" {
type = string
description = <<-EOT
Name of the kubernetes cluster that will host this DSS instance (should generally describe the DSS instance being hosted)

Example: `dss-che-1`
EOT
}

variable "image" {
type = string
description = <<-EOT
Expand Down
1 change: 1 addition & 0 deletions deploy/infrastructure/modules/terraform-aws-dss/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ module "terraform-aws-kubernetes" {

module "terraform-commons-dss" {
# See variables.tf for variables description.
cluster_name = var.cluster_name
image = var.image
image_pull_secret = var.image_pull_secret
kubernetes_namespace = var.kubernetes_namespace
Expand Down
1 change: 1 addition & 0 deletions deploy/infrastructure/modules/terraform-google-dss/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ module "terraform-google-kubernetes" {

module "terraform-commons-dss" {
# See variables.tf for variables description.
cluster_name = var.cluster_name
image = var.image
kubernetes_namespace = var.kubernetes_namespace
kubernetes_storage_class = var.google_kubernetes_storage_class
Expand Down
3 changes: 1 addition & 2 deletions deploy/infrastructure/utils/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

# Variables per project
# For all */terraform-*
# Variables shared by every */terraform-* project.
# "cluster_name" is listed here (rather than only in the kubernetes list) because
# terraform-commons-dss builds on GLOBAL_VARIABLES and now consumes it too.
GLOBAL_VARIABLES = [
    "app_hostname",
    "db_hostname_suffix",
    "datastore_type",
    "node_count",
    "cluster_name",
]

# dependencies/terraform-commons-dss
COMMONS_DSS_VARIABLES = GLOBAL_VARIABLES + [
Expand Down Expand Up @@ -59,7 +59,6 @@

# dependencies/terraform-*-kubernetes
COMMON_KUBERNETES_VARIABLES = GLOBAL_VARIABLES + [
    # "cluster_name" is already provided by GLOBAL_VARIABLES; repeating it here
    # would make the name appear twice in this list.
    "kubernetes_version",
]

Expand Down
8 changes: 8 additions & 0 deletions deploy/services/helm-charts/dss/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,11 @@ dependencies:
repository: https://interuss.github.io/yugabyte-charts/
version: 2025.1.0
condition: yugabyte.enabled
- name: prometheus
repository: https://prometheus-community.github.io/helm-charts
version: 27.51.0
condition: monitoring.enabled
- name: grafana
repository: https://grafana.github.io/helm-charts
version: 10.3.1
condition: monitoring.enabled
1 change: 1 addition & 0 deletions deploy/services/helm-charts/dss/grafana_dashboards
17 changes: 17 additions & 0 deletions deploy/services/helm-charts/dss/templates/grafana-dashboards.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{{- /*
ConfigMap holding the Grafana dashboards shipped with the chart.
Rendered only when monitoring.enabled is true. Every file matching
grafana_dashboards/*.json becomes one data entry, keyed by the file's base name
and containing the file's content.
The ConfigMap name must stay in sync with grafana.dashboardsConfigMaps.default
in values.yaml, which refers to it by this exact name.
*/ -}}
{{- if $.Values.monitoring.enabled }}

---

apiVersion: v1
kind: ConfigMap
metadata:
name: dss-grafana-dashboards-default
labels:
grafana_dashboard: "1"
data:
{{- range $path, $bytes := .Files.Glob "grafana_dashboards/*.json" }}
{{ base $path }}: |-
{{ $.Files.Get $path | indent 4 }}
{{- end }}

{{- end }}
12 changes: 12 additions & 0 deletions deploy/services/helm-charts/dss/values.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,15 @@ loadBalancers:

global:
cloudProvider: google

monitoring:
enabled: true # Set to true to enable the monitoring stack (Prometheus and Grafana)

prometheus:
server:
persistentVolume:
storageClass: standard # If you need a specific storage class, set it here

grafana:
persistence:
storageClassName: standard # If you need a specific storage class, set it here
13 changes: 12 additions & 1 deletion deploy/services/helm-charts/dss/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -342,12 +342,23 @@
}
},
"required": ["cloudProvider"]
},
"monitoring": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean",
"description": "Enable monitoring stack with prometheus and grafana."
}
},
"required": ["enabled"]
}
},
"required": [
"loadBalancers",
"dss",
"global"
"global",
"monitoring"
],
"title": "Values",
"type": "object"
Expand Down
206 changes: 206 additions & 0 deletions deploy/services/helm-charts/dss/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,209 @@ yugabyte:
placement_cloud: "cloud-1"
placement_region: "uss-1"
placement_zone: "zone-1"

# Master switch for the optional monitoring stack. It gates the prometheus and grafana
# chart dependencies (condition: monitoring.enabled) and the Grafana dashboards ConfigMap.
monitoring:
enabled: false

# Values for the prometheus chart dependency.
prometheus:
server:
global:
# Scrape and evaluate rules every 5 seconds; a scrape may take at most 5 seconds.
scrape_interval: 5s
scrape_timeout: 5s
evaluation_interval: 5s

service:
annotations:
# These annotations match what the K8s-Endpoints job below looks for:
# scrape == "true" is kept, and the port annotation replaces the target port.
'prometheus.io/scrape': 'true'
'prometheus.io/port': '9090'

# Components turned off in this configuration.
prometheus-pushgateway:
enabled: false

prometheus-node-exporter:
enabled: false

kube-state-metrics:
enabled: false

alertmanager:
enabled: false

serverFiles:
prometheus.yml:
scrape_configs:
# Single job discovering targets through Kubernetes endpoints (see kubernetes_sd_configs below).
- job_name: K8s-Endpoints
tls_config:
# TLS certificates of scraped targets are not verified.
insecure_skip_verify: true
relabel_configs:
# Keep only targets whose service carries the annotation prometheus.io/scrape = "true".
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
separator: ;
regex: "true"
replacement: $1
action: keep
# Use the prometheus.io/scheme annotation (http or https) as the scrape scheme when present.
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
separator: ;
regex: (https?)
target_label: __scheme__
replacement: $1
action: replace
# Use the prometheus.io/path annotation as the metrics path when it is non-empty.
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
separator: ;
regex: (.+)
target_label: __metrics_path__
replacement: $1
action: replace
# Rewrite the target address to <host>:<prometheus.io/port annotation>, dropping any existing port.
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
separator: ;
regex: ([^:]+)(?::\d+)?;(\d+)
target_label: __address__
replacement: $1:$2
action: replace
# Copy Kubernetes service labels onto the target, using the captured part of the label name.
- separator: ;
regex: __meta_kubernetes_service_label_([^_]+)(_kubernetes_io_name)?$
replacement: $1
action: labelmap
# Expose the namespace as the kubernetes_namespace label.
- source_labels: [__meta_kubernetes_namespace]
separator: ;
target_label: kubernetes_namespace
replacement: $1
action: replace
# Expose the service name as the kubernetes_name label.
- source_labels: [__meta_kubernetes_service_name]
separator: ;
target_label: kubernetes_name
replacement: $1
action: replace
# Set pod_name for pods whose statefulset pod-name label looks like dss-cockroachdb-<N>.
# NOTE(review): the "dss-" prefix presumably depends on the Helm release name — confirm.
- source_labels: [__meta_kubernetes_pod_label_statefulset_kubernetes_io_pod_name]
separator: ;
regex: (dss-cockroachdb-\d+)
target_label: pod_name
replacement: $1
action: replace
kubernetes_sd_configs:
- role: endpoints

# Recording rules over series labelled app="cockroachdb":
#  - capacity / capacity_available summed per node (without store) and per cluster
#    (without instance), plus the available/total ratios at each level;
#  - for each latency histogram (txn_durations, exec_latency, round_trip_latency,
#    sql_exec_latency, raft_process_logcommit_latency, raft_process_commandcommit_latency):
#    the 1-minute bucket rate and its 0.5 / 0.75 / 0.9 / 0.95 / 0.99 quantiles.
recording_rules.yml:
"groups":
- "name": "rules/aggregation.rules"
"rules":
- "expr": "sum without(store) (capacity{app=\"cockroachdb\"})"
"record": "node:capacity"
- "expr": "sum without(instance) (node:capacity{app=\"cockroachdb\"})"
"record": "cluster:capacity"
- "expr": "sum without(store) (capacity_available{app=\"cockroachdb\"})"
"record": "node:capacity_available"
- "expr": "sum without(instance) (node:capacity_available{app=\"cockroachdb\"})"
"record": "cluster:capacity_available"
- "expr": "capacity_available{app=\"cockroachdb\"} / capacity{app=\"cockroachdb\"}"
"record": "capacity_available:ratio"
- "expr": "node:capacity_available{app=\"cockroachdb\"} / node:capacity{app=\"cockroachdb\"}"
"record": "node:capacity_available:ratio"
- "expr": "cluster:capacity_available{app=\"cockroachdb\"} / cluster:capacity{app=\"cockroachdb\"}"
"record": "cluster:capacity_available:ratio"
- "expr": "rate(txn_durations_bucket{app=\"cockroachdb\"}[1m])"
"record": "txn_durations_bucket:rate1m"
- "expr": "histogram_quantile(0.5, txn_durations_bucket:rate1m)"
"record": "txn_durations:rate1m:quantile_50"
- "expr": "histogram_quantile(0.75, txn_durations_bucket:rate1m)"
"record": "txn_durations:rate1m:quantile_75"
- "expr": "histogram_quantile(0.9, txn_durations_bucket:rate1m)"
"record": "txn_durations:rate1m:quantile_90"
- "expr": "histogram_quantile(0.95, txn_durations_bucket:rate1m)"
"record": "txn_durations:rate1m:quantile_95"
- "expr": "histogram_quantile(0.99, txn_durations_bucket:rate1m)"
"record": "txn_durations:rate1m:quantile_99"
- "expr": "rate(exec_latency_bucket{app=\"cockroachdb\"}[1m])"
"record": "exec_latency_bucket:rate1m"
- "expr": "histogram_quantile(0.5, exec_latency_bucket:rate1m)"
"record": "exec_latency:rate1m:quantile_50"
- "expr": "histogram_quantile(0.75, exec_latency_bucket:rate1m)"
"record": "exec_latency:rate1m:quantile_75"
- "expr": "histogram_quantile(0.9, exec_latency_bucket:rate1m)"
"record": "exec_latency:rate1m:quantile_90"
- "expr": "histogram_quantile(0.95, exec_latency_bucket:rate1m)"
"record": "exec_latency:rate1m:quantile_95"
- "expr": "histogram_quantile(0.99, exec_latency_bucket:rate1m)"
"record": "exec_latency:rate1m:quantile_99"
- "expr": "rate(round_trip_latency_bucket{app=\"cockroachdb\"}[1m])"
"record": "round_trip_latency_bucket:rate1m"
- "expr": "histogram_quantile(0.5, round_trip_latency_bucket:rate1m)"
"record": "round_trip_latency:rate1m:quantile_50"
- "expr": "histogram_quantile(0.75, round_trip_latency_bucket:rate1m)"
"record": "round_trip_latency:rate1m:quantile_75"
- "expr": "histogram_quantile(0.9, round_trip_latency_bucket:rate1m)"
"record": "round_trip_latency:rate1m:quantile_90"
- "expr": "histogram_quantile(0.95, round_trip_latency_bucket:rate1m)"
"record": "round_trip_latency:rate1m:quantile_95"
- "expr": "histogram_quantile(0.99, round_trip_latency_bucket:rate1m)"
"record": "round_trip_latency:rate1m:quantile_99"
- "expr": "rate(sql_exec_latency_bucket{app=\"cockroachdb\"}[1m])"
"record": "sql_exec_latency_bucket:rate1m"
- "expr": "histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)"
"record": "sql_exec_latency:rate1m:quantile_50"
- "expr": "histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)"
"record": "sql_exec_latency:rate1m:quantile_75"
- "expr": "histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)"
"record": "sql_exec_latency:rate1m:quantile_90"
- "expr": "histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)"
"record": "sql_exec_latency:rate1m:quantile_95"
- "expr": "histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)"
"record": "sql_exec_latency:rate1m:quantile_99"
- "expr": "rate(raft_process_logcommit_latency_bucket{app=\"cockroachdb\"}[1m])"
"record": "raft_process_logcommit_latency_bucket:rate1m"
- "expr": "histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)"
"record": "raft_process_logcommit_latency:rate1m:quantile_50"
- "expr": "histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)"
"record": "raft_process_logcommit_latency:rate1m:quantile_75"
- "expr": "histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)"
"record": "raft_process_logcommit_latency:rate1m:quantile_90"
- "expr": "histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)"
"record": "raft_process_logcommit_latency:rate1m:quantile_95"
- "expr": "histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)"
"record": "raft_process_logcommit_latency:rate1m:quantile_99"
- "expr": "rate(raft_process_commandcommit_latency_bucket{app=\"cockroachdb\"}[1m])"
"record": "raft_process_commandcommit_latency_bucket:rate1m"
- "expr": "histogram_quantile(0.5, raft_process_commandcommit_latency_bucket:rate1m)"
"record": "raft_process_commandcommit_latency:rate1m:quantile_50"
- "expr": "histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)"
"record": "raft_process_commandcommit_latency:rate1m:quantile_75"
- "expr": "histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)"
"record": "raft_process_commandcommit_latency:rate1m:quantile_90"
- "expr": "histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)"
"record": "raft_process_commandcommit_latency:rate1m:quantile_95"
- "expr": "histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)"
"record": "raft_process_commandcommit_latency:rate1m:quantile_99"

# Values for the grafana chart dependency (enabled through monitoring.enabled).
grafana:

  # Provision a single Prometheus datasource.
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: prometheus
          type: prometheus
          access: proxy
          orgId: 1
          # NOTE(review): this host name assumes the Prometheus server Service is called
          # "dss-prometheus-server" (i.e. the Helm release is named "dss") — confirm.
          url: http://dss-prometheus-server:80
          version: 1
          editable: true

  # File-based dashboard provider reading from the directory below.
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'default'
          orgId: 1
          folder: ''
          type: file
          disableDeletion: false
          editable: true
          options:
            path: /var/lib/grafana/dashboards/default

  # Dashboards for the "default" provider come from the ConfigMap rendered by
  # templates/grafana-dashboards.yaml; the name must match that template.
  dashboardsConfigMaps:
    default: "dss-grafana-dashboards-default"

  # Keep Grafana data on a persistent volume claim.
  persistence:
    type: pvc
    enabled: true
Loading
Loading