diff --git a/services/gastown/gastown-grafana-dash-1.json b/services/gastown/gastown-grafana-dash-1.json index e538890720..ee44a3ec0b 100644 --- a/services/gastown/gastown-grafana-dash-1.json +++ b/services/gastown/gastown-grafana-dash-1.json @@ -3173,6 +3173,833 @@ ], "title": "Pending Event Queue Depth", "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 129 + }, + "id": 300, + "panels": [], + "title": "Container Startup Latency", + "type": "row" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "bffxugc31cnpcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 500 + }, + { + "color": "red", + "value": 2000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 130 + }, + "id": 303, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "table", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' AND blob8 = 'ok'", + "rawSql": "SELECT SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' AND blob8 = 'ok'", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + } + ], + "timeFrom": "1h", + "title": "Avg Health Ping (1h)", + "type": "stat" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "bffxugc31cnpcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 5000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 130 + }, + "id": 304, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "table", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' AND blob9 = 'true'", + "rawSql": "SELECT SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' AND blob9 = 'true'", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + } + ], + "timeFrom": "1h", + "title": "Avg Agent Start Fetch (1h)", + "type": "stat" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "bffxugc31cnpcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "ms" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "timeout_rate" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "min", + "value": 0 + }, + { + "id": "max", + "value": 1 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 134 + }, + "id": 301, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, 'p50' AS label, quantileWeighted(0.50)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' AND blob8 = 'ok' GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, 'p50' AS label, quantileWeighted(0.50)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' AND blob8 = 'ok' GROUP BY t ORDER BY t", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + }, + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, 'p90' AS label, quantileWeighted(0.90)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' AND blob8 = 'ok' GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, 'p90' AS label, quantileWeighted(0.90)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' AND blob8 = 'ok' GROUP BY t ORDER BY t", + "refId": "B", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + }, + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, 'p99' AS label, quantileWeighted(0.99)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' AND blob8 = 'ok' GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, 'p99' AS label, quantileWeighted(0.99)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' AND blob8 = 'ok' GROUP BY t ORDER BY t", + "refId": "C", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + }, + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, 'timeout_rate' AS label, SUM(IF(blob8 = 'timeout', _sample_interval, 0)) / SUM(_sample_interval) AS timeout_rate FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, 'timeout_rate' AS label, SUM(IF(blob8 = 'timeout', _sample_interval, 0)) / SUM(_sample_interval) AS timeout_rate FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' GROUP BY t ORDER BY t", + "refId": "D", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + } + ], + "title": "Container Health Ping Latency (cold-start indicator)", + "type": "timeseries" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "bffxugc31cnpcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "ms" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "failure_rate" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "min", + "value": 0 + }, + { + "id": "max", + "value": 1 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 134 + }, + "id": 302, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, 'p50' AS label, quantileWeighted(0.50)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' AND blob9 = 'true' GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, 'p50' AS label, quantileWeighted(0.50)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' AND blob9 = 'true' GROUP BY t ORDER BY t", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + }, + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, 'p90' AS label, quantileWeighted(0.90)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' AND blob9 = 'true' GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, 'p90' AS label, quantileWeighted(0.90)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' AND blob9 = 'true' GROUP BY t ORDER BY t", + "refId": "B", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + }, + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, 'p99' AS label, quantileWeighted(0.99)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' AND blob9 = 'true' GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, 'p99' AS label, quantileWeighted(0.99)(double1, _sample_interval) AS latency_ms FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' AND blob9 = 'true' GROUP BY t ORDER BY t", + "refId": "C", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + }, + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, 'failure_rate' AS label, SUM(IF(blob9 = 'false', _sample_interval, 0)) / SUM(_sample_interval) AS failure_rate FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, 'failure_rate' AS label, SUM(IF(blob9 = 'false', _sample_interval, 0)) / SUM(_sample_interval) AS failure_rate FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' GROUP BY t ORDER BY t", + "refId": "D", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + } + ], + "title": "Container Agent Start Latency (Town DO \u2192 container.fetch('/agents/start') round-trip)", + "type": "timeseries" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "bffxugc31cnpcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 142 + }, + "id": 305, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, 'timeout_rate' AS label, SUM(IF(blob8 = 'timeout', _sample_interval, 0)) / SUM(_sample_interval) AS rate FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' GROUP BY t ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, 'timeout_rate' AS label, SUM(IF(blob8 = 'timeout', _sample_interval, 0)) / SUM(_sample_interval) AS rate FROM gastown_events WHERE $timeFilter AND blob1 = 'container.health_ping' GROUP BY t ORDER BY t", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + } + ], + "title": "Health Ping Timeout Rate (container cold-start frequency)", + "type": "timeseries" + }, + { + "datasource": { + "type": "vertamedia-clickhouse-datasource", + "uid": "bffxugc31cnpcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 142 + }, + "id": 306, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.4.1", + "targets": [ + { + "adHocFilters": [], + "adHocValuesQuery": "", + "add_metadata": true, + "contextWindowSize": "10", + "dateTimeColDataType": "timestamp", + "dateTimeType": "DATETIME", + "editorMode": "sql", + "extrapolate": true, + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "nullifySparse": false, + "query": "SELECT $timeSeries AS t, IF(blob9 = 'true', 'success', 'failure') AS label, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' GROUP BY t, label ORDER BY t", + "rawSql": "SELECT $timeSeries AS t, IF(blob9 = 'true', 'success', 'failure') AS label, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter AND blob1 = 'container.agent_start_fetch' GROUP BY t, label ORDER BY t", + "refId": "A", + "round": "0s", + "showFormattedSQL": false, + "showHelp": false, + "skip_comments": true, + "table": "gastown_events", + "useWindowFuncForMacros": true + } + ], + "title": "Agent Start Attempts (success / failure)", + "type": "timeseries" } ], "preload": false,