diff --git a/ansible/grafana-dashboards/idr-per-server.json b/ansible/grafana-dashboards/idr-per-server.json new file mode 100644 index 00000000..71bcec43 --- /dev/null +++ b/ansible/grafana-dashboards/idr-per-server.json @@ -0,0 +1,195 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 1, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 3, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "hostname", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(1 - node_filesystem_free{fstype!~\"(nfs|nfs4|overlay|rootfs|rpc_pipefs|tmpfs)\", instance=\"$hostname\"} / node_filesystem_size{fstype!~\"(nfs|nfs4|overlay|rootfs|rpc_pipefs|tmpfs)\", instance=\"$hostname\"}) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} ({{device}})", + "metric": "", + "refId": "A", + "step": 120 + } + ], + "thresholds": [], + "timeFrom": null, + 
"timeShift": null, + "title": "$hostname disk", + "tooltip": { + "shared": true, + "sort": 1, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "Used space", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Hostname", + "multi": true, + "name": "hostname", + "options": [], + "query": "label_values(node_exporter_build_info, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "IDR per server", + "version": 2 +} \ No newline at end of file diff --git a/ansible/grafana-dashboards/idr-sessions.json b/ansible/grafana-dashboards/idr-sessions.json new file mode 100644 index 00000000..c10b49ba --- /dev/null +++ b/ansible/grafana-dashboards/idr-sessions.json @@ -0,0 +1,471 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": 
"grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "5s", + "rows": [ + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 2, + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "hostname", + "seriesOverrides": [ + { + "alias": "Number of requests", + "dashes": true, + "legend": false, + "lines": false, + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket{instance=\"$hostname\"}[$quantileint])) by (le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "50", + "refId": "A", + "step": 4 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket{instance=\"$hostname\"}[$quantileint])) by (le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "90", + "refId": "B", + "step": 4 + }, + { + "expr": "histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket{instance=\"$hostname\"}[$quantileint])) by (le))", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "95", + "refId": "C", + "step": 4 + }, + { + "expr": "histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket{instance=\"$hostname\"}[$quantileint])) by (le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "99", + "refId": "D", + "step": 4 + }, + { + "expr": "histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket{instance=\"$hostname\"}[$quantileint])) by (le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "99.9", + "refId": "E", + "step": 4 + }, + { + "expr": "rate(django_http_requests_latency_including_middlewares_seconds_count{instance=\"$hostname\"}[$quantileint])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Number of requests", + "refId": "F", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Web latency ($hostname)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "Latency", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "none", + "label": "Requests /s", + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 1, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 3, + "nullPointMode": 
"null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "hostname", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "omero_sessions_active{instance=\"$hostname\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{username}}", + "metric": "omero_sessions_active", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "OMERO.server sessions ($hostname)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(django_http_responses_total_by_status[$quantileint])) without (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{status}}", + "refId": 
"A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Django request status", + "tooltip": { + "shared": true, + "sort": 1, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Requests /s", + "logBase": 10, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Hostname", + "multi": true, + "name": "hostname", + "options": [], + "query": "label_values(node_exporter_build_info, instance)", + "refresh": 1, + "regex": "/(.*omero.*)/", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "2m", + "value": "2m" + }, + "hide": 0, + "label": "Quantile interval", + "name": "quantileint", + "options": [ + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": true, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "15m", + "value": "15m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + } + ], + "query": "1m,2m,5m,15m,30m,1h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + 
"30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "IDR sessions", + "version": 8 +} \ No newline at end of file diff --git a/ansible/grafana-dashboards/idr-vertical.json b/ansible/grafana-dashboards/idr-vertical.json new file mode 100644 index 00000000..ceac8dd8 --- /dev/null +++ b/ansible/grafana-dashboards/idr-vertical.json @@ -0,0 +1,498 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": 259, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "CPU usage summed over all CPUs", + "fill": 1, + "id": 1, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 2, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(100 - (avg by (instance) (irate(node_cpu{mode=\"idle\", instance=~\".*$servergroup.*\"}[5m])) * 100)) 
* on(instance) (count(node_cpu{mode=\"idle\", instance=~\".*$servergroup.*\"}) without (cpu, mode))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "metric": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$servergroup CPU", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "CPU (Sum)", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 0, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 2, + "nullPointMode": "null", + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*[^%]$/", + "linewidth": 3 + }, + { + "alias": "/.*%$/", + "dashes": true, + "legend": false, + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Active{instance=~\".*$servergroup.*\"} / 1024 / 1024 / 1024", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 600 + }, + { + "expr": "node_memory_Active{instance=~\".*$servergroup.*\"} / node_memory_MemTotal{instance=~\".*$servergroup.*\"} * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + 
"legendFormat": "{{instance}} %", + "refId": "C", + "step": 600 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$servergroup memory", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Active memory GB", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "Active memory %", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 2, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=~\".*$servergroup.*\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "metric": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$servergroup load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Load 1", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": 
"servergroup", + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 180, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Network IO (+ve=receive, -ve=transmit)", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 2, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "servergroup", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (irate(node_network_receive_bytes{instance=~\".*$servergroup.*\"}[1m])) by (instance) / 1024 / 1024", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}} +", + "metric": "", + "refId": "A", + "step": 600 + }, + { + "expr": "- sum (irate(node_network_transmit_bytes{instance=~\".*$servergroup.*\"}[1m])) by (instance) / 1024 / 1024", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}} -", + "metric": "", + "refId": "B", + "step": 600 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$servergroup Network", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "mbytes", + "label": "Network +rx -tx", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": 
null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "tags": [], + "text": "omero + database + docker", + "value": [ + "omero", + "database", + "docker" + ] + }, + "hide": 0, + "includeAll": false, + "label": "Servers", + "multi": true, + "name": "servergroup", + "options": [ + { + "selected": true, + "text": "omero", + "value": "omero" + }, + { + "selected": true, + "text": "database", + "value": "database" + }, + { + "selected": true, + "text": "docker", + "value": "docker" + } + ], + "query": "omero, database, docker", + "type": "custom" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "IDR vertical", + "version": 2 +} \ No newline at end of file diff --git a/ansible/group_vars/dockermanager-hosts.yml b/ansible/group_vars/dockermanager-hosts.yml index da25c473..eae958e9 100644 --- a/ansible/group_vars/dockermanager-hosts.yml +++ b/ansible/group_vars/dockermanager-hosts.yml @@ -1,5 +1,7 @@ docker_use_ipv4_nic_mtu: True +kubernetes_kube_version: 1.7.0 + idr_jupyter_hub_image: "imagedata/jupyterhub-githubauth:0.4.0" idr_jupyter_notebook_image: "imagedata/jupyter-docker:develop" idr_jupyter_notebook_repo: https://github.com/IDR/idr-notebooks.git diff --git a/ansible/group_vars/dockerworker-hosts.yml b/ansible/group_vars/dockerworker-hosts.yml index 8c098f3d..320f0541 100644 --- a/ansible/group_vars/dockerworker-hosts.yml +++ b/ansible/group_vars/dockerworker-hosts.yml @@ -1,4 +1,6 @@ docker_use_ipv4_nic_mtu: True +kubernetes_kube_version: 1.7.0 + idr_jupyter_hub_image: 
"imagedata/jupyterhub-githubauth:0.4.0" idr_jupyter_notebook_image: "imagedata/jupyter-docker:develop" diff --git a/ansible/group_vars/omero-hosts.yml b/ansible/group_vars/omero-hosts.yml index 5216b501..fab56d12 100644 --- a/ansible/group_vars/omero-hosts.yml +++ b/ansible/group_vars/omero-hosts.yml @@ -12,7 +12,6 @@ idr_omero_ice_version: "3.6" idr_omero_upgrade: False # The IDR version system doesn't match the OMERO.server version omero_server_checkupgrade_comparator: '!=' -idr_omero_upgrade: False idr_omero_omego_additional_args: "--downloadurl https://downloads.openmicroscopy.org/idr" # If you want to speed up Ice installation use a precompiled IcePy: @@ -137,6 +136,12 @@ omero_web_config_set: omero.web.ui.metadata_panes: [] omero.web.wsgi_timeout: "{{ idr_omero_web_timeout | default(60) }}" +# TODO: This needs careful review of the code +# https://github.com/openmicroscopy/openmicroscopy/blob/v5.4.0-m2/components/tools/OmeroWeb/omeroweb/decorators.py#L305 + omero.web.public.cache.enabled: True + omero.web.public.cache.timeout: 60 + + ###################################################################### # Plugins and additional web configuration diff --git a/ansible/group_vars/proxy-hosts.yml b/ansible/group_vars/proxy-hosts.yml index 780546b3..159af9f1 100644 --- a/ansible/group_vars/proxy-hosts.yml +++ b/ansible/group_vars/proxy-hosts.yml @@ -27,7 +27,7 @@ nginx_proxy_websockets_enable: True nginx_proxy_upstream_servers: - name: omeroreadonly balance: ip_hash - servers: "{{ omero_omeroreadonly_hosts }}" + servers: "{{ omero_omeroreadonly_hosts | sort }}" - name: omeroreadwrite servers: "{{ omero_omeroreadwrite_hosts }}" @@ -271,7 +271,8 @@ nginx_proxy_sites: "{{ _nginx_proxy_sites + (idr_proxy_additional_sites | defaul ###################################################################### # Other -#nginx_proxy_block_locations: +nginx_proxy_block_locations: +- "^~ /django_prometheus" #- "^~ /login" #nginx_proxy_set_header_host: 'idr.openmicroscopy.org' diff 
--git a/ansible/idr-kubernetes.yml b/ansible/idr-kubernetes.yml index 2d96b863..3fd50b6f 100644 --- a/ansible/idr-kubernetes.yml +++ b/ansible/idr-kubernetes.yml @@ -1,3 +1,5 @@ +# Setup a kubernetes cluster +# idr-docker.yml must be run first --- - hosts: > diff --git a/ansible/idr-omero-readonly.yml b/ansible/idr-omero-readonly.yml index 6020a0b4..97c4bcbb 100644 --- a/ansible/idr-omero-readonly.yml +++ b/ansible/idr-omero-readonly.yml @@ -7,6 +7,12 @@ # OMERO read-write fileserver for OMERO read-only - hosts: "{{ idr_environment | default('idr') }}-omeroreadwrite-hosts" + pre_tasks: + - name: Create idr-metadata directory + file: + path: /data/idr-metadata + state: directory + roles: - role: openmicroscopy.nfs-share @@ -18,6 +24,9 @@ # TODO: Limit which hosts can write to this dir - host: "*" options: 'rw' + /data/idr-metadata: + - host: "*" + options: 'ro' # Include restart handlers - role: openmicroscopy.omero-common @@ -53,6 +62,9 @@ - path: /data/BioFormatsCache location: "{{ omero_fileserver_host_ansible }}:/data/BioFormatsCache" opts: rw,sync + - path: /data/idr-metadata + location: "{{ omero_fileserver_host_ansible }}:/data/idr-metadata" + opts: ro # Include restart handlers - role: openmicroscopy.omero-common diff --git a/ansible/idr-omero.yml b/ansible/idr-omero.yml index c5cf50ee..4a5fca05 100644 --- a/ansible/idr-omero.yml +++ b/ansible/idr-omero.yml @@ -58,11 +58,6 @@ - role: openmicroscopy.omero-server - # Use a single redis server for shared web sessions - # TODO: Add omeroweb hosts group - - role: openmicroscopy.redis - redis_listen: 0.0.0.0 - environment: "{{ idr_ANSIBLE_ENVIRONMENT_VARIABLES | default({}) }}" @@ -70,21 +65,35 @@ - hosts: > {{ idr_environment | default('idr') }}-omero-hosts - pre_tasks: - - name: Get redis IP - set_fact: - omero_redis_host_ansible: >- - {{ - hostvars[groups[ - idr_environment | default('idr') + '-omeroreadwrite-hosts'][0]] - ['ansible_' + (idr_net_iface | default('eth0'))]['ipv4']['address'] - }} - when: "{{ 
groups[idr_environment | default('idr') + '-omeroreadwrite-hosts'] is defined }}" - roles: + - role: openmicroscopy.redis - role: openmicroscopy.omero-web - role: openmicroscopy.omero-web-apps # Vars are in group_vars/omero-hosts.yml environment: "{{ idr_ANSIBLE_ENVIRONMENT_VARIABLES | default({}) }}" + + +# TODO: Replace with a template using +# https://github.com/openmicroscopy/openmicroscopy/pull/5387 +- hosts: "{{ idr_environment | default('idr') }}-omeroreadwrite-hosts" + + tasks: + - name: Set Nginx proxy timeout + become: yes + lineinfile: + insertafter: 'server\s*{' + path: /etc/nginx/conf.d/omero-web.conf + line: proxy_read_timeout {{ idr_omero_web_timeout }}; + regexp: proxy_read_timeout\s+.* + state: present + notify: + - restart nginx + + handlers: + - name: restart nginx + become: yes + service: + name: nginx + state: restarted diff --git a/ansible/management-grafana.yml b/ansible/management-grafana.yml new file mode 100644 index 00000000..bdbfd945 --- /dev/null +++ b/ansible/management-grafana.yml @@ -0,0 +1,147 @@ +# Setup grafana docker, user and dashboard +# Assumes there is already a local docker container called prometheus +# as a datasource + +- hosts: "{{ idr_environment | default('idr') }}-management-hosts" + + tasks: + + - name: Run docker grafana + become: yes + docker_container: + image: grafana/grafana + links: + - prometheus:prometheus + name: grafana + published_ports: + - "3000:3000" + state: started + volumes: + - /data/grafana:/var/lib/grafana + + - name: Wait for grafana + wait_for: + port: 3000 + + # Port 3000 may be open before Grafana is ready, so retry a few times + - name: Check user + uri: + url: "{{ grafana_host }}/api/users/lookup?loginOrEmail=idr" + user: "{{ grafana_admin_user }}" + password: "{{ grafana_admin_password }}" + force_basic_auth: yes + status_code: [200, 404] + register: grafana_get_user + check_mode: no + until: grafana_get_user | succeeded + retries: 5 + delay: 5 + + - name: Create user + uri: + url: "{{ 
grafana_host }}/api/admin/users" + user: "{{ grafana_admin_user }}" + password: "{{ grafana_admin_password }}" + force_basic_auth: yes + method: POST + body: + name: idr + email: idr@openmicroscopy.org + login: idr + password: idr123 + body_format: json + status_code: 200 + register: grafana_create_user + when: grafana_get_user.status == 404 + + - name: Check datasource + uri: + url: "{{ grafana_host }}/api/datasources/name/prometheus" + user: "{{ grafana_admin_user }}" + password: "{{ grafana_admin_password }}" + force_basic_auth: yes + status_code: [200, 404] + register: grafana_get_datasource + check_mode: no + + - name: Create datasource + uri: + url: "{{ grafana_host }}/api/datasources" + user: "{{ grafana_admin_user }}" + password: "{{ grafana_admin_password }}" + force_basic_auth: yes + method: POST + body: + name: prometheus + type: prometheus + url: "http://prometheus:9090" + access: proxy + isDefault: true + body_format: json + status_code: 200 + when: grafana_get_datasource.status == 404 + + - name: Create dashboards directory + become: yes + file: + path: "{{ grafana_dashboard_json_dir }}" + recurse: yes + + - name: Copy dashboards + become: yes + copy: + src: grafana-dashboards/{{ item }}.json + dest: "{{ grafana_dashboard_json_dir }}/{{ item }}.json" + with_items: + - "{{ grafana_dashboard_list }}" + + - name: Check dashboard + uri: + url: "{{ grafana_host }}/api/dashboards/db/{{ item }}" + user: "{{ grafana_admin_user }}" + password: "{{ grafana_admin_password }}" + force_basic_auth: yes + status_code: [200, 404] + with_items: + - "{{ grafana_dashboard_list }}" + register: grafana_get_dashboard + check_mode: no + + - name: Read dashboard json + slurp: + src: "{{ grafana_dashboard_json_dir }}/{{ item.item }}.json" + register: grafana_dashboard_json + when: item.status == 404 + with_items: + - "{{ grafana_get_dashboard.results }}" + + - name: Create dashboard + uri: + url: "{{ grafana_host }}/api/dashboards/db" + user: "{{ grafana_admin_user }}" + 
password: "{{ grafana_admin_password }}" + force_basic_auth: yes + method: POST + body: + dashboard: > + {{ item.content | + b64decode | + regex_replace('\$\{DS_PROMETHEUS\}', 'prometheus') | + from_json + }} + overwrite: True + body_format: json + status_code: 200 + when: item.content is defined + with_items: + - "{{ grafana_dashboard_json.results }}" + + vars: + grafana_dashboard_json_dir: /opt/grafana/dashboards + grafana_host: "http://localhost:3000" + grafana_admin_user: admin + grafana_admin_password: admin + grafana_dashboard_list: + - idr-per-server + - idr-sessions + - idr-vertical diff --git a/ansible/management-prometheus.yml b/ansible/management-prometheus.yml index fb246613..a3a12dc4 100644 --- a/ansible/management-prometheus.yml +++ b/ansible/management-prometheus.yml @@ -18,24 +18,144 @@ - role: openmicroscopy.omero-common tasks: - # TODO: Disabled due to https://github.com/prometheus/jmx_exporter/issues/156 + # TODO: Requires a custom build from HEAD - name: omero prometheus agent become: yes copy: content: | - # Disabled due to https://github.com/prometheus/jmx_exporter/issues/156 - #config set -- omero.jvmcfg.append.blitz "-javaagent:{{ jmx_javaagent }}=9180:/etc/prometheus/jmx-default-config.yml" - #config set -- omero.jvmcfg.append.indexer "-javaagent:{{ jmx_javaagent }}=9181:/etc/prometheus/jmx-default-config.yml" - #config set -- omero.jvmcfg.append.pixeldata "-javaagent:{{ jmx_javaagent }}=9182:/etc/prometheus/jmx-default-config.yml" + # Requires https://github.com/prometheus/jmx_exporter/pull/162 + config set -- omero.jvmcfg.append.blitz "-javaagent:{{ jmx_javaagent }}=9180:/etc/prometheus/jmx-default-config.yml" + config set -- omero.jvmcfg.append.indexer "-javaagent:{{ jmx_javaagent }}=9181:/etc/prometheus/jmx-default-config.yml" + config set -- omero.jvmcfg.append.pixeldata "-javaagent:{{ jmx_javaagent }}=9182:/etc/prometheus/jmx-default-config.yml" dest: "{{ omero_common_basedir }}/server/config/prometheus.omero" notify: - restart 
omero-server + - name: omero-web django prometheus install + become: yes + pip: + #name: django_prometheus + name: https://github.com/IDR/django-prometheus/archive/v1.0.10-IDR1.zip + state: present + #version: v + virtualenv: "{{ omero_common_basedir }}/web/venv" + virtualenv_site_packages: yes + notify: + - restart omero-web + + - name: omero-web django prometheus configure + become: yes + copy: + content: | + config append -- omero.web.middleware '{"index":0, "class": "django_prometheus.middleware.PrometheusBeforeMiddleware"}' + config append -- omero.web.middleware '{"index":1000, "class": "django_prometheus.middleware.PrometheusAfterMiddleware"}' + config append -- omero.web.apps '"django_prometheus"' + + config set -- omero.web.wsgi_args '--config file:/opt/omero/web/config/gunicorn-config.py' + dest: "{{ omero_common_basedir }}/web/config/django-prometheus.omero" + notify: + - restart omero-web + + - name: omero-web gunicorn prometheus configure + become: yes + copy: + content: | + from prometheus_client import multiprocess + def child_exit(server, worker): + multiprocess.mark_process_dead(worker.pid) + dest: "{{ omero_common_basedir }}/web/config/gunicorn-config.py" + notify: + - restart omero-web + + - name: omero-web service.d directory + become: yes + file: + path: /etc/systemd/system/omero-web.service.d + state: directory + + - name: omero-web service.d prometheus + become: yes + copy: + content: | + [Service] + Environment="prometheus_multiproc_dir=/opt/omero/web/OMERO.web/var/prometheus" + ExecStartPre=/bin/sh -c '/usr/bin/rm -rf "$prometheus_multiproc_dir" && \ + /usr/bin/mkdir -p "$prometheus_multiproc_dir"' + dest: /etc/systemd/system/omero-web.service.d/prometheus.conf + notify: + - restart omero-web + vars: # prometheus-jmx automatically creates this: jmx_javaagent: /opt/prometheus/jars/jmx_prometheus_javaagent.jar +# prometheus-omero-py session monitoring +# TODO: Consider moving this into a role +- hosts: "{{ idr_environment | default('idr') 
}}-omero-hosts" + + roles: + - role: openmicroscopy.versioncontrol-utils + + tasks: + + # TODO: If we keep this move to a separate repo + - name: prometheus-omero-py | install prometheus-omero-py + become: yes + git: + dest: /opt/prometheus-omero-py/src + force: yes + repo: https://github.com/IDR/omero-prometheus-tools.git + version: 0.0.1 + notify: + - restart prometheus-omero-py + + - name: prometheus-omero-py | setup virtualenv + become: yes + pip: + requirements: /opt/prometheus-omero-py/src/sessions/requirements.txt + state: present + virtualenv: /opt/prometheus-omero-py/venv + virtualenv_site_packages: yes + notify: + - restart prometheus-omero-py + + - name: prometheus-omero-py | systemd service + become: yes + copy: + dest: /etc/systemd/system/prometheus-omero-py.service + src: /opt/prometheus-omero-py/src/sessions/prometheus-omero-py-metrics.service + remote_src: True + notify: + - restart prometheus-omero-py + + - name: prometheus-omero-py | options + become: yes + copy: + content: > + OPTIONS="--host localhost --listen 9171 --verbose --interval 15" + dest: /etc/sysconfig/prometheus-omero-py + notify: + - restart prometheus-omero-py + + - name: prometheus-omero-py | enable service + become: yes + systemd: + daemon_reload: yes + enabled: yes + name: prometheus-omero-py + state: started + + handlers: + - name: restart prometheus-omero-py + become: yes + systemd: + daemon_reload: yes + enabled: yes + name: prometheus-omero-py + state: restarted + + - hosts: > {{ idr_environment | default('idr') }}-dockermanager-hosts {{ idr_environment | default('idr') }}-dockerworker-hosts @@ -70,23 +190,29 @@ jobname: node-exporter # TODO: Disabled due to https://github.com/prometheus/jmx_exporter/issues/156 +# Currently testing using a custom build from HEAD +# https://github.com/prometheus/jmx_exporter/pull/162 +# https://github.com/prometheus/jmx_exporter/commit/5b180affbefa445088d4608b9640ca404ce91ae2 - groupname: blitz - # groups: - # - "{{ idr_environment | 
default('idr') + '-omero-hosts' }}" + groups: + - "{{ idr_environment | default('idr') + '-omero-hosts' }}" port: 9180 jobname: jmx-blitz - - groupname: indexer - # groups: - # - "{{ idr_environment | default('idr') + '-omero-hosts' }}" - port: 9181 - jobname: jmx-indexer - - - groupname: pixeldata - # groups: - # - "{{ idr_environment | default('idr') + '-omero-hosts' }}" - port: 9182 - jobname: jmx-pixeldata + # TODO: indexer and pixeldata are currently broken (fail to start) on + # metadata53 due to the readonly work. These two targets should be + # reenabled when they are fixed: + # - groupname: indexer + # groups: + # - "{{ idr_environment | default('idr') + '-omero-hosts' }}" + # port: 9181 + # jobname: jmx-indexer + # + # - groupname: pixeldata + # groups: + # - "{{ idr_environment | default('idr') + '-omero-hosts' }}" + # port: 9182 + # jobname: jmx-pixeldata - groupname: cadvisor groups: @@ -95,6 +221,19 @@ port: 9280 jobname: cadvisor-docker + - groupname: omero-web + groups: + - "{{ idr_environment | default('idr') + '-omero-hosts' }}" + port: 80 + jobname: django + metrics_path: /django_prometheus/metrics + + - groupname: omero-sessions + groups: + - "{{ idr_environment | default('idr') + '-omero-hosts' }}" + port: 9171 + jobname: omero-sessions + prometheus_http_2xx_internal_targets: > {{ (groups[(idr_environment | default('idr')) + '-omero-hosts'] + diff --git a/ansible/management.yml b/ansible/management.yml index 0c145929..3158b320 100644 --- a/ansible/management.yml +++ b/ansible/management.yml @@ -12,7 +12,7 @@ - role: openmicroscopy.basedeps # munin requires EPEL - role: openmicroscopy.munin munin_slack_token: "{{ idr_secret_management_slack_token | default(None) }}" - munin_slack_channel: "#idr-notify" + munin_slack_channel: "#idr-notify-{{ idr_environment | default('idr') }}" munin_slack_username: "{{ idr_environment | default('idr') }} Munin Notification" munin_slack_emoji: ":pony:" munin_slack_url: http://{{ ansible_host }}/munin/problems.html
@@ -81,7 +81,7 @@ roles: - role: openmicroscopy.omero-logmonitor omero_logmonitor_slack_token: "{{ idr_secret_omero_logmonitor_slack_token | default(None) }}" - omero_logmonitor_slack_channel: "#idr-logs" + omero_logmonitor_slack_channel: idr-logs-{{ idr_environment | default('idr') }} # Docker slack notifications @@ -91,7 +91,7 @@ roles: - role: openmicroscopy.docker-slack-notifier - docker_slack_notifier_channel: "#idr-deployment" + docker_slack_notifier_channel: "#idr-logs-{{ idr_environment | default('idr') }}" docker_slack_notifier_username: "{{ ansible_hostname }}" docker_slack_notifier_icon: ":docker:" docker_slack_notifier_token: "{{ idr_secret_management_slack_webhook | default(None) }}" diff --git a/ansible/requirements.yml b/ansible/requirements.yml index 041b87f0..13788031 100644 --- a/ansible/requirements.yml +++ b/ansible/requirements.yml @@ -37,7 +37,7 @@ version: 2.0.1 - src: openmicroscopy.jekyll-build - version: 1.1.1 + version: 1.2.0 - src: openmicroscopy.local-accounts version: 1.0.1 @@ -67,7 +67,7 @@ version: 1.0.0 - src: openmicroscopy.nginx-proxy - version: 1.3.0 + version: 1.5.1 - src: openmicroscopy.nginx-ssl-selfsigned version: 1.0.0 @@ -100,7 +100,7 @@ version: 1.1.0 - src: openmicroscopy.postgresql - version: 2.0.0 + version: 3.0.0-m1 - src: openmicroscopy.python-pydata version: 1.0.0 @@ -115,7 +115,7 @@ version: 1.0.0 - src: openmicroscopy.selinux-utils - version: 1.0.2 + version: 1.0.3 - src: openmicroscopy.storage-volume-initialise version: 1.0.0 @@ -166,13 +166,17 @@ version: 0.0.2 - name: openmicroscopy.prometheus - src: https://github.com/openmicroscopy/ansible-role-prometheus/archive/0.1.0.tar.gz - version: 0.1.1 + src: https://github.com/openmicroscopy/ansible-role-prometheus/archive/0.2.0.tar.gz + version: 0.2.0 - name: openmicroscopy.prometheus-jmx - src: https://github.com/openmicroscopy/ansible-role-prometheus-jmx/archive/0.1.0.tar.gz - version: 0.1.0 + src: 
https://github.com/openmicroscopy/ansible-role-prometheus-jmx/archive/0.1.1.tar.gz + version: 0.1.1 - name: openmicroscopy.prometheus-node src: https://github.com/openmicroscopy/ansible-role-prometheus-node/archive/0.1.1.tar.gz version: 0.1.1 + +- name: openmicroscopy.prometheus-postgres + src: https://github.com/openmicroscopy/ansible-role-prometheus-postgres/archive/0.1.0.tar.gz + version: 0.1.0 diff --git a/ansible/roles/kubernetes/defaults/main.yml b/ansible/roles/kubernetes/defaults/main.yml index 9afdafc0..31a9ef62 100644 --- a/ansible/roles/kubernetes/defaults/main.yml +++ b/ansible/roles/kubernetes/defaults/main.yml @@ -23,3 +23,5 @@ kubernetes_init_args: >- , '') }} kubernetes_join_args: --token {{ kubernetes_token }} {{ kubernetes_master }} + +kubernetes_kube_version: '' diff --git a/ansible/roles/kubernetes/tasks/common.yml b/ansible/roles/kubernetes/tasks/common.yml index e25065fe..44abaadb 100644 --- a/ansible/roles/kubernetes/tasks/common.yml +++ b/ansible/roles/kubernetes/tasks/common.yml @@ -46,9 +46,9 @@ become: yes yum: name: - - kubeadm - - kubectl - - kubelet + - kubeadm{{ (kubernetes_kube_version | length > 0) | ternary('-', '') + kubernetes_kube_version }} + - kubectl{{ (kubernetes_kube_version | length > 0) | ternary('-', '') + kubernetes_kube_version }} + - kubelet{{ (kubernetes_kube_version | length > 0) | ternary('-', '') + kubernetes_kube_version }} - kubernetes-cni state: present diff --git a/scripts/os-idr-delete.sh b/scripts/os-idr-delete.sh index 3eace811..2d3b9a2c 100755 --- a/scripts/os-idr-delete.sh +++ b/scripts/os-idr-delete.sh @@ -6,42 +6,41 @@ set -eu idr_environment=$1 -for server in proxy omero database management; do - echo openstack server delete ${idr_environment}-${server} -done -echo -for server in omero database dockermanager dockerworker; do - echo openstack server delete ${idr_environment}-a-${server} +for server in \ + proxy \ + omeroreadonly-1 \ + omeroreadonly-2 \ + omeroreadwrite \ + database \ + dockermanager \ + 
dockerworker-1 \ + dockerworker-2 \ + management \ + ; do + echo openstack server delete ${idr_environment}-${server} done echo -for volume in proxy-nginxcache omero-data database-db; do +for volume in \ + proxy-nginxcache \ + omeroreadwrite-data \ + database-db \ + dockermanager-data \ + ; do echo openstack volume delete ${idr_environment}-${volume} done echo -for volume in omero-data database-db dockermanager-data; do - echo openstack volume delete ${idr_environment}-a-${volume} -done -echo - -for router in ${idr_environment}-router ${idr_environment}-a-router; do - echo "for port in \$(openstack port list --router ${router} -f value | cut -d\ -f1); do" - echo " openstack router remove port ${router} \$port" - echo "done" - echo -done +echo "for port in \$(openstack port list --router ${idr_environment}-router -f value | cut -d\ -f1); do" +echo " openstack router remove port ${idr_environment}-router \$port" +echo "done" echo -for router in ${idr_environment}-router ${idr_environment}-a-router; do - echo openstack router delete ${router} -done +echo openstack router delete ${idr_environment}-router echo -for network in ${idr_environment} ${idr_environment}-a; do - echo openstack network delete ${network} -done +echo openstack network delete ${idr_environment} echo echo openstack server list -f yaml