From 9cb1c58c502fbc4e77de684bcc81de6ea3e74fc0 Mon Sep 17 00:00:00 2001
From: Greg Meyer
Date: Fri, 7 Oct 2016 18:11:18 -0400
Subject: [PATCH] Adds Elastic Integration
---
 .travis.yml | 11 +
 circle.yml | 1 +
 elastic/README.md | 32 ++
 elastic/check.py | 732 ++++++++++++++++++++++++++++++++++++++
 elastic/ci/elastic.rake | 87 +++++
 elastic/ci/elastic.yaml | 2 +
 elastic/conf.yaml.example | 43 +++
 elastic/manifest.json | 10 +
 elastic/metadata.csv | 164 +++++++++
 elastic/requirements.txt | 1 +
 elastic/test_elastic.py | 593 ++++++++++++++++++++++++++++++
 11 files changed, 1676 insertions(+)
 create mode 100644 elastic/README.md
 create mode 100644 elastic/check.py
 create mode 100644 elastic/ci/elastic.rake
 create mode 100644 elastic/ci/elastic.yaml
 create mode 100644 elastic/conf.yaml.example
 create mode 100644 elastic/manifest.json
 create mode 100644 elastic/metadata.csv
 create mode 100644 elastic/requirements.txt
 create mode 100644 elastic/test_elastic.py

diff --git a/.travis.yml b/.travis.yml
index 3c080c4047e5a..c41732363253b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -59,6 +59,17 @@ env:
   - TRAVIS_FLAVOR=pgbouncer FLAVOR_VERSION=latest
   - TRAVIS_FLAVOR=apache FLAVOR_VERSION=2.4.12
   - TRAVIS_FLAVOR=couch FLAVOR_VERSION=1.6.1
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=0.90.13
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=1.0.3
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=1.1.2
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=1.2.4
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=1.3.9
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=1.4.5
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=1.5.2
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=1.6.2
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=1.7.4
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=2.0.2
+  - TRAVIS_FLAVOR=elastic FLAVOR_VERSION=2.1.1
 # END OF TRAVIS MATRIX

 before_install:

diff --git a/circle.yml b/circle.yml
index 4237ad710bdee..27a15cb204742 100644
--- a/circle.yml
+++ b/circle.yml
@@ -57,6 +57,7 @@ test:
     - rake ci:run[pgbouncer]
     - rake ci:run[apache]
     - rake ci:run[couch]
+    - rake ci:run[elastic]
     - bundle exec rake requirements
   post:
     - if [[ $(docker ps -a -q) ]]; then docker stop $(docker ps -a -q); fi

diff --git a/elastic/README.md b/elastic/README.md
new file mode 100644
index 0000000000000..2ef69485ec199
--- /dev/null
+++ b/elastic/README.md
@@ -0,0 +1,32 @@
+# Elastic Integration
+
+## Overview
+
+Get metrics from the Elasticsearch service in real time to:
+
+* Visualize and monitor Elasticsearch states
+* Be notified about Elasticsearch failovers and events.
+
+## Installation
+
+Install the `dd-check-elastic` package manually or with your favorite configuration manager.
+
+## Configuration
+
+Edit the `elastic.yaml` file to point to your server and port, and set the instances to monitor.
+
+## Validation
+
+When you run `datadog-agent info` you should see something like the following:
+
+    Checks
+    ======
+
+      elastic
+      -----------
+          - instance #0 [OK]
+          - Collected 39 metrics, 0 events & 7 service checks
+
+## Compatibility
+
+The elastic check is compatible with all major platforms.

diff --git a/elastic/check.py b/elastic/check.py
new file mode 100644
index 0000000000000..4d1c2a7bc5f58
--- /dev/null
+++ b/elastic/check.py
@@ -0,0 +1,732 @@
+# (C) Datadog, Inc.
2010-2016 +# All rights reserved +# Licensed under Simplified BSD License (see LICENSE) + +# stdlib +from collections import defaultdict, namedtuple +import time +import urlparse + +# 3p +import requests + +# project +from checks import AgentCheck +from config import _is_affirmative +from util import headers + + +class NodeNotFound(Exception): + pass + + +ESInstanceConfig = namedtuple( + 'ESInstanceConfig', [ + 'pshard_stats', + 'cluster_stats', + 'password', + 'service_check_tags', + 'tags', + 'timeout', + 'url', + 'username', + 'pending_task_stats', + 'ssl_verify', + 'ssl_cert', + 'ssl_key', + ]) + + +class ESCheck(AgentCheck): + SERVICE_CHECK_CONNECT_NAME = 'elasticsearch.can_connect' + SERVICE_CHECK_CLUSTER_STATUS = 'elasticsearch.cluster_health' + + DEFAULT_TIMEOUT = 5 + + # Clusterwise metrics, pre aggregated on ES, compatible with all ES versions + PRIMARY_SHARD_METRICS = { + "elasticsearch.primaries.docs.count": ("gauge", "_all.primaries.docs.count"), + "elasticsearch.primaries.docs.deleted": ("gauge", "_all.primaries.docs.deleted"), + "elasticsearch.primaries.store.size": ("gauge", "_all.primaries.store.size_in_bytes"), + "elasticsearch.primaries.indexing.index.total": ("gauge", "_all.primaries.indexing.index_total"), + "elasticsearch.primaries.indexing.index.time": ("gauge", "_all.primaries.indexing.index_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.indexing.index.current": ("gauge", "_all.primaries.indexing.index_current"), + "elasticsearch.primaries.indexing.delete.total": ("gauge", "_all.primaries.indexing.delete_total"), + "elasticsearch.primaries.indexing.delete.time": ("gauge", "_all.primaries.indexing.delete_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.indexing.delete.current": ("gauge", "_all.primaries.indexing.delete_current"), + "elasticsearch.primaries.get.total": ("gauge", "_all.primaries.get.total"), + "elasticsearch.primaries.get.time": ("gauge", "_all.primaries.get.time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.get.current": ("gauge", "_all.primaries.get.current"), + "elasticsearch.primaries.get.exists.total": ("gauge", "_all.primaries.get.exists_total"), + "elasticsearch.primaries.get.exists.time": ("gauge", "_all.primaries.get.exists_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.get.missing.total": ("gauge", "_all.primaries.get.missing_total"), + "elasticsearch.primaries.get.missing.time": ("gauge", "_all.primaries.get.missing_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.search.query.total": ("gauge", "_all.primaries.search.query_total"), + "elasticsearch.primaries.search.query.time": ("gauge", "_all.primaries.search.query_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.search.query.current": ("gauge", "_all.primaries.search.query_current"), + "elasticsearch.primaries.search.fetch.total": ("gauge", "_all.primaries.search.fetch_total"), + "elasticsearch.primaries.search.fetch.time": ("gauge", "_all.primaries.search.fetch_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.search.fetch.current": ("gauge", "_all.primaries.search.fetch_current") + } + + PRIMARY_SHARD_METRICS_POST_1_0 = { + "elasticsearch.primaries.merges.current": ("gauge", "_all.primaries.merges.current"), + "elasticsearch.primaries.merges.current.docs": ("gauge", "_all.primaries.merges.current_docs"), + "elasticsearch.primaries.merges.current.size": ("gauge", "_all.primaries.merges.current_size_in_bytes"), + 
"elasticsearch.primaries.merges.total": ("gauge", "_all.primaries.merges.total"), + "elasticsearch.primaries.merges.total.time": ("gauge", "_all.primaries.merges.total_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.merges.total.docs": ("gauge", "_all.primaries.merges.total_docs"), + "elasticsearch.primaries.merges.total.size": ("gauge", "_all.primaries.merges.total_size_in_bytes"), + "elasticsearch.primaries.refresh.total": ("gauge", "_all.primaries.refresh.total"), + "elasticsearch.primaries.refresh.total.time": ("gauge", "_all.primaries.refresh.total_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.flush.total": ("gauge", "_all.primaries.flush.total"), + "elasticsearch.primaries.flush.total.time": ("gauge", "_all.primaries.flush.total_time_in_millis", lambda v: float(v)/1000) + } + + STATS_METRICS = { # Metrics that are common to all Elasticsearch versions + "elasticsearch.docs.count": ("gauge", "indices.docs.count"), + "elasticsearch.docs.deleted": ("gauge", "indices.docs.deleted"), + "elasticsearch.store.size": ("gauge", "indices.store.size_in_bytes"), + "elasticsearch.indexing.index.total": ("gauge", "indices.indexing.index_total"), + "elasticsearch.indexing.index.time": ("gauge", "indices.indexing.index_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.indexing.index.current": ("gauge", "indices.indexing.index_current"), + "elasticsearch.indexing.delete.total": ("gauge", "indices.indexing.delete_total"), + "elasticsearch.indexing.delete.time": ("gauge", "indices.indexing.delete_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.indexing.delete.current": ("gauge", "indices.indexing.delete_current"), + "elasticsearch.get.total": ("gauge", "indices.get.total"), + "elasticsearch.get.time": ("gauge", "indices.get.time_in_millis", lambda v: float(v)/1000), + "elasticsearch.get.current": ("gauge", "indices.get.current"), + "elasticsearch.get.exists.total": ("gauge", "indices.get.exists_total"), + "elasticsearch.get.exists.time": ("gauge", "indices.get.exists_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.get.missing.total": ("gauge", "indices.get.missing_total"), + "elasticsearch.get.missing.time": ("gauge", "indices.get.missing_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.search.query.total": ("gauge", "indices.search.query_total"), + "elasticsearch.search.query.time": ("gauge", "indices.search.query_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.search.query.current": ("gauge", "indices.search.query_current"), + "elasticsearch.search.fetch.total": ("gauge", "indices.search.fetch_total"), + "elasticsearch.search.fetch.time": ("gauge", "indices.search.fetch_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.search.fetch.current": ("gauge", "indices.search.fetch_current"), + "elasticsearch.indices.segments.count": ("gauge", "indices.segments.count"), + "elasticsearch.indices.segments.memory_in_bytes": ("gauge", "indices.segments.memory_in_bytes"), + "elasticsearch.merges.current": ("gauge", "indices.merges.current"), + "elasticsearch.merges.current.docs": ("gauge", "indices.merges.current_docs"), + "elasticsearch.merges.current.size": ("gauge", "indices.merges.current_size_in_bytes"), + "elasticsearch.merges.total": ("gauge", "indices.merges.total"), + "elasticsearch.merges.total.time": ("gauge", "indices.merges.total_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.merges.total.docs": ("gauge", "indices.merges.total_docs"), + "elasticsearch.merges.total.size": 
("gauge", "indices.merges.total_size_in_bytes"), + "elasticsearch.refresh.total": ("gauge", "indices.refresh.total"), + "elasticsearch.refresh.total.time": ("gauge", "indices.refresh.total_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.flush.total": ("gauge", "indices.flush.total"), + "elasticsearch.flush.total.time": ("gauge", "indices.flush.total_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.process.open_fd": ("gauge", "process.open_file_descriptors"), + "elasticsearch.transport.rx_count": ("gauge", "transport.rx_count"), + "elasticsearch.transport.tx_count": ("gauge", "transport.tx_count"), + "elasticsearch.transport.rx_size": ("gauge", "transport.rx_size_in_bytes"), + "elasticsearch.transport.tx_size": ("gauge", "transport.tx_size_in_bytes"), + "elasticsearch.transport.server_open": ("gauge", "transport.server_open"), + "elasticsearch.thread_pool.bulk.active": ("gauge", "thread_pool.bulk.active"), + "elasticsearch.thread_pool.bulk.threads": ("gauge", "thread_pool.bulk.threads"), + "elasticsearch.thread_pool.bulk.queue": ("gauge", "thread_pool.bulk.queue"), + "elasticsearch.thread_pool.bulk.rejected": ("rate", "thread_pool.bulk.rejected"), + "elasticsearch.thread_pool.flush.active": ("gauge", "thread_pool.flush.active"), + "elasticsearch.thread_pool.flush.threads": ("gauge", "thread_pool.flush.threads"), + "elasticsearch.thread_pool.flush.queue": ("gauge", "thread_pool.flush.queue"), + "elasticsearch.thread_pool.flush.rejected": ("rate", "thread_pool.flush.rejected"), + "elasticsearch.thread_pool.generic.active": ("gauge", "thread_pool.generic.active"), + "elasticsearch.thread_pool.generic.threads": ("gauge", "thread_pool.generic.threads"), + "elasticsearch.thread_pool.generic.queue": ("gauge", "thread_pool.generic.queue"), + "elasticsearch.thread_pool.generic.rejected": ("rate", "thread_pool.generic.rejected"), + "elasticsearch.thread_pool.get.active": ("gauge", "thread_pool.get.active"), + "elasticsearch.thread_pool.get.threads": ("gauge", "thread_pool.get.threads"), + "elasticsearch.thread_pool.get.queue": ("gauge", "thread_pool.get.queue"), + "elasticsearch.thread_pool.get.rejected": ("rate", "thread_pool.get.rejected"), + "elasticsearch.thread_pool.index.active": ("gauge", "thread_pool.index.active"), + "elasticsearch.thread_pool.index.threads": ("gauge", "thread_pool.index.threads"), + "elasticsearch.thread_pool.index.queue": ("gauge", "thread_pool.index.queue"), + "elasticsearch.thread_pool.index.rejected": ("rate", "thread_pool.index.rejected"), + "elasticsearch.thread_pool.management.active": ("gauge", "thread_pool.management.active"), + "elasticsearch.thread_pool.management.threads": ("gauge", "thread_pool.management.threads"), + "elasticsearch.thread_pool.management.queue": ("gauge", "thread_pool.management.queue"), + "elasticsearch.thread_pool.management.rejected": ("rate", "thread_pool.management.rejected"), + "elasticsearch.thread_pool.percolate.active": ("gauge", "thread_pool.percolate.active"), + "elasticsearch.thread_pool.percolate.threads": ("gauge", "thread_pool.percolate.threads"), + "elasticsearch.thread_pool.percolate.queue": ("gauge", "thread_pool.percolate.queue"), + "elasticsearch.thread_pool.percolate.rejected": ("rate", "thread_pool.percolate.rejected"), + "elasticsearch.thread_pool.refresh.active": ("gauge", "thread_pool.refresh.active"), + "elasticsearch.thread_pool.refresh.threads": ("gauge", "thread_pool.refresh.threads"), + "elasticsearch.thread_pool.refresh.queue": ("gauge", "thread_pool.refresh.queue"), + 
"elasticsearch.thread_pool.refresh.rejected": ("rate", "thread_pool.refresh.rejected"), + "elasticsearch.thread_pool.search.active": ("gauge", "thread_pool.search.active"), + "elasticsearch.thread_pool.search.threads": ("gauge", "thread_pool.search.threads"), + "elasticsearch.thread_pool.search.queue": ("gauge", "thread_pool.search.queue"), + "elasticsearch.thread_pool.search.rejected": ("rate", "thread_pool.search.rejected"), + "elasticsearch.thread_pool.snapshot.active": ("gauge", "thread_pool.snapshot.active"), + "elasticsearch.thread_pool.snapshot.threads": ("gauge", "thread_pool.snapshot.threads"), + "elasticsearch.thread_pool.snapshot.queue": ("gauge", "thread_pool.snapshot.queue"), + "elasticsearch.thread_pool.snapshot.rejected": ("rate", "thread_pool.snapshot.rejected"), + "elasticsearch.thread_pool.suggest.active": ("gauge", "thread_pool.suggest.active"), + "elasticsearch.thread_pool.suggest.threads": ("gauge", "thread_pool.suggest.threads"), + "elasticsearch.thread_pool.suggest.queue": ("gauge", "thread_pool.suggest.queue"), + "elasticsearch.thread_pool.suggest.rejected": ("rate", "thread_pool.suggest.rejected"), + "elasticsearch.thread_pool.warmer.active": ("gauge", "thread_pool.warmer.active"), + "elasticsearch.thread_pool.warmer.threads": ("gauge", "thread_pool.warmer.threads"), + "elasticsearch.thread_pool.warmer.queue": ("gauge", "thread_pool.warmer.queue"), + "elasticsearch.thread_pool.warmer.rejected": ("rate", "thread_pool.warmer.rejected"), + "elasticsearch.http.current_open": ("gauge", "http.current_open"), + "elasticsearch.http.total_opened": ("gauge", "http.total_opened"), + "jvm.mem.heap_committed": ("gauge", "jvm.mem.heap_committed_in_bytes"), + "jvm.mem.heap_used": ("gauge", "jvm.mem.heap_used_in_bytes"), + "jvm.mem.heap_in_use": ("gauge", "jvm.mem.heap_used_percent"), + "jvm.mem.heap_max": ("gauge", "jvm.mem.heap_max_in_bytes"), + "jvm.mem.non_heap_committed": ("gauge", "jvm.mem.non_heap_committed_in_bytes"), + "jvm.mem.non_heap_used": ("gauge", "jvm.mem.non_heap_used_in_bytes"), + "jvm.threads.count": ("gauge", "jvm.threads.count"), + "jvm.threads.peak_count": ("gauge", "jvm.threads.peak_count"), + "elasticsearch.fs.total.total_in_bytes": ("gauge", "fs.total.total_in_bytes"), + "elasticsearch.fs.total.free_in_bytes": ("gauge", "fs.total.free_in_bytes"), + "elasticsearch.fs.total.available_in_bytes": ("gauge", "fs.total.available_in_bytes"), + } + + JVM_METRICS_POST_0_90_10 = { + "jvm.gc.collectors.young.count": ("gauge", "jvm.gc.collectors.young.collection_count"), + "jvm.gc.collectors.young.collection_time": ("gauge", "jvm.gc.collectors.young.collection_time_in_millis", lambda v: float(v)/1000), + "jvm.gc.collectors.old.count": ("gauge", "jvm.gc.collectors.old.collection_count"), + "jvm.gc.collectors.old.collection_time": ("gauge", "jvm.gc.collectors.old.collection_time_in_millis", lambda v: float(v)/1000) + } + + JVM_METRICS_PRE_0_90_10 = { + "jvm.gc.concurrent_mark_sweep.count": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_count"), + "jvm.gc.concurrent_mark_sweep.collection_time": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_time_in_millis", lambda v: float(v)/1000), + "jvm.gc.par_new.count": ("gauge", "jvm.gc.collectors.ParNew.collection_count"), + "jvm.gc.par_new.collection_time": ("gauge", "jvm.gc.collectors.ParNew.collection_time_in_millis", lambda v: float(v)/1000), + "jvm.gc.collection_count": ("gauge", "jvm.gc.collection_count"), + "jvm.gc.collection_time": ("gauge", "jvm.gc.collection_time_in_millis", lambda v: 
float(v)/1000), + } + + ADDITIONAL_METRICS_POST_0_90_5 = { + "elasticsearch.search.fetch.open_contexts": ("gauge", "indices.search.open_contexts"), + "elasticsearch.fielddata.size": ("gauge", "indices.fielddata.memory_size_in_bytes"), + "elasticsearch.fielddata.evictions": ("gauge", "indices.fielddata.evictions"), + } + + ADDITIONAL_METRICS_POST_0_90_5_PRE_2_0 = { + "elasticsearch.cache.filter.evictions": ("gauge", "indices.filter_cache.evictions"), + "elasticsearch.cache.filter.size": ("gauge", "indices.filter_cache.memory_size_in_bytes"), + "elasticsearch.id_cache.size": ("gauge", "indices.id_cache.memory_size_in_bytes"), + } + + ADDITIONAL_METRICS_PRE_0_90_5 = { + "elasticsearch.cache.field.evictions": ("gauge", "indices.cache.field_evictions"), + "elasticsearch.cache.field.size": ("gauge", "indices.cache.field_size_in_bytes"), + "elasticsearch.cache.filter.count": ("gauge", "indices.cache.filter_count"), + "elasticsearch.cache.filter.evictions": ("gauge", "indices.cache.filter_evictions"), + "elasticsearch.cache.filter.size": ("gauge", "indices.cache.filter_size_in_bytes"), + } + + ADDITIONAL_METRICS_POST_1_0_0 = { + "elasticsearch.indices.translog.size_in_bytes": ("gauge", "indices.translog.size_in_bytes"), + "elasticsearch.indices.translog.operations": ("gauge", "indices.translog.operations"), + } + + ADDITIONAL_METRICS_1_x = { # Stats are only valid for v1.x + "elasticsearch.fs.total.disk_reads": ("rate", "fs.total.disk_reads"), + "elasticsearch.fs.total.disk_writes": ("rate", "fs.total.disk_writes"), + "elasticsearch.fs.total.disk_io_op": ("rate", "fs.total.disk_io_op"), + "elasticsearch.fs.total.disk_read_size_in_bytes": ("gauge", "fs.total.disk_read_size_in_bytes"), + "elasticsearch.fs.total.disk_write_size_in_bytes": ("gauge", "fs.total.disk_write_size_in_bytes"), + "elasticsearch.fs.total.disk_io_size_in_bytes": ("gauge", "fs.total.disk_io_size_in_bytes"), + } + + ADDITIONAL_METRICS_POST_1_3_0 = { + "elasticsearch.indices.segments.index_writer_memory_in_bytes": ("gauge", "indices.segments.index_writer_memory_in_bytes"), + "elasticsearch.indices.segments.version_map_memory_in_bytes": ("gauge", "indices.segments.version_map_memory_in_bytes"), + } + + ADDITIONAL_METRICS_POST_1_4_0 = { + "elasticsearch.indices.indexing.throttle_time": ("rate", "indices.indexing.throttle_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.indices.query_cache.memory_size_in_bytes": ("gauge", "indices.query_cache.memory_size_in_bytes"), + "elasticsearch.indices.query_cache.hit_count": ("rate", "indices.query_cache.hit_count"), + "elasticsearch.indices.query_cache.miss_count": ("rate", "indices.query_cache.miss_count"), + "elasticsearch.indices.query_cache.evictions": ("rate", "indices.query_cache.evictions"), + "elasticsearch.indices.segments.index_writer_max_memory_in_bytes": ("gauge", "indices.segments.index_writer_max_memory_in_bytes"), + "elasticsearch.indices.segments.fixed_bit_set_memory_in_bytes": ("gauge", "indices.segments.fixed_bit_set_memory_in_bytes"), + "elasticsearch.breakers.fielddata.estimated_size_in_bytes": ("gauge", "breakers.fielddata.estimated_size_in_bytes"), + "elasticsearch.breakers.fielddata.overhead": ("gauge", "breakers.fielddata.overhead"), + "elasticsearch.breakers.fielddata.tripped": ("rate", "breakers.fielddata.tripped"), + "elasticsearch.breakers.parent.estimated_size_in_bytes": ("gauge", "breakers.parent.estimated_size_in_bytes"), + "elasticsearch.breakers.parent.overhead": ("gauge", "breakers.parent.overhead"), + "elasticsearch.breakers.parent.tripped": 
("rate", "breakers.parent.tripped"), + "elasticsearch.breakers.request.estimated_size_in_bytes": ("gauge", "breakers.request.estimated_size_in_bytes"), + "elasticsearch.breakers.request.overhead": ("gauge", "breakers.request.overhead"), + "elasticsearch.breakers.request.tripped": ("rate", "breakers.request.tripped"), + "elasticsearch.thread_pool.listener.active": ("gauge", "thread_pool.listener.active"), + "elasticsearch.thread_pool.listener.threads": ("gauge", "thread_pool.listener.threads"), + "elasticsearch.thread_pool.listener.queue": ("gauge", "thread_pool.listener.queue"), + "elasticsearch.thread_pool.listener.rejected": ("rate", "thread_pool.listener.rejected"), + } + + ADDITIONAL_METRICS_POST_1_5_0 = { + "elasticsearch.indices.recovery.current_as_source": ("gauge", "indices.recovery.current_as_source"), + "elasticsearch.indices.recovery.current_as_target": ("gauge", "indices.recovery.current_as_target"), + "elasticsearch.indices.recovery.throttle_time": ("rate", "indices.recovery.throttle_time_in_millis", lambda v: float(v)/1000), + } + + ADDITIONAL_METRICS_POST_1_6_0 = { + "elasticsearch.thread_pool.fetch_shard_started.active": ("gauge", "thread_pool.fetch_shard_started.active"), + "elasticsearch.thread_pool.fetch_shard_started.threads": ("gauge", "thread_pool.fetch_shard_started.threads"), + "elasticsearch.thread_pool.fetch_shard_started.queue": ("gauge", "thread_pool.fetch_shard_started.queue"), + "elasticsearch.thread_pool.fetch_shard_started.rejected": ("rate", "thread_pool.fetch_shard_started.rejected"), + "elasticsearch.thread_pool.fetch_shard_store.active": ("gauge", "thread_pool.fetch_shard_store.active"), + "elasticsearch.thread_pool.fetch_shard_store.threads": ("gauge", "thread_pool.fetch_shard_store.threads"), + "elasticsearch.thread_pool.fetch_shard_store.queue": ("gauge", "thread_pool.fetch_shard_store.queue"), + "elasticsearch.thread_pool.fetch_shard_store.rejected": ("rate", "thread_pool.fetch_shard_store.rejected"), + } + + ADDITIONAL_METRICS_PRE_2_0 = { + "elasticsearch.thread_pool.merge.active": ("gauge", "thread_pool.merge.active"), + "elasticsearch.thread_pool.merge.threads": ("gauge", "thread_pool.merge.threads"), + "elasticsearch.thread_pool.merge.queue": ("gauge", "thread_pool.merge.queue"), + "elasticsearch.thread_pool.merge.rejected": ("rate", "thread_pool.merge.rejected"), + } + + ADDITIONAL_METRICS_POST_2_0 = { + # Some of these may very well exist in previous ES versions, but not worth the time/effort + # to find where they were introduced + "elasticsearch.indices.query_cache.cache_size": ("gauge", "indices.query_cache.cache_size"), + "elasticsearch.indices.query_cache.cache_count": ("rate", "indices.query_cache.cache_count"), + "elasticsearch.indices.query_cache.total_count": ("rate", "indices.query_cache.total_count"), + "elasticsearch.indices.segments.doc_values_memory_in_bytes": ("gauge", "indices.segments.doc_values_memory_in_bytes"), + "elasticsearch.indices.segments.norms_memory_in_bytes": ("gauge", "indices.segments.norms_memory_in_bytes"), + "elasticsearch.indices.segments.stored_fields_memory_in_bytes": ("gauge", "indices.segments.stored_fields_memory_in_bytes"), + "elasticsearch.indices.segments.term_vectors_memory_in_bytes": ("gauge", "indices.segments.term_vectors_memory_in_bytes"), + "elasticsearch.indices.segments.terms_memory_in_bytes": ("gauge", "indices.segments.terms_memory_in_bytes"), + "elasticsearch.indices.request_cache.memory_size_in_bytes": ("gauge", "indices.request_cache.memory_size_in_bytes"), + 
"elasticsearch.indices.request_cache.evictions": ("rate", "indices.request_cache.evictions"), + "elasticsearch.indices.request_cache.hit_count": ("rate", "indices.request_cache.hit_count"), + "elasticsearch.indices.request_cache.miss_count": ("rate", "indices.request_cache.miss_count"), + } + + ADDITIONAL_METRICS_POST_2_1 = { + "elasticsearch.indices.indexing.index_failed": ("rate", "indices.indexing.index_failed"), + "elasticsearch.thread_pool.force_merge.active": ("gauge", "thread_pool.force_merge.active"), + "elasticsearch.thread_pool.force_merge.threads": ("gauge", "thread_pool.force_merge.threads"), + "elasticsearch.thread_pool.force_merge.queue": ("gauge", "thread_pool.force_merge.queue"), + "elasticsearch.thread_pool.force_merge.rejected": ("rate", "thread_pool.force_merge.rejected"), + } + + CLUSTER_HEALTH_METRICS = { + "elasticsearch.number_of_nodes": ("gauge", "number_of_nodes"), + "elasticsearch.number_of_data_nodes": ("gauge", "number_of_data_nodes"), + "elasticsearch.active_primary_shards": ("gauge", "active_primary_shards"), + "elasticsearch.active_shards": ("gauge", "active_shards"), + "elasticsearch.relocating_shards": ("gauge", "relocating_shards"), + "elasticsearch.initializing_shards": ("gauge", "initializing_shards"), + "elasticsearch.unassigned_shards": ("gauge", "unassigned_shards"), + "elasticsearch.cluster_status": ("gauge", "status", lambda v: {"red": 0, "yellow": 1, "green": 2}.get(v, -1)), + } + + CLUSTER_PENDING_TASKS = { + "elasticsearch.pending_tasks_total": ("gauge", "pending_task_total"), + "elasticsearch.pending_tasks_priority_high": ("gauge", "pending_tasks_priority_high"), + "elasticsearch.pending_tasks_priority_urgent": ("gauge", "pending_tasks_priority_urgent") + } + + SOURCE_TYPE_NAME = 'elasticsearch' + + def __init__(self, name, init_config, agentConfig, instances=None): + AgentCheck.__init__(self, name, init_config, agentConfig, instances) + + # Host status needs to persist across all checks + self.cluster_status = {} + + def get_instance_config(self, instance): + url = instance.get('url') + if url is None: + raise Exception("An url must be specified in the instance") + + pshard_stats = _is_affirmative(instance.get('pshard_stats', False)) + + cluster_stats = _is_affirmative(instance.get('cluster_stats', False)) + if 'is_external' in instance: + cluster_stats = _is_affirmative(instance.get('is_external', False)) + + pending_task_stats = _is_affirmative(instance.get('pending_task_stats', True)) + # Support URLs that have a path in them from the config, for + # backwards-compatibility. 
+        parsed = urlparse.urlparse(url)
+        if parsed[2] != "":
+            url = "%s://%s" % (parsed[0], parsed[1])
+        port = parsed.port
+        host = parsed.hostname
+
+        custom_tags = instance.get('tags', [])
+        service_check_tags = [
+            'host:%s' % host,
+            'port:%s' % port
+        ]
+        service_check_tags.extend(custom_tags)
+
+        # Tag by URL so we can differentiate the metrics
+        # from multiple instances
+        tags = ['url:%s' % url]
+        tags.extend(custom_tags)
+
+        timeout = instance.get('timeout') or self.DEFAULT_TIMEOUT
+
+        config = ESInstanceConfig(
+            pshard_stats=pshard_stats,
+            cluster_stats=cluster_stats,
+            password=instance.get('password'),
+            service_check_tags=service_check_tags,
+            ssl_cert=instance.get('ssl_cert'),
+            ssl_key=instance.get('ssl_key'),
+            ssl_verify=instance.get('ssl_verify'),
+            tags=tags,
+            timeout=timeout,
+            url=url,
+            username=instance.get('username'),
+            pending_task_stats=pending_task_stats
+        )
+        return config
+
+    def check(self, instance):
+        config = self.get_instance_config(instance)
+
+        # Check ES version for this instance and define parameters
+        # (URLs and metrics) accordingly
+        version = self._get_es_version(config)
+
+        health_url, nodes_url, stats_url, pshard_stats_url, pending_tasks_url, stats_metrics, \
+            pshard_stats_metrics = self._define_params(version, config.cluster_stats)
+
+        # Load stats data.
+        # This must happen before other URL processing as the cluster name
+        # is retrieved here, and added to the tag list.
+
+        stats_url = urlparse.urljoin(config.url, stats_url)
+        stats_data = self._get_data(stats_url, config)
+        if stats_data['cluster_name']:
+            # retrieve the cluster name from the data, and append it to the
+            # master tag list.
+            config.tags.append("cluster_name:{}".format(stats_data['cluster_name']))
+        self._process_stats_data(nodes_url, stats_data, stats_metrics, config)
+
+        # Load clusterwise data
+        if config.pshard_stats:
+            pshard_stats_url = urlparse.urljoin(config.url, pshard_stats_url)
+            pshard_stats_data = self._get_data(pshard_stats_url, config)
+            self._process_pshard_stats_data(pshard_stats_data, config, pshard_stats_metrics)
+
+
+        # Load the health data.
+        health_url = urlparse.urljoin(config.url, health_url)
+        health_data = self._get_data(health_url, config)
+        self._process_health_data(health_data, config)
+
+        if config.pending_task_stats:
+            # Load the pending_tasks data.
+            pending_tasks_url = urlparse.urljoin(config.url, pending_tasks_url)
+            pending_tasks_data = self._get_data(pending_tasks_url, config)
+            self._process_pending_tasks_data(pending_tasks_data, config)
+
+        # If we're here we did not have any ES conn issues
+        self.service_check(
+            self.SERVICE_CHECK_CONNECT_NAME,
+            AgentCheck.OK,
+            tags=config.service_check_tags
+        )
+
+    def _get_es_version(self, config):
+        """ Get the running version of elasticsearch.
+        """
+        try:
+            data = self._get_data(config.url, config, send_sc=False)
+            version = map(int, data['version']['number'].split('.')[0:3])
+        except Exception as e:
+            self.warning(
+                "Error while trying to get Elasticsearch version "
+                "from %s %s"
+                % (config.url, str(e))
+            )
+            version = [1, 0, 0]
+
+        self.service_metadata('version', version)
+        self.log.debug("Elasticsearch version is %s" % version)
+        return version
+
+    def _define_params(self, version, cluster_stats):
+        """ Define the set of URLs and METRICS to use depending on the
+        running ES version.
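+
+        For illustration only (version triples as produced by
+        _get_es_version; the endpoints follow the branches below):
+            [0, 90, 2]  -> stats from /_cluster/nodes/_local/stats?all=true
+            [0, 90, 13] -> stats from /_nodes/_local/stats?all=true
+            [1, 4, 5]   -> same stats URL, plus the circuit-breaker metrics
+            [2, 1, 1]   -> same stats URL, plus force_merge thread pool metrics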
+ """ + + pshard_stats_url = "/_stats" + + if version >= [0, 90, 10]: + # ES versions 0.90.10 and above + health_url = "/_cluster/health?pretty=true" + nodes_url = "/_nodes?network=true" + pending_tasks_url = "/_cluster/pending_tasks?pretty=true" + + # For "external" clusters, we want to collect from all nodes. + if cluster_stats: + stats_url = "/_nodes/stats?all=true" + else: + stats_url = "/_nodes/_local/stats?all=true" + + additional_metrics = self.JVM_METRICS_POST_0_90_10 + else: + health_url = "/_cluster/health?pretty=true" + nodes_url = "/_cluster/nodes?network=true" + pending_tasks_url = None + if cluster_stats: + stats_url = "/_cluster/nodes/stats?all=true" + else: + stats_url = "/_cluster/nodes/_local/stats?all=true" + + additional_metrics = self.JVM_METRICS_PRE_0_90_10 + + stats_metrics = dict(self.STATS_METRICS) + stats_metrics.update(additional_metrics) + + + # Additional Stats metrics # + if version >= [0, 90, 5]: + # ES versions 0.90.5 and above + additional_metrics = self.ADDITIONAL_METRICS_POST_0_90_5 + else: + # ES version 0.90.4 and below + additional_metrics = self.ADDITIONAL_METRICS_PRE_0_90_5 + + stats_metrics.update(additional_metrics) + + if version >= [1, 0, 0]: + stats_metrics.update(self.ADDITIONAL_METRICS_POST_1_0_0) + + if version < [2, 0, 0]: + stats_metrics.update(self.ADDITIONAL_METRICS_PRE_2_0) + if version >= [0, 90, 5]: + stats_metrics.update(self.ADDITIONAL_METRICS_POST_0_90_5_PRE_2_0) + if version >= [1, 0, 0]: + stats_metrics.update(self.ADDITIONAL_METRICS_1_x) + + if version >= [1, 3, 0]: + stats_metrics.update(self.ADDITIONAL_METRICS_POST_1_3_0) + + if version >= [1, 4, 0]: + # ES versions 1.4 and above + stats_metrics.update(self.ADDITIONAL_METRICS_POST_1_4_0) + + if version >= [1, 5, 0]: + stats_metrics.update(self.ADDITIONAL_METRICS_POST_1_5_0) + + if version >= [1, 6, 0]: + stats_metrics.update(self.ADDITIONAL_METRICS_POST_1_6_0) + + if version >= [2, 0, 0]: + stats_metrics.update(self.ADDITIONAL_METRICS_POST_2_0) + + if version >= [2, 1, 0]: + stats_metrics.update(self.ADDITIONAL_METRICS_POST_2_1) + + # Version specific stats metrics about the primary shards + pshard_stats_metrics = dict(self.PRIMARY_SHARD_METRICS) + + if version >= [1, 0, 0]: + additional_metrics = self.PRIMARY_SHARD_METRICS_POST_1_0 + + pshard_stats_metrics.update(additional_metrics) + + return health_url, nodes_url, stats_url, pshard_stats_url, pending_tasks_url, \ + stats_metrics, pshard_stats_metrics + + def _get_data(self, url, config, send_sc=True): + """ Hit a given URL and return the parsed json + """ + # Load basic authentication configuration, if available. + if config.username and config.password: + auth = (config.username, config.password) + else: + auth = None + + # Load SSL configuration, if available. 
+ # ssl_verify can be a bool or a string (http://docs.python-requests.org/en/latest/user/advanced/#ssl-cert-verification) + if isinstance(config.ssl_verify, bool) or isinstance(config.ssl_verify, str): + verify = config.ssl_verify + else: + verify = None + if config.ssl_cert and config.ssl_key: + cert = (config.ssl_cert, config.ssl_key) + elif config.ssl_cert: + cert = config.ssl_cert + else: + cert = None + + try: + resp = requests.get( + url, + timeout=config.timeout, + headers=headers(self.agentConfig), + auth=auth, + verify=verify, + cert=cert + ) + resp.raise_for_status() + except Exception as e: + if send_sc: + self.service_check( + self.SERVICE_CHECK_CONNECT_NAME, + AgentCheck.CRITICAL, + message="Error {0} when hitting {1}".format(e, url), + tags=config.service_check_tags + ) + raise + + return resp.json() + + def _process_pending_tasks_data(self, data, config): + p_tasks = defaultdict(int) + + for task in data.get('tasks', []): + p_tasks[task.get('priority')] += 1 + + node_data = { + 'pending_task_total': sum(p_tasks.values()), + 'pending_tasks_priority_high': p_tasks['high'], + 'pending_tasks_priority_urgent': p_tasks['urgent'], + } + + for metric in self.CLUSTER_PENDING_TASKS: + # metric description + desc = self.CLUSTER_PENDING_TASKS[metric] + self._process_metric(node_data, metric, *desc, tags=config.tags) + + def _process_stats_data(self, nodes_url, data, stats_metrics, config): + cluster_stats = config.cluster_stats + for node_data in data['nodes'].itervalues(): + metric_hostname = None + metrics_tags = list(config.tags) + + # Resolve the node's name + node_name = node_data.get('name') + if node_name: + metrics_tags.append( + u"node_name:{}".format(node_name) + ) + + # Resolve the node's hostname + if cluster_stats: + for k in ['hostname', 'host']: + if k in node_data: + metric_hostname = node_data[k] + break + + for metric, desc in stats_metrics.iteritems(): + self._process_metric( + node_data, metric, *desc, + tags=metrics_tags, hostname=metric_hostname + ) + + def _process_pshard_stats_data(self, data, config, pshard_stats_metrics): + for metric, desc in pshard_stats_metrics.iteritems(): + self._process_metric(data, metric, *desc, tags=config.tags) + + def _process_metric(self, data, metric, xtype, path, xform=None, + tags=None, hostname=None): + """data: dictionary containing all the stats + metric: datadog metric + path: corresponding path in data, flattened, e.g. 
thread_pool.bulk.queue
+        xform: a lambda to apply to the numerical value
+        """
+        value = data
+
+        # Traverse the nested dictionaries
+        for key in path.split('.'):
+            if value is not None:
+                value = value.get(key, None)
+            else:
+                break
+
+        if value is not None:
+            if xform:
+                value = xform(value)
+            if xtype == "gauge":
+                self.gauge(metric, value, tags=tags, hostname=hostname)
+            else:
+                self.rate(metric, value, tags=tags, hostname=hostname)
+        else:
+            self._metric_not_found(metric, path)
+
+    def _process_health_data(self, data, config):
+        if self.cluster_status.get(config.url) is None:
+            self.cluster_status[config.url] = data['status']
+            if data['status'] in ["yellow", "red"]:
+                event = self._create_event(data['status'], tags=config.tags)
+                self.event(event)
+
+        if data['status'] != self.cluster_status.get(config.url):
+            self.cluster_status[config.url] = data['status']
+            event = self._create_event(data['status'], tags=config.tags)
+            self.event(event)
+
+        for metric, desc in self.CLUSTER_HEALTH_METRICS.iteritems():
+            self._process_metric(data, metric, *desc, tags=config.tags)
+
+        # Process the service check
+        cluster_status = data['status']
+        if cluster_status == 'green':
+            status = AgentCheck.OK
+            data['tag'] = "OK"
+        elif cluster_status == 'yellow':
+            status = AgentCheck.WARNING
+            data['tag'] = "WARN"
+        else:
+            status = AgentCheck.CRITICAL
+            data['tag'] = "ALERT"
+
+        msg = "{tag} on cluster \"{cluster_name}\" "\
+              "| active_shards={active_shards} "\
+              "| initializing_shards={initializing_shards} "\
+              "| relocating_shards={relocating_shards} "\
+              "| unassigned_shards={unassigned_shards} "\
+              "| timed_out={timed_out}" \
+              .format(**data)
+
+        self.service_check(
+            self.SERVICE_CHECK_CLUSTER_STATUS,
+            status,
+            message=msg,
+            tags=config.service_check_tags
+        )
+
+    def _metric_not_found(self, metric, path):
+        self.log.debug("Metric not found: %s -> %s", path, metric)
+
+    def _create_event(self, status, tags=None):
+        hostname = self.hostname.decode('utf-8')
+        if status == "red":
+            alert_type = "error"
+            msg_title = "%s is %s" % (hostname, status)
+
+        elif status == "yellow":
+            alert_type = "warning"
+            msg_title = "%s is %s" % (hostname, status)
+
+        else:
+            # then it should be green
+            alert_type = "success"
+            msg_title = "%s recovered as %s" % (hostname, status)
+
+        msg = "ElasticSearch: %s just reported as %s" % (hostname, status)
+
+        return {
+            'timestamp': int(time.time()),
+            'event_type': 'elasticsearch',
+            'host': hostname,
+            'msg_text': msg,
+            'msg_title': msg_title,
+            'alert_type': alert_type,
+            'source_type_name': "elasticsearch",
+            'event_object': hostname,
+            'tags': tags
+        }

diff --git a/elastic/ci/elastic.rake b/elastic/ci/elastic.rake
new file mode 100644
index 0000000000000..af8f4d6fd4f1d
--- /dev/null
+++ b/elastic/ci/elastic.rake
@@ -0,0 +1,87 @@
+require 'ci/common'
+
+def elastic_version
+  ENV['FLAVOR_VERSION'] || '1.3.9'
+end
+
+def elastic_rootdir
+  "#{ENV['INTEGRATIONS_DIR']}/elastic_#{elastic_version}"
+end
+
container_name = 'dd-test-elastic'
container_port1 = 9200
container_port2 = 9300
+
namespace :ci do
+  namespace :elastic do |flavor|
+    task before_install: ['ci:common:before_install'] do
+      sh %(docker kill #{container_name} 2>/dev/null || true)
+      sh %(docker rm #{container_name} 2>/dev/null || true)
+    end
+
+    task install: ['ci:common:install'] do
+      use_venv = in_venv
+      install_requirements('elastic/requirements.txt',
+                           "--cache-dir #{ENV['PIP_CACHE']}",
+                           "#{ENV['VOLATILE_DIR']}/ci.log", use_venv)
+      docker_cmd = 'elasticsearch -Des.node.name="batman" '
+      if ['0.90.13',
'1.0.3', '1.1.2', '1.2.4'].any? { |v| v == elastic_version }
+        docker_image = "datadog/docker-library:elasticsearch_" + elastic_version.split('.')[0..1].join('_')
+        if elastic_version == '0.90.13'
+          docker_cmd += " -f"
+        end
+      else
+        docker_image = "elasticsearch:#{elastic_version}"
+      end
+      container_ports = "-p #{container_port1}:#{container_port1} -p #{container_port2}:#{container_port2}"
+      sh %(docker run -d #{container_ports} --name #{container_name} #{docker_image} #{docker_cmd})
+      if elastic_version[0].to_i < 2
+        ENV['DD_ELASTIC_LOCAL_HOSTNAME'] = `docker inspect dd-test-elastic | grep Id`[/([0-9a-f\.]{2,})/][0..11]
+      else
+        ENV['DD_ELASTIC_LOCAL_HOSTNAME'] = `docker inspect dd-test-elastic | grep IPAddress`[/([0-9\.]+)/]
+      end
+    end
+
+    task before_script: ['ci:common:before_script'] do
+      Wait.for 'http://localhost:9200', 20
+      # Create an index in ES
+      http = Net::HTTP.new('localhost', 9200)
+      http.send_request('PUT', '/datadog/')
+    end
+
+    task script: ['ci:common:script'] do
+      this_provides = [
+        'elastic'
+      ]
+      Rake::Task['ci:common:run_tests'].invoke(this_provides)
+    end
+
+    task before_cache: ['ci:common:before_cache']
+
+    task cleanup: ['ci:common:cleanup'] do
+      sh %(docker kill #{container_name} 2>/dev/null || true)
+      sh %(docker rm #{container_name} 2>/dev/null || true)
+    end
+
+    task :execute do
+      exception = nil
+      begin
+        %w(before_install install before_script).each do |u|
+          Rake::Task["#{flavor.scope.path}:#{u}"].invoke
+        end
+        Rake::Task["#{flavor.scope.path}:script"].invoke
+        Rake::Task["#{flavor.scope.path}:before_cache"].invoke
+      rescue => e
+        exception = e
+        puts "Failed task: #{e.class} #{e.message}".red
+      end
+      if ENV['SKIP_CLEANUP']
+        puts 'Skipping cleanup, disposable environments are great'.yellow
+      else
+        puts 'Cleaning up'
+        Rake::Task["#{flavor.scope.path}:cleanup"].invoke
+      end
+      raise exception if exception
+    end
+  end
+end

diff --git a/elastic/ci/elastic.yaml b/elastic/ci/elastic.yaml
new file mode 100644
index 0000000000000..942b563e1f2e7
--- /dev/null
+++ b/elastic/ci/elastic.yaml
@@ -0,0 +1,2 @@
+node:
+  name: batman

diff --git a/elastic/conf.yaml.example b/elastic/conf.yaml.example
new file mode 100644
index 0000000000000..bf0437824e77d
--- /dev/null
+++ b/elastic/conf.yaml.example
@@ -0,0 +1,43 @@
+init_config:
+
+instances:
+  # The URL where elasticsearch accepts HTTP requests. This will be used to
+  # fetch statistics from the nodes and information about the cluster health.
+  #
+  # If you're using basic authentication with a 3rd party library, for example
+  # elasticsearch-http-basic, you will need to specify a value for username
+  # and password for every instance that requires authentication.
+  #
+  # If your cluster is hosted externally (i.e., you're not pointing to localhost)
+  # you will need to set `cluster_stats` to true, otherwise the check will only
+  # submit metrics of the local node.
+  # DEPRECATION:
+  # This parameter was also called `is_external` and you can still use it but it
+  # will be removed in version 6.
+  #
+  # If you enable the "pshard_stats" flag, statistics over primary shards
+  # will be collected by the check and sent to the backend with the
+  # 'elasticsearch.primaries' prefix. It is particularly useful if you want to
+  # get certain metrics without taking replicas into account.
For instance,
+  # `elasticsearch.primaries.docs.count` will give you the total number of
+  # documents in your indexes WITHOUT counting duplicates due to the existence
+  # of replica shards in your ES cluster.
+
+  # `pending_task_stats` (defaults to True) specifies whether to collect data exposed
+  # by the `pending_tasks` cluster endpoint
+  # Ref: https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-pending.html
+  # Some managed Elasticsearch services (e.g. AWS Elasticsearch) do not expose this endpoint.
+  # Set `pending_task_stats` to false if you use such a service.
+
+  - url: http://localhost:9200
+    # username: username
+    # password: password
+    # cluster_stats: false
+    # pshard_stats: false
+    # pending_task_stats: true
+    # ssl_verify: false
+    # ssl_cert: /path/to/cert.pem
+    # ssl_key: /path/to/cert.key
+    # tags:
+    #   - 'tag1:key1'
+    #   - 'tag2:key2'

diff --git a/elastic/manifest.json b/elastic/manifest.json
new file mode 100644
index 0000000000000..5e3bc9e414355
--- /dev/null
+++ b/elastic/manifest.json
@@ -0,0 +1,10 @@
+{
+  "maintainer": "help@datadoghq.com",
+  "manifest_version": "0.1.0",
+  "max_agent_version": "6.0.0",
+  "min_agent_version": "5.6.3",
+  "name": "elastic",
+  "short_description": "Collect Elasticsearch metrics, events, and service checks.",
+  "support": "contrib",
+  "version": "0.1.0"
+}

diff --git a/elastic/metadata.csv b/elastic/metadata.csv
new file mode 100644
index 0000000000000..04a8203ad5a49
--- /dev/null
+++ b/elastic/metadata.csv
@@ -0,0 +1,164 @@
+metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name
+elasticsearch.active_primary_shards,gauge,,shard,,The number of active primary shards in the cluster.,0,elasticsearch,active primary shards
+elasticsearch.active_shards,gauge,,shard,,The number of active shards in the cluster.,0,elasticsearch,active shards
+elasticsearch.cache.field.evictions,gauge,,eviction,,The total number of evictions from the field data cache.,0,elasticsearch,field cache evictions
+elasticsearch.cache.field.size,gauge,,byte,,The size of the field cache.,0,elasticsearch,field cache size
+elasticsearch.cache.filter.count,gauge,,item,,The number of items in the filter cache.,0,elasticsearch,filter cache elements
+elasticsearch.cache.filter.evictions,gauge,,eviction,,The total number of evictions from the filter cache.,0,elasticsearch,filter cache evictions
+elasticsearch.cache.filter.size,gauge,,byte,,The size of the filter cache.,0,elasticsearch,filter cache size
+elasticsearch.cluster_status,gauge,,,,"The elasticsearch cluster health as a number: red = 0, yellow = 1, green = 2",0,elasticsearch,cluster status
+elasticsearch.docs.count,gauge,,document,,The total number of documents in the cluster across all shards.,0,elasticsearch,docs count
+elasticsearch.docs.deleted,gauge,,document,,The total number of documents deleted from the cluster across all shards.,0,elasticsearch,docs deleted
+elasticsearch.fielddata.evictions,gauge,,eviction,,The total number of evictions from the fielddata cache.,0,elasticsearch,fielddata cache evictions
+elasticsearch.fielddata.size,gauge,,byte,,The size of the fielddata cache.,0,elasticsearch,fielddata cache size
+elasticsearch.flush.total,gauge,,flush,,The total number of index flushes to disk since start.,0,elasticsearch,total index flushes
+elasticsearch.flush.total.time,gauge,,second,,The total time spent flushing the index to disk.,0,elasticsearch,total index flush time
+elasticsearch.fs.total.available_in_bytes,gauge,,byte,,The total number of bytes available to this Java
virtual machine on this file store.,0,elasticsearch,fs total available bytes +elasticsearch.fs.total.disk_io_op,gauge,,operation,,The total I/O operations on the file store.,0,elasticsearch,fs total i/o ops +elasticsearch.fs.total.disk_io_size_in_bytes,gauge,,byte,,Total bytes used for all I/O operations on the file store.,0,elasticsearch,fs total disk io bytes +elasticsearch.fs.total.disk_read_size_in_bytes,gauge,,byte,,The total bytes read from the file store.,0,elasticsearch,fs total bytes read +elasticsearch.fs.total.disk_reads,gauge,,read,,The total number of reads from the file store.,0,elasticsearch,fs total reads +elasticsearch.fs.total.disk_write_size_in_bytes,gauge,,byte,,The total bytes written to the file store.,0,elasticsearch,fs total bytes written +elasticsearch.fs.total.disk_writes,gauge,,write,,The total number of writes to the file store.,0,elasticsearch,fs total writes +elasticsearch.fs.total.free_in_bytes,gauge,,byte,,The total number of unallocated bytes in the file store.,0,elasticsearch,fs total free +elasticsearch.fs.total.total_in_bytes,gauge,,byte,,The total size in bytes of the file store.,0,elasticsearch,fs total size +elasticsearch.get.current,gauge,,request,,The number of get requests currently running.,0,elasticsearch,current gets +elasticsearch.get.exists.time,gauge,,second,,The total time spent on get requests where the document existed.,0,elasticsearch,get exists time +elasticsearch.get.exists.total,gauge,,request,,The total number of get requests where the document existed.,0,elasticsearch,get exists total +elasticsearch.get.missing.time,gauge,,second,,The total time spent on get requests where the document was missing.,0,elasticsearch,get missing time +elasticsearch.get.missing.total,gauge,,request,,The total number of get requests where the document was missing.,0,elasticsearch,get missing total +elasticsearch.get.time,gauge,,second,,The total time spent on get requests.,0,elasticsearch,total get request time +elasticsearch.get.total,gauge,,request,,The total number of get requests.,0,elasticsearch,total get requests +elasticsearch.http.current_open,gauge,,connection,,The number of current open HTTP connections.,0,elasticsearch,current open HTTP +elasticsearch.http.total_opened,gauge,,connection,,The total number of opened HTTP connections.,0,elasticsearch,total opened HTTP +elasticsearch.id_cache.size,gauge,,byte,,The size of the id cache,0,elasticsearch,id cache size +elasticsearch.indexing.delete.current,gauge,,document,,The number of documents currently being deleted from an index.,0,elasticsearch,current index deletes +elasticsearch.indexing.delete.time,gauge,,second,,The total time spent deleting documents from an index.,0,elasticsearch,total index delete time +elasticsearch.indexing.delete.total,gauge,,document,,The total number of documents deleted from an index.,0,elasticsearch,total index deletes +elasticsearch.indexing.index.current,gauge,,document,,The number of documents currently being indexed to an index.,0,elasticsearch,current indexing +elasticsearch.indexing.index.time,gauge,,second,,The total time spent indexing documents to an index.,0,elasticsearch,total indexing time +elasticsearch.indexing.index.total,gauge,,document,,The total number of documents indexed to an index.,0,elasticsearch,total indexed +elasticsearch.indices.segments.count,gauge,,segment,,The number of segments in an index shard.,0,elasticsearch,shard segments count +elasticsearch.indices.segments.fixed_bit_set_memory_in_bytes,gauge,,byte,,The size used by the in 
memory fixed bit set.,0,elasticsearch,fixed bit set memory
+elasticsearch.indices.segments.index_writer_max_memory_in_bytes,gauge,,byte,,The maximum memory used by the index writer.,0,elasticsearch,index writer max memory
+elasticsearch.indices.segments.index_writer_memory_in_bytes,gauge,,byte,,The memory used by the index writer.,0,elasticsearch,index writer memory
+elasticsearch.indices.segments.memory_in_bytes,gauge,,byte,,The memory used by index segments.,0,elasticsearch,index segments memory
+elasticsearch.indices.segments.version_map_memory_in_bytes,gauge,,byte,,The memory used by the segment version map.,0,elasticsearch,index segments version map memory
+elasticsearch.indices.translog.operations,gauge,,operation,,The number of operations in the transaction log.,0,elasticsearch,translog operations
+elasticsearch.indices.translog.size_in_bytes,gauge,,byte,,The size of the transaction log.,0,elasticsearch,translog size
+elasticsearch.initializing_shards,gauge,,shard,,The number of shards that are currently initializing.,0,elasticsearch,initializing shards
+elasticsearch.merges.current,gauge,,merge,,The number of currently active segment merges.,0,elasticsearch,current merges
+elasticsearch.merges.current.docs,gauge,,document,,The number of documents across segments currently being merged.,0,elasticsearch,current merge docs
+elasticsearch.merges.current.size,gauge,,byte,,The size of the segments currently being merged.,0,elasticsearch,current merge size
+elasticsearch.merges.total,gauge,,merge,,The total number of segment merges.,0,elasticsearch,total merges
+elasticsearch.merges.total.docs,gauge,,document,,The total number of documents across all merged segments.,0,elasticsearch,total merge docs
+elasticsearch.merges.total.size,gauge,,byte,,The total size of all merged segments.,0,elasticsearch,total merge size
+elasticsearch.merges.total.time,gauge,,second,,The total time spent on segment merging.,0,elasticsearch,total merge time
+elasticsearch.number_of_data_nodes,gauge,,node,,The number of data nodes in the cluster.,0,elasticsearch,number of data nodes
+elasticsearch.number_of_nodes,gauge,,node,,The total number of nodes in the cluster.,0,elasticsearch,number of nodes
+elasticsearch.pending_tasks_priority_high,gauge,,task,,The number of high priority pending tasks.,0,elasticsearch,pending high priority tasks
+elasticsearch.pending_tasks_priority_urgent,gauge,,task,,The number of urgent priority pending tasks.,0,elasticsearch,pending urgent tasks
+elasticsearch.pending_tasks_total,gauge,,task,,The total number of pending tasks.,0,elasticsearch,pending tasks
+elasticsearch.primaries.docs.count,gauge,,document,,The total number of documents in the primary shards.,0,elasticsearch,primary docs count
+elasticsearch.primaries.docs.deleted,gauge,,document,,The total number of documents deleted from the primary shards.,0,elasticsearch,primary docs deleted
+elasticsearch.primaries.flush.total,gauge,,flush,,The total number of index flushes to disk from the primary shards since start.,0,elasticsearch,primary flush total
+elasticsearch.primaries.flush.total.time,gauge,,second,,The total time spent flushing the index to disk from the primary shards.,0,elasticsearch,primary flush total time
+elasticsearch.primaries.get.current,gauge,,request,,The number of get requests currently running on the primary shards.,0,elasticsearch,primary current gets
+elasticsearch.primaries.get.exists.time,gauge,,second,,The total time spent on get requests from the primary shards where the document
existed.,0,elasticsearch,primary get exists time +elasticsearch.primaries.get.exists.total,gauge,,request,,The total number of get requests on primary shards where the document existed.,0,elasticsearch,primary get exists total +elasticsearch.primaries.get.missing.time,gauge,,second,,The total time spent on get requests from the primary shards where the document was missing.,0,elasticsearch,primary get missing time +elasticsearch.primaries.get.missing.total,gauge,,request,,The total number of get requests from the primary shards where the document was missing.,0,elasticsearch,primary get missing total +elasticsearch.primaries.get.time,gauge,,second,,The total time spent on get requests from the primary shards.,0,elasticsearch,primary get time +elasticsearch.primaries.get.total,gauge,,request,,The total number of get requests from the primary shards.,0,elasticsearch,primary get total +elasticsearch.primaries.indexing.delete.current,gauge,,document,,The number of documents currently being deleted from an index on the primary shards.,0,elasticsearch,primary delete current +elasticsearch.primaries.indexing.delete.time,gauge,,second,,The total time spent deleting documents from an index on the primary shards.,0,elasticsearch,primary delete time +elasticsearch.primaries.indexing.delete.total,gauge,,document,,The total number of documents deleted from an index on the primary shards.,0,elasticsearch,primary delete total +elasticsearch.primaries.indexing.index.current,gauge,,document,,The number of documents currently being indexed to an index on the primary shards.,0,elasticsearch,primary index current +elasticsearch.primaries.indexing.index.time,gauge,,second,,The total time spent indexing documents to an index on the primary shards.,0,elasticsearch,primary index time +elasticsearch.primaries.indexing.index.total,gauge,,document,,The total number of documents indexed to an index on the primary shards.,0,elasticsearch,primary index total +elasticsearch.primaries.merges.current,gauge,,merge,,The number of currently active segment merges on the primary shards.,0,elasticsearch,primary current merges +elasticsearch.primaries.merges.current.docs,gauge,,document,,The number of documents across segments currently being merged on the primary shards.,0,elasticsearch,primary current merge docs +elasticsearch.primaries.merges.current.size,gauge,,byte,,The size of the segments currently being merged on the primary shards.,0,elasticsearch,primary current merge size +elasticsearch.primaries.merges.total,gauge,,merge,,The total number of segment merges on the primary shards.,0,elasticsearch,primary merge total +elasticsearch.primaries.merges.total.docs,gauge,,document,,The total number of documents across all merged segments on the primary shards.,0,elasticsearch,primary merge total docs +elasticsearch.primaries.merges.total.size,gauge,,byte,,The total size of all merged segments on the primary shards.,0,elasticsearch,primary merge total size +elasticsearch.primaries.merges.total.time,gauge,,second,,The total time spent on segment merging on the primary shards.,0,elasticsearch,primary merge total time +elasticsearch.primaries.refresh.total,gauge,,refresh,,The total number of index refreshes on the primary shards.,0,elasticsearch,primary refresh total +elasticsearch.primaries.refresh.total.time,gauge,,second,,The total time spent on index refreshes on the primary shards.,0,elasticsearch,primary refresh total time +elasticsearch.primaries.search.fetch.current,gauge,,fetch,,The number of query fetches currently 
running on the primary shards.,0,elasticsearch,primary current fetches
+elasticsearch.primaries.search.fetch.time,gauge,,second,,The total time spent on query fetches on the primary shards.,0,elasticsearch,primary fetch total time
+elasticsearch.primaries.search.fetch.total,gauge,,fetch,,The total number of query fetches on the primary shards.,0,elasticsearch,primary fetch total
+elasticsearch.primaries.search.query.current,gauge,,query,,The number of currently active queries on the primary shards.,0,elasticsearch,primary current queries
+elasticsearch.primaries.search.query.time,gauge,,second,,The total time spent querying on the primary shards.,0,elasticsearch,primary total query time
+elasticsearch.primaries.search.query.total,gauge,,query,,The total number of queries to the primary shards.,0,elasticsearch,primary total queries
+elasticsearch.primaries.store.size,gauge,,byte,,The total size of all the primary shards.,0,elasticsearch,primary size
+elasticsearch.process.open_fd,gauge,,file,,"The number of opened file descriptors associated with the current process, or -1 if not supported.",0,elasticsearch,open file descriptors
+elasticsearch.refresh.total,gauge,,refresh,,The total number of index refreshes.,0,elasticsearch,total refreshes
+elasticsearch.refresh.total.time,gauge,,second,,The total time spent on index refreshes.,0,elasticsearch,total refresh time
+elasticsearch.relocating_shards,gauge,,shard,,The number of shards that are relocating from one node to another.,0,elasticsearch,relocating shards
+elasticsearch.search.fetch.current,gauge,,fetch,,The number of search fetches currently running.,0,elasticsearch,current fetches
+elasticsearch.search.fetch.open_contexts,gauge,,query,,The number of active searches.,0,elasticsearch,active searches
+elasticsearch.search.fetch.time,gauge,,second,,The total time spent on the search fetch.,0,elasticsearch,total fetch time
+elasticsearch.search.fetch.total,gauge,,fetch,,The total number of search fetches.,0,elasticsearch,total fetches
+elasticsearch.search.query.current,gauge,,query,,The number of currently active queries.,0,elasticsearch,current queries
+elasticsearch.search.query.time,gauge,,second,,The total time spent on queries.,0,elasticsearch,total query time
+elasticsearch.search.query.total,gauge,,query,,The total number of queries.,0,elasticsearch,total queries
+elasticsearch.store.size,gauge,,byte,,The total size in bytes of the store.,0,elasticsearch,store size
+elasticsearch.thread_pool.bulk.active,gauge,,thread,,The number of active threads in the bulk pool.,0,elasticsearch,active bulk threads
+elasticsearch.thread_pool.bulk.queue,gauge,,thread,,The number of queued threads in the bulk pool.,0,elasticsearch,queued bulk threads
+elasticsearch.thread_pool.bulk.threads,gauge,,thread,,The total number of threads in the bulk pool.,0,elasticsearch,total bulk threads
+elasticsearch.thread_pool.bulk.rejected,gauge,,thread,,The number of rejected threads in the bulk pool.,0,elasticsearch,rejected bulk threads
+elasticsearch.thread_pool.flush.active,gauge,,thread,,The number of active threads in the flush queue.,0,elasticsearch,active flush threads
+elasticsearch.thread_pool.flush.queue,gauge,,thread,,The number of queued threads in the flush pool.,0,elasticsearch,queued flush threads
+elasticsearch.thread_pool.flush.threads,gauge,,thread,,The total number of threads in the flush pool.,0,elasticsearch,total flush threads
+elasticsearch.thread_pool.generic.active,gauge,,thread,,The number of active threads in the generic
pool.,0,elasticsearch,active generic threads +elasticsearch.thread_pool.generic.queue,gauge,,thread,,The number of queued threads in the generic pool.,0,elasticsearch,queued generic threads +elasticsearch.thread_pool.generic.threads,gauge,,thread,,The total number of threads in the generic pool.,0,elasticsearch,total generic threads +elasticsearch.thread_pool.get.active,gauge,,thread,,The number of active threads in the get pool.,0,elasticsearch,active get threads +elasticsearch.thread_pool.get.queue,gauge,,thread,,The number of queued threads in the get pool.,0,elasticsearch,queued get threads +elasticsearch.thread_pool.get.threads,gauge,,thread,,The total number of threads in the get pool.,0,elasticsearch,total get threads +elasticsearch.thread_pool.index.active,gauge,,thread,,The number of active threads in the index pool.,0,elasticsearch,active index threads +elasticsearch.thread_pool.index.queue,gauge,,thread,,The number of queued threads in the index pool.,0,elasticsearch,queued index threads +elasticsearch.thread_pool.index.threads,gauge,,thread,,The total number of threads in the index pool.,0,elasticsearch,total index threads +elasticsearch.thread_pool.management.active,gauge,,thread,,The number of active threads in the management pool.,0,elasticsearch,active management threads +elasticsearch.thread_pool.management.queue,gauge,,thread,,The number of queued threads in the management pool.,0,elasticsearch,queued management threads +elasticsearch.thread_pool.management.threads,gauge,,thread,,The total number of threads in the management pool.,0,elasticsearch,total management threads +elasticsearch.thread_pool.merge.active,gauge,,thread,,The number of active threads in the merge pool.,0,elasticsearch,active merge threads +elasticsearch.thread_pool.merge.queue,gauge,,thread,,The number of queued threads in the merge pool.,0,elasticsearch,queued merge threads +elasticsearch.thread_pool.merge.threads,gauge,,thread,,The total number of threads in the merge pool.,0,elasticsearch,total merge threads +elasticsearch.thread_pool.percolate.active,gauge,,thread,,The number of active threads in the percolate pool.,0,elasticsearch,active percolate threads +elasticsearch.thread_pool.percolate.queue,gauge,,thread,,The number of queued threads in the percolate pool.,0,elasticsearch,queued percolate threads +elasticsearch.thread_pool.percolate.threads,gauge,,thread,,The total number of threads in the percolate pool.,0,elasticsearch,total percolate threads +elasticsearch.thread_pool.refresh.active,gauge,,thread,,The number of active threads in the refresh pool.,0,elasticsearch,active refresh threads +elasticsearch.thread_pool.refresh.queue,gauge,,thread,,The number of queued threads in the refresh pool.,0,elasticsearch,queued refresh threads +elasticsearch.thread_pool.refresh.threads,gauge,,thread,,The total number of threads in the refresh pool.,0,elasticsearch,total refresh threads +elasticsearch.thread_pool.search.active,gauge,,thread,,The number of active threads in the search pool.,0,elasticsearch,active search threads +elasticsearch.thread_pool.search.queue,gauge,,thread,,The number of queued threads in the search pool.,0,elasticsearch,queued search threads +elasticsearch.thread_pool.search.threads,gauge,,thread,,The total number of threads in the search pool.,0,elasticsearch,total search threads +elasticsearch.thread_pool.snapshot.active,gauge,,thread,,The number of active threads in the snapshot pool.,0,elasticsearch,active snapshot threads 
+elasticsearch.thread_pool.snapshot.queue,gauge,,thread,,The number of queued threads in the snapshot pool.,0,elasticsearch,queued snapshot threads +elasticsearch.thread_pool.snapshot.threads,gauge,,thread,,The total number of threads in the snapshot pool.,0,elasticsearch,total snapshot threads +elasticsearch.transport.rx_count,gauge,,packet,,The total number of packets received in cluster communication.,0,elasticsearch,transport packets received +elasticsearch.transport.rx_size,gauge,,byte,,The total size of data received in cluster communication.,0,elasticsearch,transport bytes received +elasticsearch.transport.server_open,gauge,,connection,,The number of connections opened for cluster communication.,0,elasticsearch,transport open connections +elasticsearch.transport.tx_count,gauge,,packet,,The total number of packets sent in cluster communication.,0,elasticsearch,transport packets sent +elasticsearch.transport.tx_size,gauge,,byte,,The total size of data sent in cluster communication.,0,elasticsearch,transport bytes sent +elasticsearch.unassigned_shards,gauge,,shard,,The number of shards that are unassigned to a node.,0,elasticsearch,unassigned shards +jvm.gc.collection_count,gauge,,garbage collection,,The total number of garbage collections run by the JVM.,0,elasticsearch,jvm gc count +jvm.gc.collection_time,gauge,,second,,The total time spent on garbage collection in the JVM.,0,elasticsearch,jvm gc time +jvm.gc.collectors.old.collection_time,gauge,,second,,The total time spent in major GCs in the JVM that collect old generation objects.,0,elasticsearch,jvm old gc time +jvm.gc.collectors.old.count,gauge,,garbage collection,,The total count of major GCs in the JVM that collect old generation objects.,0,elasticsearch,jvm old gc count +jvm.gc.collectors.young.collection_time,gauge,,second,,The total time spent in minor GCs in the JVM that collects young generation objects.,0,elasticsearch,jvm young gc time +jvm.gc.collectors.young.count,gauge,,garbage collection,,The total count of minor GCs in the JVM that collects young generation objects.,0,elasticsearch,jvm young gc count +jvm.gc.concurrent_mark_sweep.collection_time,gauge,,second,,"The total time spent on ""concurrent mark & sweep"" GCs in the JVM.",0,elasticsearch,jvm concurrent mark and sweep gc time +jvm.gc.concurrent_mark_sweep.count,gauge,,garbage collection,,"The total count of ""concurrent mark & sweep"" GCs in the JVM.",0,elasticsearch,jvm concurrent mark and sweep gc count +jvm.gc.par_new.collection_time,gauge,,second,,"The total time spent on ""parallel new"" GCs in the JVM.",0,elasticsearch,jvm parnew gc time +jvm.gc.par_new.count,gauge,,garbage collection,,"The total count of ""parallel new"" GCs in the JVM.",0,elasticsearch,jvm parnew gc count +jvm.mem.heap_committed,gauge,,byte,,The amount of memory guaranteed to be available to the JVM heap.,0,elasticsearch,jvm heap committed +jvm.mem.heap_in_use,gauge,,,,The amount of memory currently used by the JVM heap as a value between 0 and 1.,0,elasticsearch,jvm heap in use +jvm.mem.heap_max,gauge,,byte,,The maximum amount of memory that can be used by the JVM heap.,0,elasticsearch,jvm heap max +jvm.mem.heap_used,gauge,,byte,,The amount of memory in bytes currently used by the JVM heap.,0,elasticsearch,jvm heap used +jvm.mem.non_heap_committed,gauge,,byte,,The amount of memory guaranteed to be available to JVM non-heap.,0,elasticsearch,jvm non-heap committed +jvm.mem.non_heap_used,gauge,,byte,,The amount of memory in bytes currently used by the JVM non-heap.,0,elasticsearch,jvm 
non-heap used +jvm.threads.count,gauge,,thread,,The number of active threads in the JVM.,0,elasticsearch,jvm threads +jvm.threads.peak_count,gauge,,thread,,The peak number of threads used by the JVM.,0,elasticsearch,jvm peak threads diff --git a/elastic/requirements.txt b/elastic/requirements.txt new file mode 100644 index 0000000000000..f89ecf55da555 --- /dev/null +++ b/elastic/requirements.txt @@ -0,0 +1 @@ +# integration pip requirements diff --git a/elastic/test_elastic.py b/elastic/test_elastic.py new file mode 100644 index 0000000000000..a7fb4966b21f4 --- /dev/null +++ b/elastic/test_elastic.py @@ -0,0 +1,593 @@ +# (C) Datadog, Inc. 2010-2016 +# All rights reserved +# Licensed under Simplified BSD License (see LICENSE) + +# stdlib +import os +import time +import socket + +# 3p +from nose.plugins.attrib import attr +import requests + +# project +from config import get_version +from shared.test.common import AgentCheckTest, load_check + +# Clusterwise metrics, pre aggregated on ES, compatible with all ES versions +PRIMARY_SHARD_METRICS = { + "elasticsearch.primaries.docs.count": ("gauge", "_all.primaries.docs.count"), + "elasticsearch.primaries.docs.deleted": ("gauge", "_all.primaries.docs.deleted"), + "elasticsearch.primaries.store.size": ("gauge", "_all.primaries.store.size_in_bytes"), + "elasticsearch.primaries.indexing.index.total": ("gauge", "_all.primaries.indexing.index_total"), + "elasticsearch.primaries.indexing.index.time": ("gauge", "_all.primaries.indexing.index_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.indexing.index.current": ("gauge", "_all.primaries.indexing.index_current"), + "elasticsearch.primaries.indexing.delete.total": ("gauge", "_all.primaries.indexing.delete_total"), + "elasticsearch.primaries.indexing.delete.time": ("gauge", "_all.primaries.indexing.delete_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.indexing.delete.current": ("gauge", "_all.primaries.indexing.delete_current"), + "elasticsearch.primaries.get.total": ("gauge", "_all.primaries.get.total"), + "elasticsearch.primaries.get.time": ("gauge", "_all.primaries.get.time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.get.current": ("gauge", "_all.primaries.get.current"), + "elasticsearch.primaries.get.exists.total": ("gauge", "_all.primaries.get.exists_total"), + "elasticsearch.primaries.get.exists.time": ("gauge", "_all.primaries.get.exists_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.get.missing.total": ("gauge", "_all.primaries.get.missing_total"), + "elasticsearch.primaries.get.missing.time": ("gauge", "_all.primaries.get.missing_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.search.query.total": ("gauge", "_all.primaries.search.query_total"), + "elasticsearch.primaries.search.query.time": ("gauge", "_all.primaries.search.query_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.search.query.current": ("gauge", "_all.primaries.search.query_current"), + "elasticsearch.primaries.search.fetch.total": ("gauge", "_all.primaries.search.fetch_total"), + "elasticsearch.primaries.search.fetch.time": ("gauge", "_all.primaries.search.fetch_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.search.fetch.current": ("gauge", "_all.primaries.search.fetch_current") +} + +PRIMARY_SHARD_METRICS_POST_1_0 = { + "elasticsearch.primaries.merges.current": ("gauge", "_all.primaries.merges.current"), + "elasticsearch.primaries.merges.current.docs": ("gauge", 
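+
+# Each entry above maps a Datadog metric name to a tuple of
+# (submission_type, dotted_path_into_the_stats_payload[, transform]).
+# The helper below is an editorial sketch of how such a path resolves against
+# a stats payload; it is illustrative only and not used by the agent or tests.
+def _resolve_metric_path(payload, path, transform=None):
+    # Walk the dotted path one key at a time, e.g. "_all.primaries.docs.count".
+    value = payload
+    for key in path.split('.'):
+        value = value[key]
+    # "*_in_millis" values carry `lambda v: float(v)/1000` to report seconds.
+    return transform(value) if transform is not None else value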
"_all.primaries.merges.current_docs"), + "elasticsearch.primaries.merges.current.size": ("gauge", "_all.primaries.merges.current_size_in_bytes"), + "elasticsearch.primaries.merges.total": ("gauge", "_all.primaries.merges.total"), + "elasticsearch.primaries.merges.total.time": ("gauge", "_all.primaries.merges.total_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.merges.total.docs": ("gauge", "_all.primaries.merges.total_docs"), + "elasticsearch.primaries.merges.total.size": ("gauge", "_all.primaries.merges.total_size_in_bytes"), + "elasticsearch.primaries.refresh.total": ("gauge", "_all.primaries.refresh.total"), + "elasticsearch.primaries.refresh.total.time": ("gauge", "_all.primaries.refresh.total_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.primaries.flush.total": ("gauge", "_all.primaries.flush.total"), + "elasticsearch.primaries.flush.total.time": ("gauge", "_all.primaries.flush.total_time_in_millis", lambda v: float(v)/1000) +} + +STATS_METRICS = { # Metrics that are common to all Elasticsearch versions + "elasticsearch.docs.count": ("gauge", "indices.docs.count"), + "elasticsearch.docs.deleted": ("gauge", "indices.docs.deleted"), + "elasticsearch.store.size": ("gauge", "indices.store.size_in_bytes"), + "elasticsearch.indexing.index.total": ("gauge", "indices.indexing.index_total"), + "elasticsearch.indexing.index.time": ("gauge", "indices.indexing.index_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.indexing.index.current": ("gauge", "indices.indexing.index_current"), + "elasticsearch.indexing.delete.total": ("gauge", "indices.indexing.delete_total"), + "elasticsearch.indexing.delete.time": ("gauge", "indices.indexing.delete_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.indexing.delete.current": ("gauge", "indices.indexing.delete_current"), + "elasticsearch.get.total": ("gauge", "indices.get.total"), + "elasticsearch.get.time": ("gauge", "indices.get.time_in_millis", lambda v: float(v)/1000), + "elasticsearch.get.current": ("gauge", "indices.get.current"), + "elasticsearch.get.exists.total": ("gauge", "indices.get.exists_total"), + "elasticsearch.get.exists.time": ("gauge", "indices.get.exists_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.get.missing.total": ("gauge", "indices.get.missing_total"), + "elasticsearch.get.missing.time": ("gauge", "indices.get.missing_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.search.query.total": ("gauge", "indices.search.query_total"), + "elasticsearch.search.query.time": ("gauge", "indices.search.query_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.search.query.current": ("gauge", "indices.search.query_current"), + "elasticsearch.search.fetch.total": ("gauge", "indices.search.fetch_total"), + "elasticsearch.search.fetch.time": ("gauge", "indices.search.fetch_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.search.fetch.current": ("gauge", "indices.search.fetch_current"), + "elasticsearch.indices.segments.count": ("gauge", "indices.segments.count"), + "elasticsearch.indices.segments.memory_in_bytes": ("gauge", "indices.segments.memory_in_bytes"), + "elasticsearch.merges.current": ("gauge", "indices.merges.current"), + "elasticsearch.merges.current.docs": ("gauge", "indices.merges.current_docs"), + "elasticsearch.merges.current.size": ("gauge", "indices.merges.current_size_in_bytes"), + "elasticsearch.merges.total": ("gauge", "indices.merges.total"), + "elasticsearch.merges.total.time": ("gauge", "indices.merges.total_time_in_millis", 
lambda v: float(v)/1000), + "elasticsearch.merges.total.docs": ("gauge", "indices.merges.total_docs"), + "elasticsearch.merges.total.size": ("gauge", "indices.merges.total_size_in_bytes"), + "elasticsearch.refresh.total": ("gauge", "indices.refresh.total"), + "elasticsearch.refresh.total.time": ("gauge", "indices.refresh.total_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.flush.total": ("gauge", "indices.flush.total"), + "elasticsearch.flush.total.time": ("gauge", "indices.flush.total_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.process.open_fd": ("gauge", "process.open_file_descriptors"), + "elasticsearch.transport.rx_count": ("gauge", "transport.rx_count"), + "elasticsearch.transport.tx_count": ("gauge", "transport.tx_count"), + "elasticsearch.transport.rx_size": ("gauge", "transport.rx_size_in_bytes"), + "elasticsearch.transport.tx_size": ("gauge", "transport.tx_size_in_bytes"), + "elasticsearch.transport.server_open": ("gauge", "transport.server_open"), + "elasticsearch.thread_pool.bulk.active": ("gauge", "thread_pool.bulk.active"), + "elasticsearch.thread_pool.bulk.threads": ("gauge", "thread_pool.bulk.threads"), + "elasticsearch.thread_pool.bulk.queue": ("gauge", "thread_pool.bulk.queue"), + "elasticsearch.thread_pool.bulk.rejected": ("rate", "thread_pool.bulk.rejected"), + "elasticsearch.thread_pool.flush.active": ("gauge", "thread_pool.flush.active"), + "elasticsearch.thread_pool.flush.threads": ("gauge", "thread_pool.flush.threads"), + "elasticsearch.thread_pool.flush.queue": ("gauge", "thread_pool.flush.queue"), + "elasticsearch.thread_pool.flush.rejected": ("rate", "thread_pool.flush.rejected"), + "elasticsearch.thread_pool.generic.active": ("gauge", "thread_pool.generic.active"), + "elasticsearch.thread_pool.generic.threads": ("gauge", "thread_pool.generic.threads"), + "elasticsearch.thread_pool.generic.queue": ("gauge", "thread_pool.generic.queue"), + "elasticsearch.thread_pool.generic.rejected": ("rate", "thread_pool.generic.rejected"), + "elasticsearch.thread_pool.get.active": ("gauge", "thread_pool.get.active"), + "elasticsearch.thread_pool.get.threads": ("gauge", "thread_pool.get.threads"), + "elasticsearch.thread_pool.get.queue": ("gauge", "thread_pool.get.queue"), + "elasticsearch.thread_pool.get.rejected": ("rate", "thread_pool.get.rejected"), + "elasticsearch.thread_pool.index.active": ("gauge", "thread_pool.index.active"), + "elasticsearch.thread_pool.index.threads": ("gauge", "thread_pool.index.threads"), + "elasticsearch.thread_pool.index.queue": ("gauge", "thread_pool.index.queue"), + "elasticsearch.thread_pool.index.rejected": ("rate", "thread_pool.index.rejected"), + "elasticsearch.thread_pool.management.active": ("gauge", "thread_pool.management.active"), + "elasticsearch.thread_pool.management.threads": ("gauge", "thread_pool.management.threads"), + "elasticsearch.thread_pool.management.queue": ("gauge", "thread_pool.management.queue"), + "elasticsearch.thread_pool.management.rejected": ("rate", "thread_pool.management.rejected"), + "elasticsearch.thread_pool.percolate.active": ("gauge", "thread_pool.percolate.active"), + "elasticsearch.thread_pool.percolate.threads": ("gauge", "thread_pool.percolate.threads"), + "elasticsearch.thread_pool.percolate.queue": ("gauge", "thread_pool.percolate.queue"), + "elasticsearch.thread_pool.percolate.rejected": ("rate", "thread_pool.percolate.rejected"), + "elasticsearch.thread_pool.refresh.active": ("gauge", "thread_pool.refresh.active"), + "elasticsearch.thread_pool.refresh.threads": ("gauge", 
"thread_pool.refresh.threads"), + "elasticsearch.thread_pool.refresh.queue": ("gauge", "thread_pool.refresh.queue"), + "elasticsearch.thread_pool.refresh.rejected": ("rate", "thread_pool.refresh.rejected"), + "elasticsearch.thread_pool.search.active": ("gauge", "thread_pool.search.active"), + "elasticsearch.thread_pool.search.threads": ("gauge", "thread_pool.search.threads"), + "elasticsearch.thread_pool.search.queue": ("gauge", "thread_pool.search.queue"), + "elasticsearch.thread_pool.search.rejected": ("rate", "thread_pool.search.rejected"), + "elasticsearch.thread_pool.snapshot.active": ("gauge", "thread_pool.snapshot.active"), + "elasticsearch.thread_pool.snapshot.threads": ("gauge", "thread_pool.snapshot.threads"), + "elasticsearch.thread_pool.snapshot.queue": ("gauge", "thread_pool.snapshot.queue"), + "elasticsearch.thread_pool.snapshot.rejected": ("rate", "thread_pool.snapshot.rejected"), + "elasticsearch.thread_pool.suggest.active": ("gauge", "thread_pool.suggest.active"), + "elasticsearch.thread_pool.suggest.threads": ("gauge", "thread_pool.suggest.threads"), + "elasticsearch.thread_pool.suggest.queue": ("gauge", "thread_pool.suggest.queue"), + "elasticsearch.thread_pool.suggest.rejected": ("rate", "thread_pool.suggest.rejected"), + "elasticsearch.thread_pool.warmer.active": ("gauge", "thread_pool.warmer.active"), + "elasticsearch.thread_pool.warmer.threads": ("gauge", "thread_pool.warmer.threads"), + "elasticsearch.thread_pool.warmer.queue": ("gauge", "thread_pool.warmer.queue"), + "elasticsearch.thread_pool.warmer.rejected": ("rate", "thread_pool.warmer.rejected"), + "elasticsearch.http.current_open": ("gauge", "http.current_open"), + "elasticsearch.http.total_opened": ("gauge", "http.total_opened"), + "jvm.mem.heap_committed": ("gauge", "jvm.mem.heap_committed_in_bytes"), + "jvm.mem.heap_used": ("gauge", "jvm.mem.heap_used_in_bytes"), + "jvm.mem.heap_in_use": ("gauge", "jvm.mem.heap_used_percent"), + "jvm.mem.heap_max": ("gauge", "jvm.mem.heap_max_in_bytes"), + "jvm.mem.non_heap_committed": ("gauge", "jvm.mem.non_heap_committed_in_bytes"), + "jvm.mem.non_heap_used": ("gauge", "jvm.mem.non_heap_used_in_bytes"), + "jvm.threads.count": ("gauge", "jvm.threads.count"), + "jvm.threads.peak_count": ("gauge", "jvm.threads.peak_count"), + "elasticsearch.fs.total.total_in_bytes": ("gauge", "fs.total.total_in_bytes"), + "elasticsearch.fs.total.free_in_bytes": ("gauge", "fs.total.free_in_bytes"), + "elasticsearch.fs.total.available_in_bytes": ("gauge", "fs.total.available_in_bytes"), +} + +JVM_METRICS_POST_0_90_10 = { + "jvm.gc.collectors.young.count": ("gauge", "jvm.gc.collectors.young.collection_count"), + "jvm.gc.collectors.young.collection_time": ("gauge", "jvm.gc.collectors.young.collection_time_in_millis", lambda v: float(v)/1000), + "jvm.gc.collectors.old.count": ("gauge", "jvm.gc.collectors.old.collection_count"), + "jvm.gc.collectors.old.collection_time": ("gauge", "jvm.gc.collectors.old.collection_time_in_millis", lambda v: float(v)/1000) +} + +JVM_METRICS_PRE_0_90_10 = { + "jvm.gc.concurrent_mark_sweep.count": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_count"), + "jvm.gc.concurrent_mark_sweep.collection_time": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_time_in_millis", lambda v: float(v)/1000), + "jvm.gc.par_new.count": ("gauge", "jvm.gc.collectors.ParNew.collection_count"), + "jvm.gc.par_new.collection_time": ("gauge", "jvm.gc.collectors.ParNew.collection_time_in_millis", lambda v: float(v)/1000), + "jvm.gc.collection_count": ("gauge", 
"jvm.gc.collection_count"), + "jvm.gc.collection_time": ("gauge", "jvm.gc.collection_time_in_millis", lambda v: float(v)/1000), +} + +ADDITIONAL_METRICS_POST_0_90_5 = { + "elasticsearch.search.fetch.open_contexts": ("gauge", "indices.search.open_contexts"), + "elasticsearch.fielddata.size": ("gauge", "indices.fielddata.memory_size_in_bytes"), + "elasticsearch.fielddata.evictions": ("gauge", "indices.fielddata.evictions"), +} + +ADDITIONAL_METRICS_POST_0_90_5_PRE_2_0 = { + "elasticsearch.cache.filter.evictions": ("gauge", "indices.filter_cache.evictions"), + "elasticsearch.cache.filter.size": ("gauge", "indices.filter_cache.memory_size_in_bytes"), + "elasticsearch.id_cache.size": ("gauge", "indices.id_cache.memory_size_in_bytes"), +} + +ADDITIONAL_METRICS_PRE_0_90_5 = { + "elasticsearch.cache.field.evictions": ("gauge", "indices.cache.field_evictions"), + "elasticsearch.cache.field.size": ("gauge", "indices.cache.field_size_in_bytes"), + "elasticsearch.cache.filter.count": ("gauge", "indices.cache.filter_count"), + "elasticsearch.cache.filter.evictions": ("gauge", "indices.cache.filter_evictions"), + "elasticsearch.cache.filter.size": ("gauge", "indices.cache.filter_size_in_bytes"), +} + +ADDITIONAL_METRICS_POST_1_0_0 = { + "elasticsearch.indices.translog.size_in_bytes": ("gauge", "indices.translog.size_in_bytes"), + "elasticsearch.indices.translog.operations": ("gauge", "indices.translog.operations"), +} + +ADDITIONAL_METRICS_1_x = { + # Currently has issues in test framework: + # "elasticsearch.fs.total.disk_reads": ("rate", "fs.total.disk_reads"), + # "elasticsearch.fs.total.disk_writes": ("rate", "fs.total.disk_writes"), + # "elasticsearch.fs.total.disk_io_op": ("rate", "fs.total.disk_io_op"), + # "elasticsearch.fs.total.disk_read_size_in_bytes": ("gauge", "fs.total.disk_read_size_in_bytes"), + # "elasticsearch.fs.total.disk_write_size_in_bytes": ("gauge", "fs.total.disk_write_size_in_bytes"), + # "elasticsearch.fs.total.disk_io_size_in_bytes": ("gauge", "fs.total.disk_io_size_in_bytes"), +} + +ADDITIONAL_METRICS_POST_1_3_0 = { + "elasticsearch.indices.segments.index_writer_memory_in_bytes": ("gauge", "indices.segments.index_writer_memory_in_bytes"), + "elasticsearch.indices.segments.version_map_memory_in_bytes": ("gauge", "indices.segments.version_map_memory_in_bytes"), +} + +ADDITIONAL_METRICS_POST_1_4_0 = { + "elasticsearch.indices.indexing.throttle_time": ("rate", "indices.indexing.throttle_time_in_millis", lambda v: float(v)/1000), + "elasticsearch.indices.query_cache.memory_size_in_bytes": ("gauge", "indices.query_cache.memory_size_in_bytes"), + "elasticsearch.indices.query_cache.hit_count": ("rate", "indices.query_cache.hit_count"), + "elasticsearch.indices.query_cache.miss_count": ("rate", "indices.query_cache.miss_count"), + "elasticsearch.indices.query_cache.evictions": ("rate", "indices.query_cache.evictions"), + "elasticsearch.indices.segments.index_writer_max_memory_in_bytes": ("gauge", "indices.segments.index_writer_max_memory_in_bytes"), + "elasticsearch.indices.segments.fixed_bit_set_memory_in_bytes": ("gauge", "indices.segments.fixed_bit_set_memory_in_bytes"), + "elasticsearch.breakers.fielddata.estimated_size_in_bytes": ("gauge", "breakers.fielddata.estimated_size_in_bytes"), + "elasticsearch.breakers.fielddata.overhead": ("gauge", "breakers.fielddata.overhead"), + "elasticsearch.breakers.fielddata.tripped": ("rate", "breakers.fielddata.tripped"), + "elasticsearch.breakers.parent.estimated_size_in_bytes": ("gauge", "breakers.parent.estimated_size_in_bytes"), + 
"elasticsearch.breakers.parent.overhead": ("gauge", "breakers.parent.overhead"), + "elasticsearch.breakers.parent.tripped": ("rate", "breakers.parent.tripped"), + "elasticsearch.breakers.request.estimated_size_in_bytes": ("gauge", "breakers.request.estimated_size_in_bytes"), + "elasticsearch.breakers.request.overhead": ("gauge", "breakers.request.overhead"), + "elasticsearch.breakers.request.tripped": ("rate", "breakers.request.tripped"), + "elasticsearch.thread_pool.listener.active": ("gauge", "thread_pool.listener.active"), + "elasticsearch.thread_pool.listener.threads": ("gauge", "thread_pool.listener.threads"), + "elasticsearch.thread_pool.listener.queue": ("gauge", "thread_pool.listener.queue"), + "elasticsearch.thread_pool.listener.rejected": ("rate", "thread_pool.listener.rejected"), +} + +ADDITIONAL_METRICS_POST_1_5_0 = { + "elasticsearch.indices.recovery.current_as_source": ("gauge", "indices.recovery.current_as_source"), + "elasticsearch.indices.recovery.current_as_target": ("gauge", "indices.recovery.current_as_target"), + "elasticsearch.indices.recovery.throttle_time": ("rate", "indices.recovery.throttle_time_in_millis", lambda v: float(v)/1000), +} + +ADDITIONAL_METRICS_POST_1_6_0 = { + "elasticsearch.thread_pool.fetch_shard_started.active": ("gauge", "thread_pool.fetch_shard_started.active"), + "elasticsearch.thread_pool.fetch_shard_started.threads": ("gauge", "thread_pool.fetch_shard_started.threads"), + "elasticsearch.thread_pool.fetch_shard_started.queue": ("gauge", "thread_pool.fetch_shard_started.queue"), + "elasticsearch.thread_pool.fetch_shard_started.rejected": ("rate", "thread_pool.fetch_shard_started.rejected"), + "elasticsearch.thread_pool.fetch_shard_store.active": ("gauge", "thread_pool.fetch_shard_store.active"), + "elasticsearch.thread_pool.fetch_shard_store.threads": ("gauge", "thread_pool.fetch_shard_store.threads"), + "elasticsearch.thread_pool.fetch_shard_store.queue": ("gauge", "thread_pool.fetch_shard_store.queue"), + "elasticsearch.thread_pool.fetch_shard_store.rejected": ("rate", "thread_pool.fetch_shard_store.rejected"), +} + +ADDITIONAL_METRICS_PRE_2_0 = { + "elasticsearch.thread_pool.merge.active": ("gauge", "thread_pool.merge.active"), + "elasticsearch.thread_pool.merge.threads": ("gauge", "thread_pool.merge.threads"), + "elasticsearch.thread_pool.merge.queue": ("gauge", "thread_pool.merge.queue"), + "elasticsearch.thread_pool.merge.rejected": ("rate", "thread_pool.merge.rejected"), +} + +ADDITIONAL_METRICS_POST_2_0 = { + "elasticsearch.indices.query_cache.cache_size": ("gauge", "indices.query_cache.cache_size"), + "elasticsearch.indices.query_cache.cache_count": ("rate", "indices.query_cache.cache_count"), + "elasticsearch.indices.query_cache.total_count": ("rate", "indices.query_cache.total_count"), + "elasticsearch.indices.segments.doc_values_memory_in_bytes": ("gauge", "indices.segments.doc_values_memory_in_bytes"), + "elasticsearch.indices.segments.norms_memory_in_bytes": ("gauge", "indices.segments.norms_memory_in_bytes"), + "elasticsearch.indices.segments.stored_fields_memory_in_bytes": ("gauge", "indices.segments.stored_fields_memory_in_bytes"), + "elasticsearch.indices.segments.term_vectors_memory_in_bytes": ("gauge", "indices.segments.term_vectors_memory_in_bytes"), + "elasticsearch.indices.segments.terms_memory_in_bytes": ("gauge", "indices.segments.terms_memory_in_bytes"), + "elasticsearch.indices.request_cache.memory_size_in_bytes": ("gauge", "indices.request_cache.memory_size_in_bytes"), + 
"elasticsearch.indices.request_cache.evictions": ("rate", "indices.request_cache.evictions"), + "elasticsearch.indices.request_cache.hit_count": ("rate", "indices.request_cache.hit_count"), + "elasticsearch.indices.request_cache.miss_count": ("rate", "indices.request_cache.miss_count"), +} + +ADDITIONAL_METRICS_POST_2_1 = { + "elasticsearch.indices.indexing.index_failed": ("rate", "indices.indexing.index_failed"), + "elasticsearch.thread_pool.force_merge.active": ("gauge", "thread_pool.force_merge.active"), + "elasticsearch.thread_pool.force_merge.threads": ("gauge", "thread_pool.force_merge.threads"), + "elasticsearch.thread_pool.force_merge.queue": ("gauge", "thread_pool.force_merge.queue"), + "elasticsearch.thread_pool.force_merge.rejected": ("rate", "thread_pool.force_merge.rejected"), +} + +CLUSTER_HEALTH_METRICS = { + "elasticsearch.number_of_nodes": ("gauge", "number_of_nodes"), + "elasticsearch.number_of_data_nodes": ("gauge", "number_of_data_nodes"), + "elasticsearch.active_primary_shards": ("gauge", "active_primary_shards"), + "elasticsearch.active_shards": ("gauge", "active_shards"), + "elasticsearch.relocating_shards": ("gauge", "relocating_shards"), + "elasticsearch.initializing_shards": ("gauge", "initializing_shards"), + "elasticsearch.unassigned_shards": ("gauge", "unassigned_shards"), + "elasticsearch.cluster_status": ("gauge", "status", lambda v: {"red": 0, "yellow": 1, "green": 2}.get(v, -1)), +} + +CLUSTER_PENDING_TASKS = { + "elasticsearch.pending_tasks_total": ("gauge", "pending_task_total"), + "elasticsearch.pending_tasks_priority_high": ("gauge", "pending_tasks_priority_high"), + "elasticsearch.pending_tasks_priority_urgent": ("gauge", "pending_tasks_priority_urgent") +} + + +def get_es_version(): + version = os.environ.get("FLAVOR_VERSION") + if version is None: + return [1, 6, 0] + return [int(k) for k in version.split(".")] + + +@attr(requires='elastic') +class TestElastic(AgentCheckTest): + CHECK_NAME = "elastic" + + def test_check(self): + conf_hostname = "foo" + port = 9200 + bad_port = 9405 + agent_config = { + "hostname": conf_hostname, "version": get_version(), + "api_key": "bar" + } + + tags = [u"foo:bar", u"baz"] + cluster_tag = [u"cluster_name:elasticsearch"] + url = 'http://localhost:{0}'.format(port) + bad_url = 'http://localhost:{0}'.format(bad_port) + + config = { + 'instances': [ + {'url': url, 'tags': tags}, # One with tags not external + {'url': url, 'cluster_stats': True}, # One without tags, external + {'url': bad_url}, # One bad url + ] + } + + self.assertRaises( + requests.exceptions.ConnectionError, + self.run_check, config=config, agent_config=agent_config) + + default_tags = ["url:http://localhost:{0}".format(port)] + + expected_metrics = dict(STATS_METRICS) + CLUSTER_HEALTH_METRICS.update(CLUSTER_PENDING_TASKS) + expected_metrics.update(CLUSTER_HEALTH_METRICS) + + instance_config = self.check.get_instance_config(config['instances'][0]) + es_version = self.check._get_es_version(instance_config) + + self.assertEquals(es_version, get_es_version()) + + if es_version >= [0, 90, 5]: + expected_metrics.update(ADDITIONAL_METRICS_POST_0_90_5) + if es_version >= [0, 90, 10]: + expected_metrics.update(JVM_METRICS_POST_0_90_10) + else: + expected_metrics.update(JVM_METRICS_PRE_0_90_10) + else: + expected_metrics.update(ADDITIONAL_METRICS_PRE_0_90_5) + expected_metrics.update(JVM_METRICS_PRE_0_90_10) + + if es_version >= [1, 0, 0]: + expected_metrics.update(ADDITIONAL_METRICS_POST_1_0_0) + + if es_version < [2, 0, 0]: + 
+
+
+@attr(requires='elastic')
+class TestElastic(AgentCheckTest):
+    CHECK_NAME = "elastic"
+
+    def test_check(self):
+        conf_hostname = "foo"
+        port = 9200
+        bad_port = 9405
+        agent_config = {
+            "hostname": conf_hostname, "version": get_version(),
+            "api_key": "bar"
+        }
+
+        tags = [u"foo:bar", u"baz"]
+        cluster_tag = [u"cluster_name:elasticsearch"]
+        url = 'http://localhost:{0}'.format(port)
+        bad_url = 'http://localhost:{0}'.format(bad_port)
+
+        config = {
+            'instances': [
+                {'url': url, 'tags': tags},  # One with tags, not external
+                {'url': url, 'cluster_stats': True},  # One without tags, external
+                {'url': bad_url},  # One bad url
+            ]
+        }
+
+        self.assertRaises(
+            requests.exceptions.ConnectionError,
+            self.run_check, config=config, agent_config=agent_config)
+
+        default_tags = ["url:http://localhost:{0}".format(port)]
+
+        expected_metrics = dict(STATS_METRICS)
+        # Fold the pending-task metrics into the cluster health set, so the
+        # stats_keys subtraction below excludes both in one pass.
+        CLUSTER_HEALTH_METRICS.update(CLUSTER_PENDING_TASKS)
+        expected_metrics.update(CLUSTER_HEALTH_METRICS)
+
+        instance_config = self.check.get_instance_config(config['instances'][0])
+        es_version = self.check._get_es_version(instance_config)
+
+        self.assertEquals(es_version, get_es_version())
+
+        if es_version >= [0, 90, 5]:
+            expected_metrics.update(ADDITIONAL_METRICS_POST_0_90_5)
+            if es_version >= [0, 90, 10]:
+                expected_metrics.update(JVM_METRICS_POST_0_90_10)
+            else:
+                expected_metrics.update(JVM_METRICS_PRE_0_90_10)
+        else:
+            expected_metrics.update(ADDITIONAL_METRICS_PRE_0_90_5)
+            expected_metrics.update(JVM_METRICS_PRE_0_90_10)
+
+        if es_version >= [1, 0, 0]:
+            expected_metrics.update(ADDITIONAL_METRICS_POST_1_0_0)
+
+        if es_version < [2, 0, 0]:
+            expected_metrics.update(ADDITIONAL_METRICS_PRE_2_0)
+            if es_version >= [0, 90, 5]:
+                expected_metrics.update(ADDITIONAL_METRICS_POST_0_90_5_PRE_2_0)
+            if es_version >= [1, 0, 0]:
+                expected_metrics.update(ADDITIONAL_METRICS_1_x)
+
+        if es_version >= [1, 3, 0]:
+            expected_metrics.update(ADDITIONAL_METRICS_POST_1_3_0)
+
+        if es_version >= [1, 4, 0]:
+            expected_metrics.update(ADDITIONAL_METRICS_POST_1_4_0)
+
+        if es_version >= [1, 5, 0]:
+            expected_metrics.update(ADDITIONAL_METRICS_POST_1_5_0)
+
+        if es_version >= [1, 6, 0]:
+            expected_metrics.update(ADDITIONAL_METRICS_POST_1_6_0)
+
+        if es_version >= [2, 0, 0]:
+            expected_metrics.update(ADDITIONAL_METRICS_POST_2_0)
+
+        if es_version >= [2, 1, 0]:
+            expected_metrics.update(ADDITIONAL_METRICS_POST_2_1)
+
+        if os.environ.get("DD_ELASTIC_LOCAL_HOSTNAME"):
+            local_hostname = os.environ.get("DD_ELASTIC_LOCAL_HOSTNAME")
+        elif es_version < [2, 0, 0]:
+            local_hostname = socket.gethostname()
+        else:
+            local_hostname = '127.0.0.1'
+
+        contexts = [
+            (conf_hostname, default_tags + tags),
+            (local_hostname, default_tags)
+        ]
+
+        stats_keys = (
+            set(expected_metrics.keys()) - set(CLUSTER_HEALTH_METRICS.keys()) -
+            set(CLUSTER_PENDING_TASKS.keys())
+        )
+
+        for m_name, desc in expected_metrics.iteritems():
+            for hostname, m_tags in contexts:
+                m_tags = m_tags + cluster_tag
+                if (m_name in CLUSTER_HEALTH_METRICS and
+                        hostname == local_hostname):
+                    hostname = conf_hostname
+
+                if m_name in stats_keys:
+                    m_tags = m_tags + [u"node_name:batman"]
+
+                if desc[0] == "gauge":
+                    self.assertMetric(
+                        m_name, tags=m_tags, count=1, hostname=hostname)
+
+        good_sc_tags = ['host:localhost', 'port:{0}'.format(port)]
+        bad_sc_tags = ['host:localhost', 'port:{0}'.format(bad_port)]
+
+        self.assertServiceCheckOK('elasticsearch.can_connect',
+                                  tags=good_sc_tags + tags,
+                                  count=1)
+        self.assertServiceCheckOK('elasticsearch.can_connect',
+                                  tags=good_sc_tags,
+                                  count=1)
+        self.assertServiceCheckCritical('elasticsearch.can_connect',
+                                        tags=bad_sc_tags,
+                                        count=1)
+
+        # Assert service metadata
+        self.assertServiceMetadata(['version'], count=3)
+
+        # FIXME: 0.90.13 randomly returns a red status instead of yellow,
+        # so we don't do a coverage test for it.
+        # Remove me when we stop supporting 0.90.x (no longer supported by ES).
+        if get_es_version() != [0, 90, 13]:
+            # Warning because the elasticsearch status should be yellow, according to
+            # http://chrissimpson.co.uk/elasticsearch-yellow-cluster-status-explained.html
+            self.assertServiceCheckWarning('elasticsearch.cluster_health',
+                                           tags=good_sc_tags + tags,
+                                           count=1)
+            self.assertServiceCheckWarning('elasticsearch.cluster_health',
+                                           tags=good_sc_tags,
+                                           count=1)
+            # Assert event
+            self.assertEvent('ElasticSearch: foo just reported as yellow', count=1,
+                             tags=default_tags + tags + cluster_tag,
+                             msg_title='foo is yellow',
+                             event_type='elasticsearch', alert_type='warning',
+                             source_type_name='elasticsearch')
+
+        self.coverage_report()
+
+    def test_config_parser(self):
+        check = load_check(self.CHECK_NAME, {}, {})
+        instance = {
+            "username": "user",
+            "password": "pass",
+            "is_external": "yes",
+            "url": "http://foo.bar",
+            "tags": ["a", "b:c"],
+        }
+
+        c = check.get_instance_config(instance)
+        self.assertEquals(c.username, "user")
+        self.assertEquals(c.password, "pass")
+        self.assertEquals(c.cluster_stats, True)
+        self.assertEquals(c.url, "http://foo.bar")
+        self.assertEquals(c.tags, ["url:http://foo.bar", "a", "b:c"])
+        self.assertEquals(c.timeout, check.DEFAULT_TIMEOUT)
+        self.assertEquals(c.service_check_tags,
+                          ["host:foo.bar", "port:None", "a", "b:c"])
+
+        instance = {
+            "url": "http://192.168.42.42:12999",
+            "timeout": 15
+        }
+
+        c = check.get_instance_config(instance)
+        self.assertEquals(c.username, None)
+        self.assertEquals(c.password, None)
+        self.assertEquals(c.cluster_stats, False)
+        self.assertEquals(c.url, "http://192.168.42.42:12999")
+        self.assertEquals(c.tags, ["url:http://192.168.42.42:12999"])
+        self.assertEquals(c.timeout, 15)
+        self.assertEquals(c.service_check_tags,
+                          ["host:192.168.42.42", "port:12999"])
+
+        instance = {
+            "username": "user",
+            "password": "pass",
+            "url": "https://foo.bar:9200",
+            "ssl_verify": "true",
+            "ssl_cert": "/path/to/cert.pem",
+            "ssl_key": "/path/to/cert.key",
+        }
+
+        c = check.get_instance_config(instance)
+        self.assertEquals(c.username, "user")
+        self.assertEquals(c.password, "pass")
+        self.assertEquals(c.cluster_stats, False)
+        self.assertEquals(c.url, "https://foo.bar:9200")
+        self.assertEquals(c.tags, ["url:https://foo.bar:9200"])
+        self.assertEquals(c.timeout, check.DEFAULT_TIMEOUT)
+        self.assertEquals(c.service_check_tags, ["host:foo.bar", "port:9200"])
+        self.assertEquals(c.ssl_verify, "true")
+        self.assertEquals(c.ssl_cert, "/path/to/cert.pem")
+        self.assertEquals(c.ssl_key, "/path/to/cert.key")
+
+    def test_health_event(self):
+        dummy_tags = ['foo:bar', 'elastique:recherche']
+        server_tags = ['cluster_name:elasticsearch']
+
+        config = {'instances': [
+            {'url': 'http://localhost:9200', 'tags': dummy_tags}
+        ]}
+
+        # Should be yellow at first
+        requests.put('http://localhost:9200/_settings', data='{"index": {"number_of_replicas": 1}}')
+        self.run_check(config)
+
+        self.assertEquals(len(self.events), 1)
+        self.assertIn('yellow', self.events[0]['msg_title'])
+        self.assertEquals(
+            ['url:http://localhost:9200'] + dummy_tags + server_tags,
+            self.events[0]['tags']
+        )
+        self.assertServiceCheckWarning(
+            'elasticsearch.cluster_health',
+            tags=['host:localhost', 'port:9200'] + dummy_tags,
+            count=1
+        )
+
+        # Set the number of replicas to 0 for all indices
+        requests.put('http://localhost:9200/_settings', data='{"index": {"number_of_replicas": 0}}')
+        time.sleep(5)
+        # Now shards should be green
+        self.run_check(config)
+
+        self.assertEquals(len(self.events), 1)
+        self.assertIn('green', self.events[0]['msg_title'])
+        self.assertEquals(
+            ['url:http://localhost:9200'] + dummy_tags + server_tags,
+            self.events[0]['tags']
+        )
+        self.assertServiceCheckOK(
+            'elasticsearch.cluster_health',
+            tags=['host:localhost', 'port:9200'] + dummy_tags,
+            count=1
+        )
+
+    def test_pshard_metrics(self):
+        """ Tests that the pshard-related metrics are forwarded and that the
+        document count for primary indexes is half the global document count
+        when "number_of_replicas" is set to 1 """
+        elastic_latency = 10
+
+        config = {'instances': [
+            {'url': 'http://localhost:9200', 'pshard_stats': True}
+        ]}
+        # Cleaning up everything won't hurt.
+        req = requests.get('http://localhost:9200/_cat/indices?v')
+        # Skip the header row and the trailing empty line of the _cat output.
+        indices_info = req.text.split('\n')[1:-1]
+        for index_info in indices_info:
+            index_name = index_info.split()[1]
+            requests.delete('http://localhost:9200/' + index_name)
+
+        requests.put('http://localhost:9200/_settings', data='{"index": {"number_of_replicas": 1}}')
+        requests.put('http://localhost:9200/testindex/testtype/2', data='{"name": "Jane Doe", "age": 27}')
+        requests.put('http://localhost:9200/testindex/testtype/1', data='{"name": "John Doe", "age": 42}')
+
+        time.sleep(elastic_latency)
+
+        self.run_check(config)
+
+        pshard_stats_metrics = dict(PRIMARY_SHARD_METRICS)
+        if get_es_version() >= [1, 0, 0]:
+            pshard_stats_metrics.update(PRIMARY_SHARD_METRICS_POST_1_0)
+
+        for m_name, desc in pshard_stats_metrics.iteritems():
+            if desc[0] == "gauge":
+                self.assertMetric(m_name, count=1)
+
+        # Our pshard metrics are getting sent, let's check that they're accurate
+        # Note: please make sure you don't install Maven on the CI for future
+        # Elasticsearch CI integrations. It would make the line below fail :/
+        self.assertMetric('elasticsearch.primaries.docs.count', value=2)
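+        # Worked example (editorial note): with two documents indexed and
+        # "number_of_replicas" set to 1, each primary shard has one replica,
+        # so _all.total.docs.count would report 4 while
+        # _all.primaries.docs.count reports 2 -- hence value=2 above.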