From 04a7c909e43df5c4ab6d247ba18572ebd4dbcde5 Mon Sep 17 00:00:00 2001 From: YongGang Date: Fri, 16 Jun 2023 12:48:51 -0700 Subject: [PATCH 01/12] Add OverlordStatusMonitor and CoordinatorStatusMonitor to monitor service leader status --- docs/configuration/index.md | 40 +++++++------ docs/operations/metrics.md | 12 ++++ .../common/stats/OverlordStatusMonitor.java | 49 +++++++++++++++ .../stats/OverlordStatusMonitorTest.java | 41 +++++++++++++ .../metrics/CoordinatorStatusMonitor.java | 49 +++++++++++++++ .../metrics/CoordinatorStatusMonitorTest.java | 60 +++++++++++++++++++ 6 files changed, 232 insertions(+), 19 deletions(-) create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitor.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitorTest.java create mode 100644 server/src/main/java/org/apache/druid/server/metrics/CoordinatorStatusMonitor.java create mode 100644 server/src/test/java/org/apache/druid/server/metrics/CoordinatorStatusMonitorTest.java diff --git a/docs/configuration/index.md b/docs/configuration/index.md index 37f237d75682..86f6b3f763a5 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -380,25 +380,27 @@ You can configure Druid processes to emit [metrics](../operations/metrics.md) re Metric monitoring is an essential part of Druid operations. The following monitors are available: -|Name|Description| -|----|-----------| -|`org.apache.druid.client.cache.CacheMonitor`|Emits metrics (to logs) about the segment results cache for Historical and Broker processes. Reports typical cache statistics include hits, misses, rates, and size (bytes and number of entries), as well as timeouts and and errors.| -|`org.apache.druid.java.util.metrics.SysMonitor`|Reports on various system activities and statuses using the [SIGAR library](https://github.com/hyperic/sigar). Requires execute privileges on files in `java.io.tmpdir`. Do not set `java.io.tmpdir` to `noexec` when using `SysMonitor`.| -|`org.apache.druid.java.util.metrics.JvmMonitor`|Reports various JVM-related statistics.| -|`org.apache.druid.java.util.metrics.JvmCpuMonitor`|Reports statistics of CPU consumption by the JVM.| -|`org.apache.druid.java.util.metrics.CpuAcctDeltaMonitor`|Reports consumed CPU as per the cpuacct cgroup.| -|`org.apache.druid.java.util.metrics.JvmThreadsMonitor`|Reports Thread statistics in the JVM, like numbers of total, daemon, started, died threads.| -|`org.apache.druid.java.util.metrics.CgroupCpuMonitor`|Reports CPU shares and quotas as per the `cpu` cgroup.| -|`org.apache.druid.java.util.metrics.CgroupCpuSetMonitor`|Reports CPU core/HT and memory node allocations as per the `cpuset` cgroup.| -|`org.apache.druid.java.util.metrics.CgroupMemoryMonitor`|Reports memory statistic as per the memory cgroup.| -|`org.apache.druid.server.metrics.EventReceiverFirehoseMonitor`|Reports how many events have been queued in the EventReceiverFirehose.| -|`org.apache.druid.server.metrics.HistoricalMetricsMonitor`|Reports statistics on Historical processes. Available only on Historical processes.| -|`org.apache.druid.server.metrics.SegmentStatsMonitor` | **EXPERIMENTAL** Reports statistics about segments on Historical processes. Available only on Historical processes. Not to be used when lazy loading is configured. | -|`org.apache.druid.server.metrics.QueryCountStatsMonitor`|Reports how many queries have been successful/failed/interrupted.| -|`org.apache.druid.server.emitter.HttpEmittingMonitor`|Reports internal metrics of `http` or `parametrized` emitter (see below). Must not be used with another emitter type. See the description of the metrics here: https://github.com/apache/druid/pull/4973.| -|`org.apache.druid.server.metrics.TaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting and also the number of successful/failed tasks per emission period.| -|`org.apache.druid.server.metrics.TaskSlotCountStatsMonitor`|Reports metrics about task slot usage per emission period.| -|`org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by middleManager node types.| +| Name | Description | +|----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `org.apache.druid.client.cache.CacheMonitor` | Emits metrics (to logs) about the segment results cache for Historical and Broker processes. Reports typical cache statistics include hits, misses, rates, and size (bytes and number of entries), as well as timeouts and and errors. | +| `org.apache.druid.java.util.metrics.SysMonitor` | Reports on various system activities and statuses using the [SIGAR library](https://github.com/hyperic/sigar). Requires execute privileges on files in `java.io.tmpdir`. Do not set `java.io.tmpdir` to `noexec` when using `SysMonitor`. | +| `org.apache.druid.java.util.metrics.JvmMonitor` | Reports various JVM-related statistics. | +| `org.apache.druid.java.util.metrics.JvmCpuMonitor` | Reports statistics of CPU consumption by the JVM. | +| `org.apache.druid.java.util.metrics.CpuAcctDeltaMonitor` | Reports consumed CPU as per the cpuacct cgroup. | +| `org.apache.druid.java.util.metrics.JvmThreadsMonitor` | Reports Thread statistics in the JVM, like numbers of total, daemon, started, died threads. | +| `org.apache.druid.java.util.metrics.CgroupCpuMonitor` | Reports CPU shares and quotas as per the `cpu` cgroup. | +| `org.apache.druid.java.util.metrics.CgroupCpuSetMonitor` | Reports CPU core/HT and memory node allocations as per the `cpuset` cgroup. | +| `org.apache.druid.java.util.metrics.CgroupMemoryMonitor` | Reports memory statistic as per the memory cgroup. | +| `org.apache.druid.server.metrics.EventReceiverFirehoseMonitor` | Reports how many events have been queued in the EventReceiverFirehose. | +| `org.apache.druid.server.metrics.HistoricalMetricsMonitor` | Reports statistics on Historical processes. Available only on Historical processes. | +| `org.apache.druid.server.metrics.SegmentStatsMonitor` | **EXPERIMENTAL** Reports statistics about segments on Historical processes. Available only on Historical processes. Not to be used when lazy loading is configured. | +| `org.apache.druid.server.metrics.QueryCountStatsMonitor` | Reports how many queries have been successful/failed/interrupted. | +| `org.apache.druid.server.emitter.HttpEmittingMonitor` | Reports internal metrics of `http` or `parametrized` emitter (see below). Must not be used with another emitter type. See the description of the metrics here: https://github.com/apache/druid/pull/4973. | +| `org.apache.druid.server.metrics.TaskCountStatsMonitor` | Reports how many ingestion tasks are currently running/pending/waiting and also the number of successful/failed tasks per emission period. | +| `org.apache.druid.server.metrics.TaskSlotCountStatsMonitor` | Reports metrics about task slot usage per emission period. | +| `org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor` | Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by middleManager node types. | +| `org.apache.druid.indexing.common.stats.OverlordStatusMonitor` | Reports overlord leader count. Only supported by overlord node types. | +| `org.apache.druid.server.metrics.CoordinatorStatusMonitor` | Reports coordinator leader count. Only supported by coordinator node types. | For example, you might configure monitors on all processes for system and JVM information within `common.runtime.properties` as follows: diff --git a/docs/operations/metrics.md b/docs/operations/metrics.md index f85cbec77dfa..32cf64519022 100644 --- a/docs/operations/metrics.md +++ b/docs/operations/metrics.md @@ -329,6 +329,18 @@ If `emitBalancingStats` is set to `true` in the Coordinator [dynamic configurati ## General Health +### Overlord + +| Metric | Description | Dimensions | Normal Value | +|----------------|-------------------------|---------------|--------------| +| `leader/count` | Overlord leader count. `OverlordStatusMonitor` must be enabled. | `serviceType` | 1 | + +### Coordinator + +| Metric | Description | Dimensions | Normal Value | +|----------------|----------------------------|---------------|--------------| +| `leader/count` | Coordinator leader count. `CoordinatorStatusMonitor` must be enabled. | `serviceType` | 1 | + ### Historical |Metric|Description|Dimensions|Normal Value| diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitor.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitor.java new file mode 100644 index 000000000000..96ca0a8aa948 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitor.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.stats; + +import javax.inject.Inject; +import org.apache.druid.indexing.overlord.TaskMaster; +import org.apache.druid.java.util.emitter.service.ServiceEmitter; +import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; +import org.apache.druid.java.util.metrics.AbstractMonitor; + +/** + * Monitor Overlord running status. + */ +public class OverlordStatusMonitor extends AbstractMonitor { + + private final TaskMaster taskMaster; + + @Inject + public OverlordStatusMonitor(TaskMaster taskMaster) { + this.taskMaster = taskMaster; + } + + @Override + public boolean doMonitor(ServiceEmitter emitter) { + final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); + + builder.setDimension("serviceType", "overlord"); + emitter.emit(builder.build("leader/count", taskMaster.isLeader() ? 1 : 0)); + + return true; + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitorTest.java new file mode 100644 index 000000000000..8d1bd41947c4 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitorTest.java @@ -0,0 +1,41 @@ +package org.apache.druid.indexing.common.stats; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import org.apache.druid.indexing.overlord.TaskMaster; +import org.apache.druid.java.util.metrics.StubServiceEmitter; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class OverlordStatusMonitorTest { + + private TaskMaster taskMaster; + private OverlordStatusMonitor monitor; + + @Before + public void setUp() throws Exception { + taskMaster = mock(TaskMaster.class); + + monitor = new OverlordStatusMonitor(taskMaster); + } + + @Test + public void testLeaderCount() { + when(taskMaster.isLeader()).thenReturn(false); + final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); + monitor.doMonitor(emitter); + + Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals("leader/count", emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals(0, emitter.getEvents().get(0).toMap().get("value")); + + when(taskMaster.isLeader()).thenReturn(true); + emitter.flush(); + monitor.doMonitor(emitter); + Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals("leader/count", emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); + } +} \ No newline at end of file diff --git a/server/src/main/java/org/apache/druid/server/metrics/CoordinatorStatusMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/CoordinatorStatusMonitor.java new file mode 100644 index 000000000000..c0f53141fdbe --- /dev/null +++ b/server/src/main/java/org/apache/druid/server/metrics/CoordinatorStatusMonitor.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.server.metrics; + +import javax.inject.Inject; +import org.apache.druid.java.util.emitter.service.ServiceEmitter; +import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; +import org.apache.druid.java.util.metrics.AbstractMonitor; +import org.apache.druid.server.coordinator.DruidCoordinator; + +/** + * Monitor Coordinator running status. + */ +public class CoordinatorStatusMonitor extends AbstractMonitor { + + private final DruidCoordinator coordinator; + + @Inject + public CoordinatorStatusMonitor(DruidCoordinator coordinator) { + this.coordinator = coordinator; + } + + @Override + public boolean doMonitor(ServiceEmitter emitter) { + final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); + + builder.setDimension("serviceType", "coordinator"); + emitter.emit(builder.build("leader/count", coordinator.isLeader() ? 1 : 0)); + + return true; + } +} diff --git a/server/src/test/java/org/apache/druid/server/metrics/CoordinatorStatusMonitorTest.java b/server/src/test/java/org/apache/druid/server/metrics/CoordinatorStatusMonitorTest.java new file mode 100644 index 000000000000..96731047fcbe --- /dev/null +++ b/server/src/test/java/org/apache/druid/server/metrics/CoordinatorStatusMonitorTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.server.metrics; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import org.apache.druid.java.util.metrics.StubServiceEmitter; +import org.apache.druid.server.coordinator.DruidCoordinator; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class CoordinatorStatusMonitorTest { + + private DruidCoordinator coordinator; + private CoordinatorStatusMonitor monitor; + + @Before + public void setUp() throws Exception { + coordinator = mock(DruidCoordinator.class); + + monitor = new CoordinatorStatusMonitor(coordinator); + } + + @Test + public void testLeaderCount() { + when(coordinator.isLeader()).thenReturn(false); + final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); + monitor.doMonitor(emitter); + + Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals("leader/count", emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals(0, emitter.getEvents().get(0).toMap().get("value")); + + when(coordinator.isLeader()).thenReturn(true); + emitter.flush(); + monitor.doMonitor(emitter); + Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals("leader/count", emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); + } +} \ No newline at end of file From cf43ef92cada8ea3e79bb95e975ddd644686e4a5 Mon Sep 17 00:00:00 2001 From: YongGang Date: Mon, 19 Jun 2023 22:11:22 -0700 Subject: [PATCH 02/12] make the monitor more general --- docs/configuration/index.md | 3 +- docs/operations/metrics.md | 14 ++---- .../common/stats/OverlordStatusMonitor.java | 49 ------------------- .../druid/indexing/overlord/TaskMaster.java | 14 +++++- .../stats/OverlordStatusMonitorTest.java | 41 ---------------- ...Monitor.java => ServiceStatusMonitor.java} | 17 +++---- .../server/metrics/ServiceStatusProvider.java | 31 ++++++++++++ ...est.java => ServiceStatusMonitorTest.java} | 19 ++++--- .../org/apache/druid/cli/CliCoordinator.java | 2 + .../org/apache/druid/cli/CliOverlord.java | 2 + 10 files changed, 70 insertions(+), 122 deletions(-) delete mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitor.java delete mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitorTest.java rename server/src/main/java/org/apache/druid/server/metrics/{CoordinatorStatusMonitor.java => ServiceStatusMonitor.java} (73%) create mode 100644 server/src/main/java/org/apache/druid/server/metrics/ServiceStatusProvider.java rename server/src/test/java/org/apache/druid/server/metrics/{CoordinatorStatusMonitorTest.java => ServiceStatusMonitorTest.java} (73%) diff --git a/docs/configuration/index.md b/docs/configuration/index.md index 86f6b3f763a5..bc9d7d0aa0e7 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -399,8 +399,7 @@ Metric monitoring is an essential part of Druid operations. The following monit | `org.apache.druid.server.metrics.TaskCountStatsMonitor` | Reports how many ingestion tasks are currently running/pending/waiting and also the number of successful/failed tasks per emission period. | | `org.apache.druid.server.metrics.TaskSlotCountStatsMonitor` | Reports metrics about task slot usage per emission period. | | `org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor` | Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by middleManager node types. | -| `org.apache.druid.indexing.common.stats.OverlordStatusMonitor` | Reports overlord leader count. Only supported by overlord node types. | -| `org.apache.druid.server.metrics.CoordinatorStatusMonitor` | Reports coordinator leader count. Only supported by coordinator node types. | +| `org.apache.druid.server.metrics.ServiceStatusMonitor` | Reports service heartbeat. For overlord/coordinator, the number is leader count. Only supported by overlord/coordinator node types. | For example, you might configure monitors on all processes for system and JVM information within `common.runtime.properties` as follows: diff --git a/docs/operations/metrics.md b/docs/operations/metrics.md index 32cf64519022..9c075ae06203 100644 --- a/docs/operations/metrics.md +++ b/docs/operations/metrics.md @@ -329,17 +329,11 @@ If `emitBalancingStats` is set to `true` in the Coordinator [dynamic configurati ## General Health -### Overlord +### Overlord/Coordinator -| Metric | Description | Dimensions | Normal Value | -|----------------|-------------------------|---------------|--------------| -| `leader/count` | Overlord leader count. `OverlordStatusMonitor` must be enabled. | `serviceType` | 1 | - -### Coordinator - -| Metric | Description | Dimensions | Normal Value | -|----------------|----------------------------|---------------|--------------| -| `leader/count` | Coordinator leader count. `CoordinatorStatusMonitor` must be enabled. | `serviceType` | 1 | +| Metric | Description | Dimensions | Normal Value | +|----------------|---------------------------------------------------------------------------------------------------------------|-----------------|--------------| +| `druid/heartbeat` | Report Overlord/Coordinator service health by looking at leader count. `ServiceStatusMonitor` must be enabled. | `heartbeatType` | 1 | ### Historical diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitor.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitor.java deleted file mode 100644 index 96ca0a8aa948..000000000000 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitor.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.indexing.common.stats; - -import javax.inject.Inject; -import org.apache.druid.indexing.overlord.TaskMaster; -import org.apache.druid.java.util.emitter.service.ServiceEmitter; -import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; -import org.apache.druid.java.util.metrics.AbstractMonitor; - -/** - * Monitor Overlord running status. - */ -public class OverlordStatusMonitor extends AbstractMonitor { - - private final TaskMaster taskMaster; - - @Inject - public OverlordStatusMonitor(TaskMaster taskMaster) { - this.taskMaster = taskMaster; - } - - @Override - public boolean doMonitor(ServiceEmitter emitter) { - final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); - - builder.setDimension("serviceType", "overlord"); - emitter.emit(builder.build("leader/count", taskMaster.isLeader() ? 1 : 0)); - - return true; - } -} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/TaskMaster.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/TaskMaster.java index 1eab403585fd..2eaa667c6a10 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/TaskMaster.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/TaskMaster.java @@ -43,6 +43,7 @@ import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.server.DruidNode; import org.apache.druid.server.coordinator.CoordinatorOverlordServiceConfig; +import org.apache.druid.server.metrics.ServiceStatusProvider; import org.apache.druid.server.metrics.TaskCountStatsProvider; import org.apache.druid.server.metrics.TaskSlotCountStatsProvider; @@ -54,7 +55,8 @@ /** * Encapsulates the indexer leadership lifecycle. */ -public class TaskMaster implements TaskCountStatsProvider, TaskSlotCountStatsProvider +public class TaskMaster implements TaskCountStatsProvider, TaskSlotCountStatsProvider, + ServiceStatusProvider { private static final EmittingLogger log = new EmittingLogger(TaskMaster.class); @@ -433,4 +435,14 @@ public Map getBlacklistedTaskSlotCount() return null; } } + + @Override + public String heartbeatType() { + return "leader"; + } + + @Override + public int heartbeat() { + return isLeader() ? 1 : 0; + } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitorTest.java deleted file mode 100644 index 8d1bd41947c4..000000000000 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/stats/OverlordStatusMonitorTest.java +++ /dev/null @@ -1,41 +0,0 @@ -package org.apache.druid.indexing.common.stats; - -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import org.apache.druid.indexing.overlord.TaskMaster; -import org.apache.druid.java.util.metrics.StubServiceEmitter; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -public class OverlordStatusMonitorTest { - - private TaskMaster taskMaster; - private OverlordStatusMonitor monitor; - - @Before - public void setUp() throws Exception { - taskMaster = mock(TaskMaster.class); - - monitor = new OverlordStatusMonitor(taskMaster); - } - - @Test - public void testLeaderCount() { - when(taskMaster.isLeader()).thenReturn(false); - final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); - monitor.doMonitor(emitter); - - Assert.assertEquals(1, emitter.getEvents().size()); - Assert.assertEquals("leader/count", emitter.getEvents().get(0).toMap().get("metric")); - Assert.assertEquals(0, emitter.getEvents().get(0).toMap().get("value")); - - when(taskMaster.isLeader()).thenReturn(true); - emitter.flush(); - monitor.doMonitor(emitter); - Assert.assertEquals(1, emitter.getEvents().size()); - Assert.assertEquals("leader/count", emitter.getEvents().get(0).toMap().get("metric")); - Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); - } -} \ No newline at end of file diff --git a/server/src/main/java/org/apache/druid/server/metrics/CoordinatorStatusMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java similarity index 73% rename from server/src/main/java/org/apache/druid/server/metrics/CoordinatorStatusMonitor.java rename to server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java index c0f53141fdbe..fdae1ccb8856 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/CoordinatorStatusMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java @@ -23,26 +23,25 @@ import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; import org.apache.druid.java.util.metrics.AbstractMonitor; -import org.apache.druid.server.coordinator.DruidCoordinator; /** - * Monitor Coordinator running status. + * Monitor service running status. + * For Overlord/Coordinator, the metric reported is service leader count. */ -public class CoordinatorStatusMonitor extends AbstractMonitor { +public class ServiceStatusMonitor extends AbstractMonitor { - private final DruidCoordinator coordinator; + private final ServiceStatusProvider provider; @Inject - public CoordinatorStatusMonitor(DruidCoordinator coordinator) { - this.coordinator = coordinator; + public ServiceStatusMonitor(ServiceStatusProvider provider) { + this.provider = provider; } @Override public boolean doMonitor(ServiceEmitter emitter) { final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); - - builder.setDimension("serviceType", "coordinator"); - emitter.emit(builder.build("leader/count", coordinator.isLeader() ? 1 : 0)); + builder.setDimension("heartbeatType", provider.heartbeatType()); + emitter.emit(builder.build("druid/heartbeat", provider.heartbeat())); return true; } diff --git a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusProvider.java b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusProvider.java new file mode 100644 index 000000000000..55b225f2be28 --- /dev/null +++ b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusProvider.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.server.metrics; + +/** + * Monitor service running status. + */ +public interface ServiceStatusProvider { + + String heartbeatType(); + + int heartbeat(); + +} diff --git a/server/src/test/java/org/apache/druid/server/metrics/CoordinatorStatusMonitorTest.java b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java similarity index 73% rename from server/src/test/java/org/apache/druid/server/metrics/CoordinatorStatusMonitorTest.java rename to server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java index 96731047fcbe..e6253aaa2db9 100644 --- a/server/src/test/java/org/apache/druid/server/metrics/CoordinatorStatusMonitorTest.java +++ b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java @@ -23,38 +23,37 @@ import static org.mockito.Mockito.when; import org.apache.druid.java.util.metrics.StubServiceEmitter; -import org.apache.druid.server.coordinator.DruidCoordinator; import org.junit.Assert; import org.junit.Before; import org.junit.Test; -public class CoordinatorStatusMonitorTest { +public class ServiceStatusMonitorTest { - private DruidCoordinator coordinator; - private CoordinatorStatusMonitor monitor; + private ServiceStatusProvider provider; + private ServiceStatusMonitor monitor; @Before public void setUp() throws Exception { - coordinator = mock(DruidCoordinator.class); + provider = mock(ServiceStatusProvider.class); - monitor = new CoordinatorStatusMonitor(coordinator); + monitor = new ServiceStatusMonitor(provider); } @Test public void testLeaderCount() { - when(coordinator.isLeader()).thenReturn(false); + when(provider.heartbeat()).thenReturn(0); final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); monitor.doMonitor(emitter); Assert.assertEquals(1, emitter.getEvents().size()); - Assert.assertEquals("leader/count", emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals("druid/heartbeat", emitter.getEvents().get(0).toMap().get("metric")); Assert.assertEquals(0, emitter.getEvents().get(0).toMap().get("value")); - when(coordinator.isLeader()).thenReturn(true); + when(provider.heartbeat()).thenReturn(1); emitter.flush(); monitor.doMonitor(emitter); Assert.assertEquals(1, emitter.getEvents().size()); - Assert.assertEquals("leader/count", emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals("druid/heartbeat", emitter.getEvents().get(0).toMap().get("metric")); Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); } } \ No newline at end of file diff --git a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java index 2810359f4909..ee9c4d25fc4e 100644 --- a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java +++ b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java @@ -114,6 +114,7 @@ import org.apache.druid.server.initialization.jetty.JettyServerInitializer; import org.apache.druid.server.lookup.cache.LookupCoordinatorManager; import org.apache.druid.server.lookup.cache.LookupCoordinatorManagerConfig; +import org.apache.druid.server.metrics.ServiceStatusProvider; import org.apache.druid.server.router.TieredBrokerConfig; import org.eclipse.jetty.server.Server; import org.joda.time.Duration; @@ -222,6 +223,7 @@ public void configure(Binder binder) binder.bind(LookupCoordinatorManager.class).in(LazySingleton.class); binder.bind(CoordinatorServerView.class); binder.bind(DruidCoordinator.class); + binder.bind(ServiceStatusProvider.class).to(DruidCoordinator.class); LifecycleModule.register(binder, CoordinatorServerView.class); LifecycleModule.register(binder, MetadataStorage.class); diff --git a/services/src/main/java/org/apache/druid/cli/CliOverlord.java b/services/src/main/java/org/apache/druid/cli/CliOverlord.java index b67d6592e2e2..eb303d9a00bf 100644 --- a/services/src/main/java/org/apache/druid/cli/CliOverlord.java +++ b/services/src/main/java/org/apache/druid/cli/CliOverlord.java @@ -117,6 +117,7 @@ import org.apache.druid.server.initialization.ServerConfig; import org.apache.druid.server.initialization.jetty.JettyServerInitUtils; import org.apache.druid.server.initialization.jetty.JettyServerInitializer; +import org.apache.druid.server.metrics.ServiceStatusProvider; import org.apache.druid.server.metrics.TaskCountStatsProvider; import org.apache.druid.server.metrics.TaskSlotCountStatsProvider; import org.apache.druid.server.security.AuthConfig; @@ -197,6 +198,7 @@ public void configure(Binder binder) binder.bind(TaskMaster.class).in(ManageLifecycle.class); binder.bind(TaskCountStatsProvider.class).to(TaskMaster.class); binder.bind(TaskSlotCountStatsProvider.class).to(TaskMaster.class); + binder.bind(ServiceStatusProvider.class).to(TaskMaster.class); binder.bind(TaskLogStreamer.class) .to(SwitchingTaskLogStreamer.class) From 45a6b8d5141d6a743a9002001dd56272fe6b38e3 Mon Sep 17 00:00:00 2001 From: YongGang Date: Mon, 19 Jun 2023 22:25:12 -0700 Subject: [PATCH 03/12] resolve conflict --- .../druid/server/coordinator/DruidCoordinator.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/apache/druid/server/coordinator/DruidCoordinator.java b/server/src/main/java/org/apache/druid/server/coordinator/DruidCoordinator.java index 0c5e79bdcbe2..682ec835f6b0 100644 --- a/server/src/main/java/org/apache/druid/server/coordinator/DruidCoordinator.java +++ b/server/src/main/java/org/apache/druid/server/coordinator/DruidCoordinator.java @@ -82,6 +82,7 @@ import org.apache.druid.server.coordinator.stats.RowKey; import org.apache.druid.server.coordinator.stats.Stats; import org.apache.druid.server.lookup.cache.LookupCoordinatorManager; +import org.apache.druid.server.metrics.ServiceStatusProvider; import org.apache.druid.timeline.DataSegment; import org.apache.druid.timeline.SegmentId; import org.joda.time.DateTime; @@ -105,7 +106,7 @@ * */ @ManageLifecycle -public class DruidCoordinator +public class DruidCoordinator implements ServiceStatusProvider { /** * Orders newest segments (i.e. segments with most recent intervals) first. @@ -646,6 +647,16 @@ private List makeCompactSegmentsDuty() return ImmutableList.of(compactSegments); } + @Override + public String heartbeatType() { + return "leader"; + } + + @Override + public int heartbeat() { + return isLeader() ? 1 : 0; + } + private class DutiesRunnable implements Runnable { private final DateTime coordinatorStartTime = DateTimes.nowUtc(); From 4312ce8f27223c2e758a8473cde21f1a68cb5c75 Mon Sep 17 00:00:00 2001 From: YongGang Date: Tue, 20 Jun 2023 22:21:43 -0700 Subject: [PATCH 04/12] use Supplier pattern to provide metrics --- .../druid/indexing/overlord/TaskMaster.java | 14 +-------- .../server/coordinator/DruidCoordinator.java | 13 +------- .../server/metrics/ServiceStatusMonitor.java | 24 +++++++++----- .../server/metrics/ServiceStatusProvider.java | 31 ------------------- .../metrics/ServiceStatusMonitorTest.java | 19 ++++++------ .../org/apache/druid/cli/CliCoordinator.java | 18 +++++++++-- .../org/apache/druid/cli/CliOverlord.java | 18 +++++++++-- 7 files changed, 61 insertions(+), 76 deletions(-) delete mode 100644 server/src/main/java/org/apache/druid/server/metrics/ServiceStatusProvider.java diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/TaskMaster.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/TaskMaster.java index 2eaa667c6a10..1eab403585fd 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/TaskMaster.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/TaskMaster.java @@ -43,7 +43,6 @@ import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.server.DruidNode; import org.apache.druid.server.coordinator.CoordinatorOverlordServiceConfig; -import org.apache.druid.server.metrics.ServiceStatusProvider; import org.apache.druid.server.metrics.TaskCountStatsProvider; import org.apache.druid.server.metrics.TaskSlotCountStatsProvider; @@ -55,8 +54,7 @@ /** * Encapsulates the indexer leadership lifecycle. */ -public class TaskMaster implements TaskCountStatsProvider, TaskSlotCountStatsProvider, - ServiceStatusProvider +public class TaskMaster implements TaskCountStatsProvider, TaskSlotCountStatsProvider { private static final EmittingLogger log = new EmittingLogger(TaskMaster.class); @@ -435,14 +433,4 @@ public Map getBlacklistedTaskSlotCount() return null; } } - - @Override - public String heartbeatType() { - return "leader"; - } - - @Override - public int heartbeat() { - return isLeader() ? 1 : 0; - } } diff --git a/server/src/main/java/org/apache/druid/server/coordinator/DruidCoordinator.java b/server/src/main/java/org/apache/druid/server/coordinator/DruidCoordinator.java index 682ec835f6b0..0c5e79bdcbe2 100644 --- a/server/src/main/java/org/apache/druid/server/coordinator/DruidCoordinator.java +++ b/server/src/main/java/org/apache/druid/server/coordinator/DruidCoordinator.java @@ -82,7 +82,6 @@ import org.apache.druid.server.coordinator.stats.RowKey; import org.apache.druid.server.coordinator.stats.Stats; import org.apache.druid.server.lookup.cache.LookupCoordinatorManager; -import org.apache.druid.server.metrics.ServiceStatusProvider; import org.apache.druid.timeline.DataSegment; import org.apache.druid.timeline.SegmentId; import org.joda.time.DateTime; @@ -106,7 +105,7 @@ * */ @ManageLifecycle -public class DruidCoordinator implements ServiceStatusProvider +public class DruidCoordinator { /** * Orders newest segments (i.e. segments with most recent intervals) first. @@ -647,16 +646,6 @@ private List makeCompactSegmentsDuty() return ImmutableList.of(compactSegments); } - @Override - public String heartbeatType() { - return "leader"; - } - - @Override - public int heartbeat() { - return isLeader() ? 1 : 0; - } - private class DutiesRunnable implements Runnable { private final DateTime coordinatorStartTime = DateTimes.nowUtc(); diff --git a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java index fdae1ccb8856..7630834447ab 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java @@ -19,30 +19,40 @@ package org.apache.druid.server.metrics; -import javax.inject.Inject; +import com.google.common.base.Supplier; +import com.google.inject.Inject; +import com.google.inject.name.Named; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; import org.apache.druid.java.util.metrics.AbstractMonitor; +import java.util.Map; + /** * Monitor service running status. * For Overlord/Coordinator, the metric reported is service leader count. */ public class ServiceStatusMonitor extends AbstractMonitor { - private final ServiceStatusProvider provider; + @Named("heartbeat") + @Inject(optional = true) + Supplier> heartbeatTagsSupplier = null; @Inject - public ServiceStatusMonitor(ServiceStatusProvider provider) { - this.provider = provider; + public ServiceStatusMonitor() { } @Override public boolean doMonitor(ServiceEmitter emitter) { - final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); - builder.setDimension("heartbeatType", provider.heartbeatType()); - emitter.emit(builder.build("druid/heartbeat", provider.heartbeat())); + if (heartbeatTagsSupplier == null) { + return true; + } + heartbeatTagsSupplier.get().forEach((k, v) -> { + final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); + builder.setDimension("heartbeatType", k); + emitter.emit(builder.build("druid/heartbeat", v)); + }); return true; } } diff --git a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusProvider.java b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusProvider.java deleted file mode 100644 index 55b225f2be28..000000000000 --- a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusProvider.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.server.metrics; - -/** - * Monitor service running status. - */ -public interface ServiceStatusProvider { - - String heartbeatType(); - - int heartbeat(); - -} diff --git a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java index e6253aaa2db9..756164a9df9c 100644 --- a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java +++ b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java @@ -19,29 +19,30 @@ package org.apache.druid.server.metrics; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - +import com.google.common.base.Supplier; import org.apache.druid.java.util.metrics.StubServiceEmitter; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import java.util.HashMap; +import java.util.Map; + public class ServiceStatusMonitorTest { - private ServiceStatusProvider provider; private ServiceStatusMonitor monitor; + private Map heartbeatTags = new HashMap<>(); + private Supplier> heartbeatTagsSupplier = () -> heartbeatTags; @Before public void setUp() throws Exception { - provider = mock(ServiceStatusProvider.class); - - monitor = new ServiceStatusMonitor(provider); + monitor = new ServiceStatusMonitor(); + monitor.heartbeatTagsSupplier = heartbeatTagsSupplier; } @Test public void testLeaderCount() { - when(provider.heartbeat()).thenReturn(0); + heartbeatTags.put("leader", 0); final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); monitor.doMonitor(emitter); @@ -49,7 +50,7 @@ public void testLeaderCount() { Assert.assertEquals("druid/heartbeat", emitter.getEvents().get(0).toMap().get("metric")); Assert.assertEquals(0, emitter.getEvents().get(0).toMap().get("value")); - when(provider.heartbeat()).thenReturn(1); + heartbeatTags.put("leader", 1); emitter.flush(); monitor.doMonitor(emitter); Assert.assertEquals(1, emitter.getEvents().size()); diff --git a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java index ee9c4d25fc4e..1651eaac3ad3 100644 --- a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java +++ b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java @@ -24,6 +24,7 @@ import com.github.rvesse.airline.annotations.Command; import com.google.common.base.Predicates; import com.google.common.base.Strings; +import com.google.common.base.Supplier; import com.google.common.collect.ImmutableSet; import com.google.inject.Binder; import com.google.inject.Inject; @@ -31,6 +32,7 @@ import com.google.inject.Module; import com.google.inject.Provider; import com.google.inject.Provides; +import com.google.inject.name.Named; import com.google.inject.name.Names; import com.google.inject.util.Providers; import org.apache.curator.framework.CuratorFramework; @@ -114,14 +116,15 @@ import org.apache.druid.server.initialization.jetty.JettyServerInitializer; import org.apache.druid.server.lookup.cache.LookupCoordinatorManager; import org.apache.druid.server.lookup.cache.LookupCoordinatorManagerConfig; -import org.apache.druid.server.metrics.ServiceStatusProvider; import org.apache.druid.server.router.TieredBrokerConfig; import org.eclipse.jetty.server.Server; import org.joda.time.Duration; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.concurrent.ExecutorService; @@ -223,7 +226,6 @@ public void configure(Binder binder) binder.bind(LookupCoordinatorManager.class).in(LazySingleton.class); binder.bind(CoordinatorServerView.class); binder.bind(DruidCoordinator.class); - binder.bind(ServiceStatusProvider.class).to(DruidCoordinator.class); LifecycleModule.register(binder, CoordinatorServerView.class); LifecycleModule.register(binder, MetadataStorage.class); @@ -372,6 +374,18 @@ public LoadQueueTaskMaster getLoadQueueTaskMaster( zkPaths ); } + + @Provides + @LazySingleton + @Named("heartbeat") + public Supplier> getHeartbeatSupplier(DruidCoordinator coordinator) { + return () -> { + Map heartbeatTags = new HashMap<>(); + heartbeatTags.put("leader", coordinator.isLeader() ? 1 : 0); + + return heartbeatTags; + }; + } } ); diff --git a/services/src/main/java/org/apache/druid/cli/CliOverlord.java b/services/src/main/java/org/apache/druid/cli/CliOverlord.java index eb303d9a00bf..94ff6819a7b4 100644 --- a/services/src/main/java/org/apache/druid/cli/CliOverlord.java +++ b/services/src/main/java/org/apache/druid/cli/CliOverlord.java @@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.github.rvesse.airline.annotations.Command; +import com.google.common.base.Supplier; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.inject.Binder; @@ -32,6 +33,7 @@ import com.google.inject.TypeLiteral; import com.google.inject.multibindings.MapBinder; import com.google.inject.multibindings.Multibinder; +import com.google.inject.name.Named; import com.google.inject.name.Names; import com.google.inject.servlet.GuiceFilter; import com.google.inject.util.Providers; @@ -117,7 +119,6 @@ import org.apache.druid.server.initialization.ServerConfig; import org.apache.druid.server.initialization.jetty.JettyServerInitUtils; import org.apache.druid.server.initialization.jetty.JettyServerInitializer; -import org.apache.druid.server.metrics.ServiceStatusProvider; import org.apache.druid.server.metrics.TaskCountStatsProvider; import org.apache.druid.server.metrics.TaskSlotCountStatsProvider; import org.apache.druid.server.security.AuthConfig; @@ -135,7 +136,9 @@ import org.eclipse.jetty.servlet.ServletContextHandler; import org.eclipse.jetty.servlet.ServletHolder; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Properties; import java.util.Set; @@ -198,7 +201,6 @@ public void configure(Binder binder) binder.bind(TaskMaster.class).in(ManageLifecycle.class); binder.bind(TaskCountStatsProvider.class).to(TaskMaster.class); binder.bind(TaskSlotCountStatsProvider.class).to(TaskMaster.class); - binder.bind(ServiceStatusProvider.class).to(TaskMaster.class); binder.bind(TaskLogStreamer.class) .to(SwitchingTaskLogStreamer.class) @@ -356,6 +358,18 @@ public TaskStorageDirTracker getTaskStorageDirTracker(WorkerConfig workerConfig, { return TaskStorageDirTracker.fromConfigs(workerConfig, taskConfig); } + + @Provides + @LazySingleton + @Named("heartbeat") + public Supplier> getHeartbeatSupplier(TaskMaster taskMaster) { + return () -> { + Map heartbeatTags = new HashMap<>(); + heartbeatTags.put("leader", taskMaster.isLeader() ? 1 : 0); + + return heartbeatTags; + }; + } }; } From 840181ceb484b83b6f1f63025140eec7ccb8e257 Mon Sep 17 00:00:00 2001 From: YongGang Date: Wed, 21 Jun 2023 10:47:29 -0700 Subject: [PATCH 05/12] reformat code and doc --- docs/configuration/index.md | 40 +++++++++---------- docs/operations/metrics.md | 8 ++-- .../server/metrics/ServiceStatusMonitor.java | 13 +++--- .../metrics/ServiceStatusMonitorTest.java | 2 +- 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/docs/configuration/index.md b/docs/configuration/index.md index bc9d7d0aa0e7..f92c329ae7d3 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -380,26 +380,26 @@ You can configure Druid processes to emit [metrics](../operations/metrics.md) re Metric monitoring is an essential part of Druid operations. The following monitors are available: -| Name | Description | -|----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `org.apache.druid.client.cache.CacheMonitor` | Emits metrics (to logs) about the segment results cache for Historical and Broker processes. Reports typical cache statistics include hits, misses, rates, and size (bytes and number of entries), as well as timeouts and and errors. | -| `org.apache.druid.java.util.metrics.SysMonitor` | Reports on various system activities and statuses using the [SIGAR library](https://github.com/hyperic/sigar). Requires execute privileges on files in `java.io.tmpdir`. Do not set `java.io.tmpdir` to `noexec` when using `SysMonitor`. | -| `org.apache.druid.java.util.metrics.JvmMonitor` | Reports various JVM-related statistics. | -| `org.apache.druid.java.util.metrics.JvmCpuMonitor` | Reports statistics of CPU consumption by the JVM. | -| `org.apache.druid.java.util.metrics.CpuAcctDeltaMonitor` | Reports consumed CPU as per the cpuacct cgroup. | -| `org.apache.druid.java.util.metrics.JvmThreadsMonitor` | Reports Thread statistics in the JVM, like numbers of total, daemon, started, died threads. | -| `org.apache.druid.java.util.metrics.CgroupCpuMonitor` | Reports CPU shares and quotas as per the `cpu` cgroup. | -| `org.apache.druid.java.util.metrics.CgroupCpuSetMonitor` | Reports CPU core/HT and memory node allocations as per the `cpuset` cgroup. | -| `org.apache.druid.java.util.metrics.CgroupMemoryMonitor` | Reports memory statistic as per the memory cgroup. | -| `org.apache.druid.server.metrics.EventReceiverFirehoseMonitor` | Reports how many events have been queued in the EventReceiverFirehose. | -| `org.apache.druid.server.metrics.HistoricalMetricsMonitor` | Reports statistics on Historical processes. Available only on Historical processes. | -| `org.apache.druid.server.metrics.SegmentStatsMonitor` | **EXPERIMENTAL** Reports statistics about segments on Historical processes. Available only on Historical processes. Not to be used when lazy loading is configured. | -| `org.apache.druid.server.metrics.QueryCountStatsMonitor` | Reports how many queries have been successful/failed/interrupted. | -| `org.apache.druid.server.emitter.HttpEmittingMonitor` | Reports internal metrics of `http` or `parametrized` emitter (see below). Must not be used with another emitter type. See the description of the metrics here: https://github.com/apache/druid/pull/4973. | -| `org.apache.druid.server.metrics.TaskCountStatsMonitor` | Reports how many ingestion tasks are currently running/pending/waiting and also the number of successful/failed tasks per emission period. | -| `org.apache.druid.server.metrics.TaskSlotCountStatsMonitor` | Reports metrics about task slot usage per emission period. | -| `org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor` | Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by middleManager node types. | -| `org.apache.druid.server.metrics.ServiceStatusMonitor` | Reports service heartbeat. For overlord/coordinator, the number is leader count. Only supported by overlord/coordinator node types. | +|Name|Description| +|----|-----------| +|`org.apache.druid.client.cache.CacheMonitor`|Emits metrics (to logs) about the segment results cache for Historical and Broker processes. Reports typical cache statistics include hits, misses, rates, and size (bytes and number of entries), as well as timeouts and and errors.| +|`org.apache.druid.java.util.metrics.SysMonitor`|Reports on various system activities and statuses using the [SIGAR library](https://github.com/hyperic/sigar). Requires execute privileges on files in `java.io.tmpdir`. Do not set `java.io.tmpdir` to `noexec` when using `SysMonitor`.| +|`org.apache.druid.java.util.metrics.JvmMonitor`|Reports various JVM-related statistics.| +|`org.apache.druid.java.util.metrics.JvmCpuMonitor`|Reports statistics of CPU consumption by the JVM.| +|`org.apache.druid.java.util.metrics.CpuAcctDeltaMonitor`|Reports consumed CPU as per the cpuacct cgroup.| +|`org.apache.druid.java.util.metrics.JvmThreadsMonitor`|Reports Thread statistics in the JVM, like numbers of total, daemon, started, died threads.| +|`org.apache.druid.java.util.metrics.CgroupCpuMonitor`|Reports CPU shares and quotas as per the `cpu` cgroup.| +|`org.apache.druid.java.util.metrics.CgroupCpuSetMonitor`|Reports CPU core/HT and memory node allocations as per the `cpuset` cgroup.| +|`org.apache.druid.java.util.metrics.CgroupMemoryMonitor`|Reports memory statistic as per the memory cgroup.| +|`org.apache.druid.server.metrics.EventReceiverFirehoseMonitor`|Reports how many events have been queued in the EventReceiverFirehose.| +|`org.apache.druid.server.metrics.HistoricalMetricsMonitor`|Reports statistics on Historical processes. Available only on Historical processes.| +|`org.apache.druid.server.metrics.SegmentStatsMonitor` | **EXPERIMENTAL** Reports statistics about segments on Historical processes. Available only on Historical processes. Not to be used when lazy loading is configured. | +|`org.apache.druid.server.metrics.QueryCountStatsMonitor`|Reports how many queries have been successful/failed/interrupted.| +|`org.apache.druid.server.emitter.HttpEmittingMonitor`|Reports internal metrics of `http` or `parametrized` emitter (see below). Must not be used with another emitter type. See the description of the metrics here: https://github.com/apache/druid/pull/4973.| +|`org.apache.druid.server.metrics.TaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting and also the number of successful/failed tasks per emission period.| +|`org.apache.druid.server.metrics.TaskSlotCountStatsMonitor`|Reports metrics about task slot usage per emission period.| +|`org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by middleManager node types.| +| `org.apache.druid.server.metrics.ServiceStatusMonitor`|Reports service heartbeat. For overlord/coordinator, the number is leader count. Only supported by overlord/coordinator node types.| For example, you might configure monitors on all processes for system and JVM information within `common.runtime.properties` as follows: diff --git a/docs/operations/metrics.md b/docs/operations/metrics.md index 9c075ae06203..59aa1b6be263 100644 --- a/docs/operations/metrics.md +++ b/docs/operations/metrics.md @@ -329,11 +329,11 @@ If `emitBalancingStats` is set to `true` in the Coordinator [dynamic configurati ## General Health -### Overlord/Coordinator +### Service Health -| Metric | Description | Dimensions | Normal Value | -|----------------|---------------------------------------------------------------------------------------------------------------|-----------------|--------------| -| `druid/heartbeat` | Report Overlord/Coordinator service health by looking at leader count. `ServiceStatusMonitor` must be enabled. | `heartbeatType` | 1 | +|Metric|Description|Dimensions|Normal Value| +|------|-----------|----------|------------| +|`druid/heartbeat`| Report service health. For Overlord/Coordinator, the metric is leader count. `ServiceStatusMonitor` must be enabled. |`heartbeatType`|1| ### Historical diff --git a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java index 7630834447ab..4fa013c07ed1 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java @@ -32,19 +32,17 @@ * Monitor service running status. * For Overlord/Coordinator, the metric reported is service leader count. */ -public class ServiceStatusMonitor extends AbstractMonitor { +public class ServiceStatusMonitor extends AbstractMonitor +{ @Named("heartbeat") @Inject(optional = true) Supplier> heartbeatTagsSupplier = null; - @Inject - public ServiceStatusMonitor() { - } - @Override - public boolean doMonitor(ServiceEmitter emitter) { - if (heartbeatTagsSupplier == null) { + public boolean doMonitor(ServiceEmitter emitter) + { + if (heartbeatTagsSupplier == null || heartbeatTagsSupplier.get() == null) { return true; } @@ -56,3 +54,4 @@ public boolean doMonitor(ServiceEmitter emitter) { return true; } } + diff --git a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java index 756164a9df9c..506aa86c4502 100644 --- a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java +++ b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java @@ -57,4 +57,4 @@ public void testLeaderCount() { Assert.assertEquals("druid/heartbeat", emitter.getEvents().get(0).toMap().get("metric")); Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); } -} \ No newline at end of file +} From 4a0c442c7faca41e91eca8b6579de2332f980864 Mon Sep 17 00:00:00 2001 From: YongGang Date: Wed, 21 Jun 2023 14:59:43 -0700 Subject: [PATCH 06/12] move service specific tag to dimension --- .../server/metrics/ServiceStatusMonitor.java | 11 ++++++----- .../metrics/ServiceStatusMonitorTest.java | 17 +++++++++++------ .../org/apache/druid/cli/CliCoordinator.java | 4 ++-- .../java/org/apache/druid/cli/CliOverlord.java | 4 ++-- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java index 4fa013c07ed1..108b2971157b 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java @@ -30,27 +30,28 @@ /** * Monitor service running status. - * For Overlord/Coordinator, the metric reported is service leader count. + * For Overlord/Coordinator, the dimension reported is service leader count. */ public class ServiceStatusMonitor extends AbstractMonitor { @Named("heartbeat") @Inject(optional = true) - Supplier> heartbeatTagsSupplier = null; + Supplier> heartbeatTagsSupplier = null; @Override public boolean doMonitor(ServiceEmitter emitter) { + final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); if (heartbeatTagsSupplier == null || heartbeatTagsSupplier.get() == null) { + emitter.emit(builder.build("druid/heartbeat", 1)); return true; } heartbeatTagsSupplier.get().forEach((k, v) -> { - final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); - builder.setDimension("heartbeatType", k); - emitter.emit(builder.build("druid/heartbeat", v)); + builder.setDimension(k, v); }); + emitter.emit(builder.build("druid/heartbeat", 1)); return true; } } diff --git a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java index 506aa86c4502..27193c82381c 100644 --- a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java +++ b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java @@ -28,32 +28,37 @@ import java.util.HashMap; import java.util.Map; -public class ServiceStatusMonitorTest { +public class ServiceStatusMonitorTest +{ private ServiceStatusMonitor monitor; - private Map heartbeatTags = new HashMap<>(); - private Supplier> heartbeatTagsSupplier = () -> heartbeatTags; + private Map heartbeatTags = new HashMap<>(); + private Supplier> heartbeatTagsSupplier = () -> heartbeatTags; @Before - public void setUp() throws Exception { + public void setUp() throws Exception + { monitor = new ServiceStatusMonitor(); monitor.heartbeatTagsSupplier = heartbeatTagsSupplier; } @Test - public void testLeaderCount() { + public void testLeaderCount() + { heartbeatTags.put("leader", 0); final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); monitor.doMonitor(emitter); Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals(0, emitter.getEvents().get(0).toMap().get("leader")); Assert.assertEquals("druid/heartbeat", emitter.getEvents().get(0).toMap().get("metric")); - Assert.assertEquals(0, emitter.getEvents().get(0).toMap().get("value")); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); heartbeatTags.put("leader", 1); emitter.flush(); monitor.doMonitor(emitter); Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("leader")); Assert.assertEquals("druid/heartbeat", emitter.getEvents().get(0).toMap().get("metric")); Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); } diff --git a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java index 1651eaac3ad3..b49c150450ac 100644 --- a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java +++ b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java @@ -378,9 +378,9 @@ public LoadQueueTaskMaster getLoadQueueTaskMaster( @Provides @LazySingleton @Named("heartbeat") - public Supplier> getHeartbeatSupplier(DruidCoordinator coordinator) { + public Supplier> getHeartbeatSupplier(DruidCoordinator coordinator) { return () -> { - Map heartbeatTags = new HashMap<>(); + Map heartbeatTags = new HashMap<>(); heartbeatTags.put("leader", coordinator.isLeader() ? 1 : 0); return heartbeatTags; diff --git a/services/src/main/java/org/apache/druid/cli/CliOverlord.java b/services/src/main/java/org/apache/druid/cli/CliOverlord.java index 94ff6819a7b4..6f3649db82e3 100644 --- a/services/src/main/java/org/apache/druid/cli/CliOverlord.java +++ b/services/src/main/java/org/apache/druid/cli/CliOverlord.java @@ -362,9 +362,9 @@ public TaskStorageDirTracker getTaskStorageDirTracker(WorkerConfig workerConfig, @Provides @LazySingleton @Named("heartbeat") - public Supplier> getHeartbeatSupplier(TaskMaster taskMaster) { + public Supplier> getHeartbeatSupplier(TaskMaster taskMaster) { return () -> { - Map heartbeatTags = new HashMap<>(); + Map heartbeatTags = new HashMap<>(); heartbeatTags.put("leader", taskMaster.isLeader() ? 1 : 0); return heartbeatTags; From 024685a0e2c69a3f49a16a781726195ba40c3eb8 Mon Sep 17 00:00:00 2001 From: YongGang Date: Wed, 21 Jun 2023 15:05:42 -0700 Subject: [PATCH 07/12] minor refine --- .../druid/server/metrics/ServiceStatusMonitor.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java index 108b2971157b..1270906fa041 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java @@ -43,14 +43,12 @@ public class ServiceStatusMonitor extends AbstractMonitor public boolean doMonitor(ServiceEmitter emitter) { final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); - if (heartbeatTagsSupplier == null || heartbeatTagsSupplier.get() == null) { - emitter.emit(builder.build("druid/heartbeat", 1)); - return true; + if (heartbeatTagsSupplier != null && heartbeatTagsSupplier.get() != null) { + heartbeatTagsSupplier.get().forEach((k, v) -> { + builder.setDimension(k, v); + }); } - heartbeatTagsSupplier.get().forEach((k, v) -> { - builder.setDimension(k, v); - }); emitter.emit(builder.build("druid/heartbeat", 1)); return true; } From 23db0aa751fe47f81eb2e9b755c2372ef54e2e6b Mon Sep 17 00:00:00 2001 From: YongGang Date: Wed, 21 Jun 2023 15:32:39 -0700 Subject: [PATCH 08/12] update doc --- docs/operations/metrics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/operations/metrics.md b/docs/operations/metrics.md index 59aa1b6be263..f2065f3e3607 100644 --- a/docs/operations/metrics.md +++ b/docs/operations/metrics.md @@ -333,7 +333,7 @@ If `emitBalancingStats` is set to `true` in the Coordinator [dynamic configurati |Metric|Description|Dimensions|Normal Value| |------|-----------|----------|------------| -|`druid/heartbeat`| Report service health. For Overlord/Coordinator, the metric is leader count. `ServiceStatusMonitor` must be enabled. |`heartbeatType`|1| +|`druid/heartbeat`| Report service health. For Overlord/Coordinator, the dimension is leader count. `ServiceStatusMonitor` must be enabled. |`heartbeatType`|1| ### Historical From 94e315a8c22e39e977867c3d143464660acf6ced Mon Sep 17 00:00:00 2001 From: YongGang Date: Wed, 21 Jun 2023 16:14:54 -0700 Subject: [PATCH 09/12] reformat code --- .../src/main/java/org/apache/druid/cli/CliCoordinator.java | 3 ++- services/src/main/java/org/apache/druid/cli/CliOverlord.java | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java index b49c150450ac..0e7fc5970551 100644 --- a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java +++ b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java @@ -378,7 +378,8 @@ public LoadQueueTaskMaster getLoadQueueTaskMaster( @Provides @LazySingleton @Named("heartbeat") - public Supplier> getHeartbeatSupplier(DruidCoordinator coordinator) { + public Supplier> getHeartbeatSupplier(DruidCoordinator coordinator) + { return () -> { Map heartbeatTags = new HashMap<>(); heartbeatTags.put("leader", coordinator.isLeader() ? 1 : 0); diff --git a/services/src/main/java/org/apache/druid/cli/CliOverlord.java b/services/src/main/java/org/apache/druid/cli/CliOverlord.java index 6f3649db82e3..4c365c63a905 100644 --- a/services/src/main/java/org/apache/druid/cli/CliOverlord.java +++ b/services/src/main/java/org/apache/druid/cli/CliOverlord.java @@ -362,7 +362,8 @@ public TaskStorageDirTracker getTaskStorageDirTracker(WorkerConfig workerConfig, @Provides @LazySingleton @Named("heartbeat") - public Supplier> getHeartbeatSupplier(TaskMaster taskMaster) { + public Supplier> getHeartbeatSupplier(TaskMaster taskMaster) + { return () -> { Map heartbeatTags = new HashMap<>(); heartbeatTags.put("leader", taskMaster.isLeader() ? 1 : 0); From b1f03a0bcb94879af1b799487364e003dff2a789 Mon Sep 17 00:00:00 2001 From: YongGang Date: Thu, 22 Jun 2023 10:27:30 -0700 Subject: [PATCH 10/12] address comments --- docs/configuration/index.md | 2 +- docs/operations/metrics.md | 2 +- .../server/metrics/ServiceStatusMonitor.java | 5 ++- .../metrics/ServiceStatusMonitorTest.java | 31 ++++++++++++++----- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/docs/configuration/index.md b/docs/configuration/index.md index f92c329ae7d3..0544c7cf984d 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -399,7 +399,7 @@ Metric monitoring is an essential part of Druid operations. The following monit |`org.apache.druid.server.metrics.TaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting and also the number of successful/failed tasks per emission period.| |`org.apache.druid.server.metrics.TaskSlotCountStatsMonitor`|Reports metrics about task slot usage per emission period.| |`org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by middleManager node types.| -| `org.apache.druid.server.metrics.ServiceStatusMonitor`|Reports service heartbeat. For overlord/coordinator, the number is leader count. Only supported by overlord/coordinator node types.| +|`org.apache.druid.server.metrics.ServiceStatusMonitor`|Reports a heartbeat for the service.| For example, you might configure monitors on all processes for system and JVM information within `common.runtime.properties` as follows: diff --git a/docs/operations/metrics.md b/docs/operations/metrics.md index f2065f3e3607..5bc292dfd43c 100644 --- a/docs/operations/metrics.md +++ b/docs/operations/metrics.md @@ -333,7 +333,7 @@ If `emitBalancingStats` is set to `true` in the Coordinator [dynamic configurati |Metric|Description|Dimensions|Normal Value| |------|-----------|----------|------------| -|`druid/heartbeat`| Report service health. For Overlord/Coordinator, the dimension is leader count. `ServiceStatusMonitor` must be enabled. |`heartbeatType`|1| +| `service/heartbeat` | Metric indicating the service is up. `ServiceStatusMonitor` must be enabled. |`leader` on the Overlord and Coordinator.|1| ### Historical diff --git a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java index 1270906fa041..d56bf76ec43c 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java @@ -29,8 +29,7 @@ import java.util.Map; /** - * Monitor service running status. - * For Overlord/Coordinator, the dimension reported is service leader count. + * Reports a heartbeat for the service. */ public class ServiceStatusMonitor extends AbstractMonitor { @@ -49,7 +48,7 @@ public boolean doMonitor(ServiceEmitter emitter) }); } - emitter.emit(builder.build("druid/heartbeat", 1)); + emitter.emit(builder.build("service/heartbeat", 1)); return true; } } diff --git a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java index 27193c82381c..08617b129fdd 100644 --- a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java +++ b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java @@ -32,34 +32,51 @@ public class ServiceStatusMonitorTest { private ServiceStatusMonitor monitor; - private Map heartbeatTags = new HashMap<>(); + private Map heartbeatTags; private Supplier> heartbeatTagsSupplier = () -> heartbeatTags; + private static String HEARTBEAT_METRIC_KEY = "service/heartbeat"; @Before public void setUp() throws Exception { monitor = new ServiceStatusMonitor(); + heartbeatTags = new HashMap<>(); monitor.heartbeatTagsSupplier = heartbeatTagsSupplier; } @Test - public void testLeaderCount() + public void testDefaultHeartbeatReported() { - heartbeatTags.put("leader", 0); final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); monitor.doMonitor(emitter); + Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals(HEARTBEAT_METRIC_KEY, emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); + } + @Test + public void testLeaderTag() + { + heartbeatTags.put("leader", 1); + final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); + monitor.doMonitor(emitter); Assert.assertEquals(1, emitter.getEvents().size()); - Assert.assertEquals(0, emitter.getEvents().get(0).toMap().get("leader")); - Assert.assertEquals("druid/heartbeat", emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("leader")); + Assert.assertEquals(HEARTBEAT_METRIC_KEY, emitter.getEvents().get(0).toMap().get("metric")); Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); + } + @Test + public void testMoreThanOneTag() + { heartbeatTags.put("leader", 1); - emitter.flush(); + heartbeatTags.put("taskRunner", "http"); + final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); monitor.doMonitor(emitter); Assert.assertEquals(1, emitter.getEvents().size()); Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("leader")); - Assert.assertEquals("druid/heartbeat", emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals("http", emitter.getEvents().get(0).toMap().get("taskRunner")); + Assert.assertEquals(HEARTBEAT_METRIC_KEY, emitter.getEvents().get(0).toMap().get("metric")); Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); } } From 3b939c9d6b57af2ea131e8e621f5a52bf3a984d3 Mon Sep 17 00:00:00 2001 From: YongGang Date: Thu, 22 Jun 2023 15:17:07 -0700 Subject: [PATCH 11/12] remove declared exception --- .../apache/druid/server/metrics/ServiceStatusMonitorTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java index 08617b129fdd..88acb6dca269 100644 --- a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java +++ b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java @@ -37,7 +37,7 @@ public class ServiceStatusMonitorTest private static String HEARTBEAT_METRIC_KEY = "service/heartbeat"; @Before - public void setUp() throws Exception + public void setUp() { monitor = new ServiceStatusMonitor(); heartbeatTags = new HashMap<>(); From 95e19e8bbefd8a32d9aff42354c6c634d92c5d2f Mon Sep 17 00:00:00 2001 From: YongGang Date: Sun, 25 Jun 2023 21:39:49 -0700 Subject: [PATCH 12/12] bind HeartbeatSupplier conditionally in Coordinator --- .../org/apache/druid/cli/CliCoordinator.java | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java index 0e7fc5970551..73adc79a4f31 100644 --- a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java +++ b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java @@ -32,7 +32,7 @@ import com.google.inject.Module; import com.google.inject.Provider; import com.google.inject.Provides; -import com.google.inject.name.Named; +import com.google.inject.TypeLiteral; import com.google.inject.name.Names; import com.google.inject.util.Providers; import org.apache.curator.framework.CuratorFramework; @@ -334,6 +334,10 @@ public void configure(Binder binder) binder.bind(TaskStorage.class).toProvider(Providers.of(null)); binder.bind(TaskMaster.class).toProvider(Providers.of(null)); binder.bind(RowIngestionMetersFactory.class).toProvider(Providers.of(null)); + // Bind HeartbeatSupplier only when the service operates independently of Overlord. + binder.bind(new TypeLiteral>>() {}) + .annotatedWith(Names.named("heartbeat")) + .toProvider(HeartbeatSupplier.class); } binder.bind(CoordinatorCustomDutyGroups.class) @@ -374,19 +378,6 @@ public LoadQueueTaskMaster getLoadQueueTaskMaster( zkPaths ); } - - @Provides - @LazySingleton - @Named("heartbeat") - public Supplier> getHeartbeatSupplier(DruidCoordinator coordinator) - { - return () -> { - Map heartbeatTags = new HashMap<>(); - heartbeatTags.put("leader", coordinator.isLeader() ? 1 : 0); - - return heartbeatTags; - }; - } } ); @@ -476,4 +467,26 @@ public CoordinatorCustomDutyGroups get() } } } + + private static class HeartbeatSupplier implements Provider>> + { + private final DruidCoordinator coordinator; + + @Inject + public HeartbeatSupplier(DruidCoordinator coordinator) + { + this.coordinator = coordinator; + } + + @Override + public Supplier> get() + { + return () -> { + Map heartbeatTags = new HashMap<>(); + heartbeatTags.put("leader", coordinator.isLeader() ? 1 : 0); + + return heartbeatTags; + }; + } + } }