diff --git a/docs/configuration/index.md b/docs/configuration/index.md index 0976190cf99a..054aea13f100 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -1924,16 +1924,11 @@ You can configure Druid services to emit [metrics](../operations/metrics.md) reg ### Metrics monitors for each service -Metric monitoring is an essential part of Druid operations. Druid includes +Metric monitoring is an essential part of Druid operations. +Monitors can be enabled by configuring the property `druid.monitoring.monitors` in the common configuration file, `common.runtime.properties`. +If a monitor is not supported on a certain service, it will simply be ignored while starting up that service. -:::caution - -The `runtime.properties` file for each service overrides the common configuration file (`common.runtime.properties`). They are not additive. This means that if you add any monitors to a specific service, that service only has the monitors specified in its `runtime.properties` file even if there are additional ones listed in the common file. - -::: - - -The following table lists the monitors that are available and the services you could configure the monitor for: +The following table lists available monitors and the respective services where they are supported: |Name|Description|Service| |----|-----------|-------| diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/EmbeddedKafkaClusterMetricsTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/EmbeddedKafkaClusterMetricsTest.java index 8d10bae6e4c1..320c14a56281 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/EmbeddedKafkaClusterMetricsTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/EmbeddedKafkaClusterMetricsTest.java @@ -119,7 +119,11 @@ public void stop() .addCommonProperty("druid.emitter", "composing") .addCommonProperty("druid.emitter.composing.emitters", "[\"latching\",\"kafka\"]") .addCommonProperty("druid.monitoring.emissionPeriod", "PT0.1s") - .addCommonProperty("druid.monitoring.monitors", "[\"org.apache.druid.java.util.metrics.JvmMonitor\"]") + .addCommonProperty( + "druid.monitoring.monitors", + "[\"org.apache.druid.java.util.metrics.JvmMonitor\"," + + "\"org.apache.druid.server.metrics.TaskCountStatsMonitor\"]" + ) .addResource(kafkaServer) .addServer(coordinator) .addServer(overlord) @@ -208,6 +212,12 @@ public void test_ingestClusterMetrics_withConcurrentCompactionSupervisor_andSkip o -> o.postSupervisor(kafkaSupervisorSpec) ); + // Wait for a task to succeed + overlord.latchableEmitter().waitForEventAggregate( + event -> event.hasMetricName("task/success/count") + .hasDimension(DruidMetrics.DATASOURCE, dataSource), + agg -> agg.hasSumAtLeast(1) + ); // Wait for some segments to be published overlord.latchableEmitter().waitForEvent( event -> event.hasMetricName("segment/txn/success") diff --git a/server/src/main/java/org/apache/druid/client/cache/CacheMonitor.java b/server/src/main/java/org/apache/druid/client/cache/CacheMonitor.java index 2c43d1ea7686..2f9ab2cbf3d5 100644 --- a/server/src/main/java/org/apache/druid/client/cache/CacheMonitor.java +++ b/server/src/main/java/org/apache/druid/client/cache/CacheMonitor.java @@ -20,11 +20,19 @@ package org.apache.druid.client.cache; import com.google.inject.Inject; +import org.apache.druid.discovery.NodeRole; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; import org.apache.druid.java.util.metrics.AbstractMonitor; +@LoadScope(roles = { + NodeRole.BROKER_JSON_NAME, + NodeRole.HISTORICAL_JSON_NAME, + NodeRole.INDEXER_JSON_NAME, + NodeRole.PEON_JSON_NAME +}) public class CacheMonitor extends AbstractMonitor { // package private for tests diff --git a/server/src/main/java/org/apache/druid/server/metrics/GroupByStatsMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/GroupByStatsMonitor.java index d49c19c6f882..10985b4b4d3a 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/GroupByStatsMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/GroupByStatsMonitor.java @@ -21,6 +21,8 @@ import com.google.inject.Inject; import org.apache.druid.collections.BlockingPool; +import org.apache.druid.discovery.NodeRole; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.guice.annotations.Merging; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; @@ -29,6 +31,12 @@ import java.nio.ByteBuffer; +@LoadScope(roles = { + NodeRole.BROKER_JSON_NAME, + NodeRole.HISTORICAL_JSON_NAME, + NodeRole.INDEXER_JSON_NAME, + NodeRole.PEON_JSON_NAME +}) public class GroupByStatsMonitor extends AbstractMonitor { private final GroupByStatsProvider groupByStatsProvider; diff --git a/server/src/main/java/org/apache/druid/server/metrics/HistoricalMetricsMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/HistoricalMetricsMonitor.java index e7d6a74275b1..b26a55ad5a39 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/HistoricalMetricsMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/HistoricalMetricsMonitor.java @@ -23,6 +23,8 @@ import it.unimi.dsi.fastutil.objects.Object2LongMap; import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; import org.apache.druid.client.DruidServerConfig; +import org.apache.druid.discovery.NodeRole; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; import org.apache.druid.java.util.metrics.AbstractMonitor; @@ -33,6 +35,7 @@ import java.util.Map; +@LoadScope(roles = NodeRole.HISTORICAL_JSON_NAME) public class HistoricalMetricsMonitor extends AbstractMonitor { private final DruidServerConfig serverConfig; diff --git a/server/src/main/java/org/apache/druid/server/metrics/MetricsModule.java b/server/src/main/java/org/apache/druid/server/metrics/MetricsModule.java index 39fc2d622b4e..d6a25bea4ddf 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/MetricsModule.java +++ b/server/src/main/java/org/apache/druid/server/metrics/MetricsModule.java @@ -22,7 +22,6 @@ import com.google.common.base.Supplier; import com.google.common.collect.Iterables; import com.google.inject.Binder; -import com.google.inject.Inject; import com.google.inject.Injector; import com.google.inject.Key; import com.google.inject.Module; @@ -34,6 +33,7 @@ import org.apache.druid.guice.JsonConfigProvider; import org.apache.druid.guice.LazySingleton; import org.apache.druid.guice.ManageLifecycle; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.guice.annotations.Self; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.concurrent.Execs; @@ -64,18 +64,14 @@ /** * Sets up the {@link MonitorScheduler} to monitor things on a regular schedule. {@link Monitor}s must be explicitly * bound in order to be loaded. + *

+ * For any service, a monitor is loaded only if the {@link NodeRole} of the service + * is included in the {@link LoadScope} of the monitor. */ public class MetricsModule implements Module { public static final String MONITORING_PROPERTY_PREFIX = "druid.monitoring"; private static final Logger log = new Logger(MetricsModule.class); - private Set nodeRoles; - - @Inject - public void setNodeRoles(@Self Set nodeRoles) - { - this.nodeRoles = nodeRoles; - } public static void register(Binder binder, Class monitorClazz) { @@ -107,6 +103,7 @@ public MonitorScheduler getMonitorScheduler( Supplier config, MonitorsConfig monitorsConfig, Set> monitorSet, + @Self Set nodeRoles, ServiceEmitter emitter, Injector injector ) @@ -117,12 +114,14 @@ public MonitorScheduler getMonitorScheduler( // but by injecting DataSourceTaskIdHolder early this cycle is avoided. injector.getInstance(DataSourceTaskIdHolder.class); for (Class monitorClass : Iterables.concat(monitorsConfig.getMonitors(), monitorSet)) { - monitors.add(injector.getInstance(monitorClass)); + if (shouldLoadMonitor(monitorClass, nodeRoles)) { + monitors.add(injector.getInstance(monitorClass)); + } } if (!monitors.isEmpty()) { log.info( - "Loaded %d monitors: %s", + "Loaded [%d] monitors: %s", monitors.size(), monitors.stream().map(monitor -> monitor.getClass().getName()).collect(Collectors.joining(", ")) ); @@ -218,4 +217,23 @@ public OshiSysMonitor getOshiSysMonitor( return new OshiSysMonitor(dimensions, oshiSysConfig); } } + + /** + * Checks if a monitor needs to be loaded on this service based on its node role. + */ + private boolean shouldLoadMonitor(Class monitorClass, Set nodeRoles) + { + final LoadScope loadScope = monitorClass.getAnnotation(LoadScope.class); + if (loadScope == null) { + // If annotation is not specified, check superclass (if one exists), otherwise load the monitor + Class superClass = monitorClass.getSuperclass(); + return superClass == null || shouldLoadMonitor(superClass, nodeRoles); + } + for (String role : loadScope.roles()) { + if (nodeRoles.contains(NodeRole.fromJsonName(role))) { + return true; + } + } + return false; + } } diff --git a/server/src/main/java/org/apache/druid/server/metrics/QueryCountStatsMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/QueryCountStatsMonitor.java index a9c43d77ee8b..70b68c15ca7d 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/QueryCountStatsMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/QueryCountStatsMonitor.java @@ -22,6 +22,8 @@ import com.google.common.collect.ImmutableMap; import com.google.inject.Inject; import org.apache.druid.collections.BlockingPool; +import org.apache.druid.discovery.NodeRole; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.guice.annotations.Merging; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; @@ -31,6 +33,13 @@ import java.nio.ByteBuffer; import java.util.Map; +@LoadScope(roles = { + NodeRole.BROKER_JSON_NAME, + NodeRole.HISTORICAL_JSON_NAME, + NodeRole.ROUTER_JSON_NAME, + NodeRole.INDEXER_JSON_NAME, + NodeRole.PEON_JSON_NAME +}) public class QueryCountStatsMonitor extends AbstractMonitor { private final KeyedDiff keyedDiff = new KeyedDiff(); diff --git a/server/src/main/java/org/apache/druid/server/metrics/SegmentStatsMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/SegmentStatsMonitor.java index 68b703f8951f..4afc745ae4fe 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/SegmentStatsMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/SegmentStatsMonitor.java @@ -23,6 +23,8 @@ import com.google.inject.Inject; import org.apache.druid.client.DruidServerConfig; +import org.apache.druid.discovery.NodeRole; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceEventBuilder; @@ -39,6 +41,7 @@ * * It keeps track of the average number of rows in a segment and the distribution of segments according to rowCount. */ +@LoadScope(roles = NodeRole.HISTORICAL_JSON_NAME) public class SegmentStatsMonitor extends AbstractMonitor { private final DruidServerConfig serverConfig; diff --git a/server/src/main/java/org/apache/druid/server/metrics/SubqueryCountStatsMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/SubqueryCountStatsMonitor.java index 57da36ac1546..561f53a6ee95 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/SubqueryCountStatsMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/SubqueryCountStatsMonitor.java @@ -21,6 +21,8 @@ import com.google.common.collect.ImmutableMap; import com.google.inject.Inject; +import org.apache.druid.discovery.NodeRole; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; import org.apache.druid.java.util.metrics.AbstractMonitor; @@ -31,6 +33,7 @@ /** * Monitors and emits the metrics corresponding to the subqueries and their materialization. */ +@LoadScope(roles = NodeRole.BROKER_JSON_NAME) public class SubqueryCountStatsMonitor extends AbstractMonitor { diff --git a/server/src/main/java/org/apache/druid/server/metrics/TaskCountStatsMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/TaskCountStatsMonitor.java index aca9fec3e4e3..f6598c0e1159 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/TaskCountStatsMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/TaskCountStatsMonitor.java @@ -20,6 +20,8 @@ package org.apache.druid.server.metrics; import com.google.inject.Inject; +import org.apache.druid.discovery.NodeRole; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; import org.apache.druid.java.util.metrics.AbstractMonitor; @@ -30,6 +32,7 @@ import java.util.Map; +@LoadScope(roles = NodeRole.OVERLORD_JSON_NAME) public class TaskCountStatsMonitor extends AbstractMonitor { private final TaskCountStatsProvider statsProvider; diff --git a/server/src/main/java/org/apache/druid/server/metrics/TaskSlotCountStatsMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/TaskSlotCountStatsMonitor.java index 46227a3ca452..dda5b0d1579d 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/TaskSlotCountStatsMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/TaskSlotCountStatsMonitor.java @@ -20,12 +20,15 @@ package org.apache.druid.server.metrics; import com.google.inject.Inject; +import org.apache.druid.discovery.NodeRole; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; import org.apache.druid.java.util.metrics.AbstractMonitor; import java.util.Map; +@LoadScope(roles = NodeRole.OVERLORD_JSON_NAME) public class TaskSlotCountStatsMonitor extends AbstractMonitor { private final TaskSlotCountStatsProvider statsProvider; diff --git a/server/src/main/java/org/apache/druid/server/metrics/WorkerTaskCountStatsMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/WorkerTaskCountStatsMonitor.java index a568ad85d423..6dfc44e4bd99 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/WorkerTaskCountStatsMonitor.java +++ b/server/src/main/java/org/apache/druid/server/metrics/WorkerTaskCountStatsMonitor.java @@ -22,6 +22,7 @@ import com.google.inject.Inject; import com.google.inject.Injector; import org.apache.druid.discovery.NodeRole; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.guice.annotations.Self; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; @@ -31,6 +32,7 @@ import java.util.Map; import java.util.Set; +@LoadScope(roles = {NodeRole.INDEXER_JSON_NAME, NodeRole.MIDDLE_MANAGER_JSON_NAME}) public class WorkerTaskCountStatsMonitor extends AbstractMonitor { private final WorkerTaskCountStatsProvider statsProvider; diff --git a/server/src/test/java/org/apache/druid/server/metrics/MetricsModuleTest.java b/server/src/test/java/org/apache/druid/server/metrics/MetricsModuleTest.java index f4563eb8e5bb..9337f04c11e6 100644 --- a/server/src/test/java/org/apache/druid/server/metrics/MetricsModuleTest.java +++ b/server/src/test/java/org/apache/druid/server/metrics/MetricsModuleTest.java @@ -34,6 +34,7 @@ import org.apache.druid.guice.JsonConfigProvider; import org.apache.druid.guice.LazySingleton; import org.apache.druid.guice.LifecycleModule; +import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.guice.annotations.Self; import org.apache.druid.initialization.Initialization; import org.apache.druid.initialization.ServerInjectorBuilder; @@ -41,8 +42,10 @@ import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceEventBuilder; +import org.apache.druid.java.util.metrics.AbstractMonitor; import org.apache.druid.java.util.metrics.BasicMonitorScheduler; import org.apache.druid.java.util.metrics.ClockDriftSafeMonitorScheduler; +import org.apache.druid.java.util.metrics.Monitor; import org.apache.druid.java.util.metrics.MonitorScheduler; import org.apache.druid.java.util.metrics.NoopOshiSysMonitor; import org.apache.druid.java.util.metrics.NoopSysMonitor; @@ -62,6 +65,7 @@ import javax.validation.Validation; import javax.validation.Validator; import java.util.Properties; +import java.util.Set; public class MetricsModuleTest { @@ -158,6 +162,55 @@ public void testGetBasicMonitorSchedulerViaConfig() Assert.assertSame(BasicMonitorScheduler.class, monitorScheduler.getClass()); } + @Test + public void test_monitorScheduler_addsMonitor_ifNodeRoleIsInLoadScope() + { + final Properties properties = new Properties(); + properties.setProperty( + "druid.monitoring.monitors", + StringUtils.format("[\"%s\"]", OverlordOnlyMonitor.class.getName()) + ); + + verifyThatMonitorIsLoadedOnlyOn( + OverlordOnlyMonitor.class, + properties, + NodeRole.OVERLORD + ); + } + + @Test + public void test_monitorScheduler_addsMonitor_ifNodeRoleIsInLoadScopeOfSuperClass() + { + final Properties properties = new Properties(); + properties.setProperty( + "druid.monitoring.monitors", + StringUtils.format("[\"%s\"]", OverlordAndCoordinatorMonitor2.class.getName()) + ); + + verifyThatMonitorIsLoadedOnlyOn( + OverlordAndCoordinatorMonitor2.class, + properties, + NodeRole.COORDINATOR, + NodeRole.OVERLORD + ); + } + + @Test + public void test_monitorScheduler_addsMonitor_ifNoLoadScopeIsDefined() + { + final Properties properties = new Properties(); + properties.setProperty( + "druid.monitoring.monitors", + StringUtils.format("[\"%s\"]", AllNodeMonitor.class.getName()) + ); + + verifyThatMonitorIsLoadedOnlyOn( + AllNodeMonitor.class, + properties, + NodeRole.values() + ); + } + @Test public void testGetMonitorSchedulerUnknownSchedulerException() { @@ -259,4 +312,47 @@ private static Injector createInjector(Properties properties, ImmutableSet void verifyThatMonitorIsLoadedOnlyOn( + Class monitorClass, + Properties properties, + NodeRole... supportedRoles + ) + { + final Set supportedRoleSet = Set.of(supportedRoles); + for (NodeRole role : NodeRole.values()) { + final MonitorScheduler monitorScheduler = createInjector(properties, ImmutableSet.of(role)) + .getInstance(MonitorScheduler.class); + Assert.assertEquals( + supportedRoleSet.contains(role), + monitorScheduler.findMonitor(monitorClass).isPresent() + ); + } + } + + public static class AllNodeMonitor extends AbstractMonitor + { + @Override + public boolean doMonitor(ServiceEmitter emitter) + { + return false; + } + } + + @LoadScope(roles = {NodeRole.COORDINATOR_JSON_NAME, NodeRole.OVERLORD_JSON_NAME}) + public static class OverlordAndCoordinatorMonitor extends AllNodeMonitor + { + } + + @LoadScope(roles = NodeRole.OVERLORD_JSON_NAME) + public static class OverlordOnlyMonitor extends OverlordAndCoordinatorMonitor + { + } + + /** + * Uses load scope of super class. + */ + public static class OverlordAndCoordinatorMonitor2 extends OverlordAndCoordinatorMonitor + { + } }