-
Notifications
You must be signed in to change notification settings - Fork 15.1k
KAFKA-8934: Introduce instance-level metrics for streams applications #7416
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,6 +37,7 @@ | |
| import org.apache.kafka.streams.errors.ProcessorStateException; | ||
| import org.apache.kafka.streams.errors.StreamsException; | ||
| import org.apache.kafka.streams.internals.ApiUtils; | ||
| import org.apache.kafka.streams.internals.metrics.ClientMetrics; | ||
| import org.apache.kafka.streams.kstream.KStream; | ||
| import org.apache.kafka.streams.kstream.KTable; | ||
| import org.apache.kafka.streams.kstream.Produced; | ||
|
|
@@ -53,6 +54,7 @@ | |
| import org.apache.kafka.streams.processor.internals.StreamThread; | ||
| import org.apache.kafka.streams.processor.internals.StreamsMetadataState; | ||
| import org.apache.kafka.streams.processor.internals.ThreadStateTransitionValidator; | ||
| import org.apache.kafka.streams.processor.internals.metrics.StreamsMetricsImpl; | ||
| import org.apache.kafka.streams.state.HostInfo; | ||
| import org.apache.kafka.streams.state.QueryableStoreType; | ||
| import org.apache.kafka.streams.state.StreamsMetadata; | ||
|
|
@@ -144,6 +146,7 @@ public class KafkaStreams implements AutoCloseable { | |
| private final ScheduledExecutorService rocksDBMetricsRecordingService; | ||
| private final QueryableStoreProvider queryableStoreProvider; | ||
| private final Admin adminClient; | ||
| private final StreamsMetricsImpl streamsMetrics; | ||
|
|
||
| GlobalStreamThread globalStreamThread; | ||
| private KafkaStreams.StateListener stateListener; | ||
|
|
@@ -672,6 +675,16 @@ private KafkaStreams(final InternalTopologyBuilder internalTopologyBuilder, | |
| Collections.singletonMap(StreamsConfig.CLIENT_ID_CONFIG, clientId)); | ||
| reporters.add(new JmxReporter(JMX_PREFIX)); | ||
| metrics = new Metrics(metricConfig, reporters, time); | ||
| streamsMetrics = | ||
| new StreamsMetricsImpl(metrics, clientId, config.getString(StreamsConfig.BUILT_IN_METRICS_VERSION_CONFIG)); | ||
| streamsMetrics.setRocksDBMetricsRecordingTrigger(rocksDBMetricsRecordingTrigger); | ||
| ClientMetrics.addVersionMetric(streamsMetrics); | ||
| ClientMetrics.addCommitIdMetric(streamsMetrics); | ||
| ClientMetrics.addApplicationIdMetric(streamsMetrics, config.getString(StreamsConfig.APPLICATION_ID_CONFIG)); | ||
| ClientMetrics.addTopologyDescriptionMetric(streamsMetrics, internalTopologyBuilder.describe().toString()); | ||
| ClientMetrics.addStateMetric(streamsMetrics, (metricsConfig, now) -> state); | ||
| log.info("Kafka Streams version: {}", ClientMetrics.version()); | ||
| log.info("Kafka Streams commit ID: {}", ClientMetrics.commitId()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just going out on a limb here... should we also log the topology description here?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought, we already do, but I couldn't find any output in a log file that I have locally. I would be in favour of logging the topology because it helps us during on-call. On the other hand, it may pollute the logs. Anyways, could we postpone this discussion to after the release? |
||
|
|
||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here I add the instance-level metrics. |
||
| // re-write the physical topology according to the config | ||
| internalTopologyBuilder.rewriteTopology(config); | ||
|
|
@@ -712,11 +725,10 @@ private KafkaStreams(final InternalTopologyBuilder internalTopologyBuilder, | |
| clientSupplier.getGlobalConsumer(config.getGlobalConsumerConfigs(clientId)), | ||
| stateDirectory, | ||
| cacheSizePerThread, | ||
| metrics, | ||
| streamsMetrics, | ||
| time, | ||
| globalThreadId, | ||
| delegatingStateRestoreListener, | ||
| rocksDBMetricsRecordingTrigger | ||
| delegatingStateRestoreListener | ||
| ); | ||
| globalThreadState = globalStreamThread.state(); | ||
| } | ||
|
|
@@ -734,14 +746,13 @@ private KafkaStreams(final InternalTopologyBuilder internalTopologyBuilder, | |
| adminClient, | ||
| processId, | ||
| clientId, | ||
| metrics, | ||
| streamsMetrics, | ||
| time, | ||
| streamsMetadataState, | ||
| cacheSizePerThread, | ||
| stateDirectory, | ||
| delegatingStateRestoreListener, | ||
| i + 1); | ||
| threads[i].setRocksDBMetricsRecordingTrigger(rocksDBMetricsRecordingTrigger); | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not needed anymore since the |
||
| threadState.put(threads[i].getId(), threads[i].state()); | ||
| storeProviders.add(new StreamThreadStateStoreProvider(threads[i])); | ||
| } | ||
|
|
@@ -928,6 +939,7 @@ private boolean close(final long timeoutMs) { | |
|
|
||
| adminClient.close(); | ||
|
|
||
| streamsMetrics.removeAllClientLevelMetrics(); | ||
| metrics.close(); | ||
| setState(State.NOT_RUNNING); | ||
| }, "kafka-streams-close-thread"); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
| package org.apache.kafka.streams.internals.metrics; | ||
|
|
||
| import org.apache.kafka.common.metrics.Gauge; | ||
| import org.apache.kafka.common.metrics.Sensor.RecordingLevel; | ||
| import org.apache.kafka.streams.KafkaStreams.State; | ||
| import org.apache.kafka.streams.processor.internals.metrics.StreamsMetricsImpl; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| import java.io.InputStream; | ||
| import java.util.Properties; | ||
|
|
||
| public class ClientMetrics { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems to be more like "InstanceMetrics" than "client" metrics, where "client" usually means a Producer or Consumer. In Streams, we sometimes conflate "client" with "thread" because each StreamThread has exactly one Consumer.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In KIP-444, "client" is used instead of "instance". Also the tag for instance-level is "client-id" whereas for thread it will be "thread-id" with PR #7429. I agree that clients is somehow overloaded, but since Streams is also a client (that uses other clients) it seems consistent to me. |
||
| private ClientMetrics() {} | ||
|
|
||
| private static final Logger log = LoggerFactory.getLogger(ClientMetrics.class); | ||
| private static final String VERSION = "version"; | ||
| private static final String COMMIT_ID = "commit-id"; | ||
| private static final String APPLICATION_ID = "application-id"; | ||
| private static final String TOPOLOGY_DESCRIPTION = "topology-description"; | ||
| private static final String STATE = "state"; | ||
| private static final String VERSION_FROM_FILE; | ||
| private static final String COMMIT_ID_FROM_FILE; | ||
| private static final String DEFAULT_VALUE = "unknown"; | ||
|
|
||
| static { | ||
| final Properties props = new Properties(); | ||
| try (InputStream resourceStream = ClientMetrics.class.getResourceAsStream( | ||
| "/kafka/kafka-streams-version.properties")) { | ||
|
|
||
| props.load(resourceStream); | ||
| } catch (final Exception exception) { | ||
| log.warn("Error while loading kafka-streams-version.properties", exception); | ||
| } | ||
| VERSION_FROM_FILE = props.getProperty("version", DEFAULT_VALUE).trim(); | ||
| COMMIT_ID_FROM_FILE = props.getProperty("commitId", DEFAULT_VALUE).trim(); | ||
| } | ||
|
|
||
| private static final String VERSION_DESCRIPTION = "The version of the Kafka Streams client"; | ||
| private static final String COMMIT_ID_DESCRIPTION = "The version control commit ID of the Kafka Streams client"; | ||
| private static final String APPLICATION_ID_DESCRIPTION = "The application ID of the Kafka Streams client"; | ||
| private static final String TOPOLOGY_DESCRIPTION_DESCRIPTION = | ||
| "The description of the topology executed in the Kafka Streams client"; | ||
| private static final String STATE_DESCRIPTION = "The state of the Kafka Streams client"; | ||
|
|
||
| public static String version() { | ||
| return VERSION_FROM_FILE; | ||
| } | ||
|
|
||
| public static String commitId() { | ||
| return COMMIT_ID_FROM_FILE; | ||
| } | ||
|
|
||
| public static void addVersionMetric(final StreamsMetricsImpl streamsMetrics) { | ||
| streamsMetrics.addClientLevelImmutableMetric( | ||
| VERSION, | ||
| VERSION_DESCRIPTION, | ||
| RecordingLevel.INFO, | ||
| VERSION_FROM_FILE | ||
| ); | ||
| } | ||
|
|
||
| public static void addCommitIdMetric(final StreamsMetricsImpl streamsMetrics) { | ||
| streamsMetrics.addClientLevelImmutableMetric( | ||
| COMMIT_ID, | ||
| COMMIT_ID_DESCRIPTION, | ||
| RecordingLevel.INFO, | ||
| COMMIT_ID_FROM_FILE | ||
| ); | ||
| } | ||
|
|
||
| public static void addApplicationIdMetric(final StreamsMetricsImpl streamsMetrics, final String applicationId) { | ||
| streamsMetrics.addClientLevelImmutableMetric( | ||
| APPLICATION_ID, | ||
| APPLICATION_ID_DESCRIPTION, | ||
| RecordingLevel.INFO, | ||
| applicationId | ||
| ); | ||
| } | ||
|
|
||
| public static void addTopologyDescriptionMetric(final StreamsMetricsImpl streamsMetrics, | ||
| final String topologyDescription) { | ||
| streamsMetrics.addClientLevelImmutableMetric( | ||
| TOPOLOGY_DESCRIPTION, | ||
| TOPOLOGY_DESCRIPTION_DESCRIPTION, | ||
| RecordingLevel.INFO, | ||
| topologyDescription | ||
| ); | ||
| } | ||
|
|
||
| public static void addStateMetric(final StreamsMetricsImpl streamsMetrics, | ||
| final Gauge<State> stateProvider) { | ||
| streamsMetrics.addClientLevelMutableMetric( | ||
| STATE, | ||
| STATE_DESCRIPTION, | ||
| RecordingLevel.INFO, | ||
| stateProvider | ||
| ); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,7 +38,10 @@ private Sensors() {} | |
|
|
||
| public static Sensor lateRecordDropSensor(final InternalProcessorContext context) { | ||
| final StreamsMetricsImpl metrics = context.metrics(); | ||
| final String threadId = Thread.currentThread().getName(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this always right? It would set the threadId as the name of the thread that constructed the sensor, but is this always the StreamThread? Even if it is today, it wouldn't be outrageous to think in the future that another thread might build a task and then pass it to the thread to execute. Such a refactoring would break this logic, but it would be very subtle. In contrast, the old code here explicitly passed the thread name via the context, so it would never be set incorrectly just by changing the way that the code is invoked. Perhaps we could preserve this property, for example by passing the thread name as a method argument.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just checked the code path, passing the thread name via
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @vvcephei, you are right. This is not general enough. On thread-level -- for instance -- sensors are created by the main thread and then used in the stream thread. Here, instead of passing it through
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, sounds fine to me, as long as we're aware of the risk. |
||
|
|
||
| final Sensor sensor = metrics.nodeLevelSensor( | ||
| threadId, | ||
| context.taskId().toString(), | ||
| context.currentNode().name(), | ||
| LATE_RECORD_DROP, | ||
|
|
@@ -47,23 +50,33 @@ public static Sensor lateRecordDropSensor(final InternalProcessorContext context | |
| StreamsMetricsImpl.addInvocationRateAndCountToSensor( | ||
| sensor, | ||
| PROCESSOR_NODE_METRICS_GROUP, | ||
| metrics.tagMap("task-id", context.taskId().toString(), PROCESSOR_NODE_ID_TAG, context.currentNode().name()), | ||
| metrics.tagMap( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After the other PR is merged, this part can be leveraged / rebased, just a reminder :) |
||
| threadId, | ||
| "task-id", | ||
| context.taskId().toString(), | ||
| PROCESSOR_NODE_ID_TAG, | ||
| context.currentNode().name() | ||
| ), | ||
| LATE_RECORD_DROP | ||
| ); | ||
| return sensor; | ||
| } | ||
|
|
||
| public static Sensor recordLatenessSensor(final InternalProcessorContext context) { | ||
| final StreamsMetricsImpl metrics = context.metrics(); | ||
| final String threadId = Thread.currentThread().getName(); | ||
|
|
||
| final Sensor sensor = metrics.taskLevelSensor( | ||
| threadId, | ||
| context.taskId().toString(), | ||
| "record-lateness", | ||
| Sensor.RecordingLevel.DEBUG | ||
| ); | ||
|
|
||
| final Map<String, String> tags = metrics.tagMap( | ||
| "task-id", context.taskId().toString() | ||
| threadId, | ||
| "task-id", | ||
| context.taskId().toString() | ||
| ); | ||
| sensor.add( | ||
| new MetricName( | ||
|
|
@@ -86,17 +99,22 @@ public static Sensor recordLatenessSensor(final InternalProcessorContext context | |
|
|
||
| public static Sensor suppressionEmitSensor(final InternalProcessorContext context) { | ||
| final StreamsMetricsImpl metrics = context.metrics(); | ||
| final String threadId = Thread.currentThread().getName(); | ||
|
|
||
| final Sensor sensor = metrics.nodeLevelSensor( | ||
| threadId, | ||
| context.taskId().toString(), | ||
| context.currentNode().name(), | ||
| "suppression-emit", | ||
| Sensor.RecordingLevel.DEBUG | ||
| ); | ||
|
|
||
| final Map<String, String> tags = metrics.tagMap( | ||
| "task-id", context.taskId().toString(), | ||
| PROCESSOR_NODE_ID_TAG, context.currentNode().name() | ||
| threadId, | ||
| "task-id", | ||
| context.taskId().toString(), | ||
| PROCESSOR_NODE_ID_TAG, | ||
| context.currentNode().name() | ||
| ); | ||
|
|
||
| sensor.add( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
StreamsMetricsImplis now created at client level and not on thread level anymore.