Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/configuration/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,9 @@ Metric monitoring is an essential part of Druid operations. The following monito
|`org.apache.druid.java.util.metrics.CgroupCpuSetMonitor`|Reports CPU core/HT and memory node allocations as per the `cpuset` cgroup.|
|`org.apache.druid.java.util.metrics.CgroupDiskMonitor`|Reports disk statistic as per the blkio cgroup.|
|`org.apache.druid.java.util.metrics.CgroupMemoryMonitor`|Reports memory statistic as per the memory cgroup.|
|`org.apache.druid.java.util.metrics.CgroupV2CpuMonitor`| **EXPERIMENTAL** Reports CPU usage from `cpu.stat` file. Only applicable to `cgroupv2`.|
|`org.apache.druid.java.util.metrics.CgroupV2DiskMonitor`| **EXPERIMENTAL** Reports disk usage from `io.stat` file. Only applicable to `cgroupv2`.|
|`org.apache.druid.java.util.metrics.CgroupV2MemoryMonitor`| **EXPERIMENTAL** Reports memory usage from `memory.current` and `memory.max` files. Only applicable to `cgroupv2`.|
|`org.apache.druid.server.metrics.HistoricalMetricsMonitor`|Reports statistics on Historical services. Available only on Historical services.|
|`org.apache.druid.server.metrics.SegmentStatsMonitor` | **EXPERIMENTAL** Reports statistics about segments on Historical services. Available only on Historical services. Not to be used when lazy loading is configured.|
|`org.apache.druid.server.metrics.QueryCountStatsMonitor`|Reports how many queries have been successful/failed/interrupted.|
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,6 @@ public class CgroupCpuMonitor extends FeedDefiningMonitor
{
private static final Logger LOG = new Logger(CgroupCpuMonitor.class);
private static final Long DEFAULT_USER_HZ = 100L;
public static final String TOTAL_USAGE_METRIC = "cgroup/cpu/usage/total/percentage";
public static final String USER_USAGE_METRIC = "cgroup/cpu/usage/user/percentage";
public static final String SYS_USAGE_METRIC = "cgroup/cpu/usage/sys/percentage";
private static final String TOTAL = "total";
private static final String USER = "user";
private static final String SYSTEM = "system";
final CgroupDiscoverer cgroupDiscoverer;
final Map<String, String[]> dimensions;
private Long userHz;
Expand Down Expand Up @@ -111,18 +105,18 @@ public boolean doMonitor(ServiceEmitter emitter)
final Map<String, Long> elapsedJiffies = jiffies.to(
"usage",
ImmutableMap.<String, Long>builder()
.put(USER, cpuSnapshot.getUserJiffies())
.put(SYSTEM, cpuSnapshot.getSystemJiffies())
.put(TOTAL, cpuSnapshot.getTotalJiffies())
.put(CgroupUtil.USER, cpuSnapshot.getUserJiffies())
.put(CgroupUtil.SYSTEM, cpuSnapshot.getSystemJiffies())
.put(CgroupUtil.TOTAL, cpuSnapshot.getTotalJiffies())
.build()
);
if (elapsedJiffies != null) {
double totalUsagePct = 100.0 * elapsedJiffies.get(TOTAL) / userHz / elapsedJiffiesSnapshotSecs;
double sysUsagePct = 100.0 * elapsedJiffies.get(SYSTEM) / userHz / elapsedJiffiesSnapshotSecs;
double userUsagePct = 100.0 * elapsedJiffies.get(USER) / userHz / elapsedJiffiesSnapshotSecs;
emitter.emit(builder.setMetric(TOTAL_USAGE_METRIC, totalUsagePct));
emitter.emit(builder.setMetric(SYS_USAGE_METRIC, sysUsagePct));
emitter.emit(builder.setMetric(USER_USAGE_METRIC, userUsagePct));
double totalUsagePct = 100.0 * elapsedJiffies.get(CgroupUtil.TOTAL) / userHz / elapsedJiffiesSnapshotSecs;
double sysUsagePct = 100.0 * elapsedJiffies.get(CgroupUtil.SYSTEM) / userHz / elapsedJiffiesSnapshotSecs;
double userUsagePct = 100.0 * elapsedJiffies.get(CgroupUtil.USER) / userHz / elapsedJiffiesSnapshotSecs;
emitter.emit(builder.setMetric(CgroupUtil.CPU_TOTAL_USAGE_METRIC, totalUsagePct));
emitter.emit(builder.setMetric(CgroupUtil.CPU_SYS_USAGE_METRIC, sysUsagePct));
emitter.emit(builder.setMetric(CgroupUtil.CPU_USER_USAGE_METRIC, userUsagePct));
}
}
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ public boolean doMonitor(ServiceEmitter emitter)
final Map<String, Long> stats = diff.to(
entry.getKey(),
ImmutableMap.<String, Long>builder()
.put("cgroup/disk/read/bytes", entry.getValue().getReadBytes())
.put("cgroup/disk/read/count", entry.getValue().getReadCount())
.put("cgroup/disk/write/bytes", entry.getValue().getWriteBytes())
.put("cgroup/disk/write/count", entry.getValue().getWriteCount())
.put(CgroupUtil.DISK_READ_BYTES_METRIC, entry.getValue().getReadBytes())
.put(CgroupUtil.DISK_READ_COUNT_METRIC, entry.getValue().getReadCount())
.put(CgroupUtil.DISK_WRITE_BYTES_METRIC, entry.getValue().getWriteBytes())
.put(CgroupUtil.DISK_WRITE_COUNT_METRIC, entry.getValue().getWriteCount())
.build()
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public CgroupMemoryMonitor()
public boolean doMonitor(ServiceEmitter emitter)
{
final Memory memory = new Memory(cgroupDiscoverer);
final Memory.MemoryStat stat = memory.snapshot();
final Memory.MemoryStat stat = memory.snapshot(memoryUsageFile(), memoryLimitFile());
final ServiceMetricEvent.Builder builder = builder();
MonitorUtils.addDimensionsToBuilder(builder, dimensions);
emitter.emit(builder.setMetric("cgroup/memory/usage/bytes", stat.getUsage()));
Expand All @@ -77,4 +77,14 @@ public boolean doMonitor(ServiceEmitter emitter)
});
return true;
}

public String memoryUsageFile()
{
return "memory.usage_in_bytes";
}

public String memoryLimitFile()
{
return "memory.limit_in_bytes";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ public class CgroupUtil
private static final Logger LOG = new Logger(CgroupUtil.class);
public static final String SPACE_MATCH = Pattern.quote(" ");
public static final String COMMA_MATCH = Pattern.quote(",");
public static final String TOTAL = "total";
public static final String USER = "user";
public static final String SYSTEM = "system";
public static final String CPU_TOTAL_USAGE_METRIC = "cgroup/cpu/usage/total/percentage";
public static final String CPU_USER_USAGE_METRIC = "cgroup/cpu/usage/user/percentage";
public static final String CPU_SYS_USAGE_METRIC = "cgroup/cpu/usage/sys/percentage";
public static final String DISK_READ_BYTES_METRIC = "cgroup/disk/read/bytes";
public static final String DISK_READ_COUNT_METRIC = "cgroup/disk/read/count";
public static final String DISK_WRITE_BYTES_METRIC = "cgroup/disk/write/bytes";
public static final String DISK_WRITE_COUNT_METRIC = "cgroup/disk/write/count";


public static long readLongValue(CgroupDiscoverer discoverer, String cgroup, String fileName, long defaultValue)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.java.util.metrics;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.primitives.Longs;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.java.util.emitter.service.ServiceEmitter;
import org.apache.druid.java.util.emitter.service.ServiceMetricEvent;
import org.apache.druid.java.util.metrics.cgroups.CgroupDiscoverer;
import org.apache.druid.java.util.metrics.cgroups.Cpu;
import org.apache.druid.java.util.metrics.cgroups.ProcCgroupV2Discoverer;
import org.apache.druid.java.util.metrics.cgroups.ProcSelfCgroupDiscoverer;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
* Monitor that reports cpu usage stats by reading `cpu.stat` reported by cgroupv2
*/
public class CgroupV2CpuMonitor extends FeedDefiningMonitor
Comment thread
ac9817 marked this conversation as resolved.
{
private static final Logger LOG = new Logger(CgroupV2CpuMonitor.class);
private static final String CPU_STAT_FILE = "cpu.stat";
private static final String SNAPSHOT = "snapshot";
final CgroupDiscoverer cgroupDiscoverer;
final Map<String, String[]> dimensions;
private final KeyedDiff diff = new KeyedDiff();

public CgroupV2CpuMonitor(CgroupDiscoverer cgroupDiscoverer, final Map<String, String[]> dimensions, String feed)
{
super(feed);
this.cgroupDiscoverer = cgroupDiscoverer;
this.dimensions = dimensions;
}

@VisibleForTesting
CgroupV2CpuMonitor(CgroupDiscoverer cgroupDiscoverer)
{
this(cgroupDiscoverer, ImmutableMap.of(), DEFAULT_METRICS_FEED);
}

CgroupV2CpuMonitor()
{
this(new ProcSelfCgroupDiscoverer(ProcCgroupV2Discoverer.class));
}

@Override
public boolean doMonitor(ServiceEmitter emitter)
{
final ServiceMetricEvent.Builder builder = builder();
MonitorUtils.addDimensionsToBuilder(builder, dimensions);
Snapshot snapshot = snapshot();
final Map<String, Long> elapsed = diff.to(
"usage",
ImmutableMap.<String, Long>builder()
.put(CgroupUtil.USER, snapshot.getUserUsec())
.put(CgroupUtil.SYSTEM, snapshot.getSystemUsec())
.put(CgroupUtil.TOTAL, snapshot.getUsageUsec())
.put(SNAPSHOT, ChronoUnit.MICROS.between(Instant.EPOCH, Instant.now()))
.build()
);

if (elapsed != null) {
long elapsedUsecs = elapsed.get(SNAPSHOT);
double totalUsagePct = 100.0 * elapsed.get(CgroupUtil.TOTAL) / elapsedUsecs;
double sysUsagePct = 100.0 * elapsed.get(CgroupUtil.SYSTEM) / elapsedUsecs;
double userUsagePct = 100.0 * elapsed.get(CgroupUtil.USER) / elapsedUsecs;
emitter.emit(builder.setMetric(CgroupUtil.CPU_TOTAL_USAGE_METRIC, totalUsagePct));
emitter.emit(builder.setMetric(CgroupUtil.CPU_SYS_USAGE_METRIC, sysUsagePct));
emitter.emit(builder.setMetric(CgroupUtil.CPU_USER_USAGE_METRIC, userUsagePct));
}
return true;
}

/*
file: cpu.stat

sample content:
usage_usec 2379951538
user_usec 1802023024
system_usec 577928513
nr_periods 1581231
nr_throttled 59
throttled_usec 3095133
*/
public Snapshot snapshot()
{
Map<String, Long> entries = new HashMap<>();
try (final BufferedReader reader = Files.newBufferedReader(
Paths.get(cgroupDiscoverer.discover(Cpu.CGROUP).toString(), CPU_STAT_FILE)
)) {
for (String line = reader.readLine(); line != null; line = reader.readLine()) {
final String[] parts = line.split(Pattern.quote(" "));
if (parts.length != 2) {
// ignore
continue;
}
entries.put(parts[0], Longs.tryParse(parts[1]));
}
}
catch (IOException | RuntimeException ex) {
LOG.error(ex, "Unable to fetch cpu snapshot");
}

return new Snapshot(entries.get("usage_usec"), entries.get("user_usec"), entries.get("system_usec"));
}


public static class Snapshot
{
private final long usageUsec;
private final long userUsec;
private final long systemUsec;

public Snapshot(long usageUsec, long userUsec, long systemUsec)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the constructor accept @Nullable Long objects instead so that we can distinguish between parse exceptions and 0s when reading the file?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We basically skip emitting metrics on parse exceptions and would not end up creating this Snapshot object at all.

{
this.usageUsec = usageUsec;
this.userUsec = userUsec;
this.systemUsec = systemUsec;
}

public long getUsageUsec()
{
return usageUsec;
}

public long getUserUsec()
{
return userUsec;
}

public long getSystemUsec()
{
return systemUsec;
}
}
}
Loading