Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
6d9279e
Update netDeviceFilterRe to allow enP4p65s0 and enP2p33s0 (#232)
tsndqst Jul 8, 2025
f7a2da8
Add support for monitoring eMMC drives (#233)
tsndqst Jul 8, 2025
a0f10d4
add FoundationDB application type
def Aug 12, 2025
eb43dc0
Merge pull request #235 from coroot/foundationdb_app_type
def Aug 12, 2025
b349fee
add support for Oracle Cloud metadata
def Aug 13, 2025
7937399
Merge pull request #236 from coroot/oracle_cloud_support
def Aug 13, 2025
0099b5f
add support for disabling log monitoring via container ENV variable
def Aug 20, 2025
ee83bcd
Merge pull request #237 from coroot/disable_log_monitoring
def Aug 20, 2025
d29d9ed
cgroup v1: use hierarchical total_* counters for accurate RSS/memory …
def Aug 21, 2025
94fcaf7
Merge pull request #238 from coroot/cgroup_v1_mem
def Aug 21, 2025
4fe3e12
FoundationDB support
def Aug 29, 2025
73630f0
Merge pull request #239 from coroot/fdb_support
def Sep 1, 2025
3888932
ebpf traces: allow disabling traces per container with COROOT_EBPF_TR…
def Sep 1, 2025
c7d5d32
ebpf traces: add sampling support
def Sep 1, 2025
c71a416
Merge pull request #240 from coroot/ebpf_traces_improvements
def Sep 1, 2025
da8b5a4
grpc: get grpc-status from payload if possible
def Sep 11, 2025
2e9d8aa
Merge pull request #243 from coroot/grpc_status
def Sep 11, 2025
aa169a6
add `container_nodejs_event_loop_blocked_time_seconds_total`
def Sep 19, 2025
aa81b30
Merge pull request #245 from coroot/nodejs_event_loop_monitoring
def Sep 19, 2025
611166e
read python stats directly from the ebpf map
def Sep 19, 2025
d7c8349
Merge pull request #246 from coroot/python_ebpf_map
def Sep 19, 2025
3a7cbbd
add per-container Pressure Stall Information (PSI) metrics
def Sep 22, 2025
f331454
Merge pull request #247 from coroot/psi_metrics
def Sep 22, 2025
8813110
fix net interface name filtering
def Oct 3, 2025
8cf468b
Merge pull request #248 from coroot/fix_net_interface_filter
def Oct 3, 2025
ca58191
improve error logs in journald reader
Allsimon Oct 5, 2025
eab45e1
Merge pull request #250 from Allsimon/journald
def Oct 5, 2025
ef0fc3e
journald: add fallback when inotify is unavailable due to system limits
def Oct 15, 2025
c3fb0c6
Merge pull request #252 from coroot/journald_inotify_fallback
def Oct 15, 2025
41caad3
docker: use debian:bullseye instead of golang image to build with old…
def Oct 16, 2025
34373d2
Merge pull request #254 from coroot/build_on_older_debian
def Oct 16, 2025
96c530a
replace uretprobes with uprobes using return offsets to avoid segment…
def Oct 16, 2025
c0c56bc
Merge pull request #257 from coroot/fix_segfaults
def Nov 6, 2025
3e9da1f
feat: add bridge interfaces
clstb Dec 17, 2025
545ebd9
Merge pull request #261 from clstb/main
def Dec 18, 2025
76fdc04
logparser v1.2.1: cap log patterns per container
def Dec 30, 2025
bb2721d
Merge pull request #263 from coroot/cap_log_patterns
def Dec 30, 2025
61b0744
recognize `kube.slice` and `azure.slice` as systemd service
def Jan 5, 2026
5f605f5
Merge pull request #264 from coroot/issue_214
def Jan 5, 2026
71458d7
fix panic in python cmdline parsing
def Jan 5, 2026
28105e5
Merge pull request #265 from coroot/issue_260
def Jan 5, 2026
19ba6a8
Bump github.com/opencontainers/selinux from 1.11.0 to 1.13.0
dependabot[bot] Jan 19, 2026
0e9cf88
close uprobe asynchronously to avoid blocking `processEvent`
def Jan 19, 2026
2d2a0e7
Merge pull request #272 from coroot/issue_266
def Jan 19, 2026
9fa2325
Merge pull request #271 from coroot/dependabot/go_modules/github.com/…
def Jan 19, 2026
06e4499
bump github.com/containerd/containerd from 1.6.38 to 1.7.29
def Jan 19, 2026
24e41d6
Merge pull request #273 from coroot/update_containerd
def Jan 19, 2026
60205de
fetch GCP metadata from host network namespace
def Jan 23, 2026
cb1c95b
Merge pull request #274 from coroot/fix_792
def Jan 23, 2026
a98b1e1
improve systemd service detection and metadata collection
def Jan 29, 2026
5922043
add filtering for systemd system services
def Jan 29, 2026
b7f7aef
Merge pull request #276 from coroot/systemd_services
def Jan 29, 2026
1f36c57
feat: add --disable-gpu-monitoring flag (#280)
dberkerdem Feb 2, 2026
5e4bdf8
Systemd "container" improvements (#278)
apollo13 Feb 2, 2026
9be271f
ignore deleted cgroups
def Feb 5, 2026
b808030
Merge pull request #283 from coroot/ignore_deleted_cgroups
def Feb 5, 2026
253ea2e
refactor systemd D-Bus client: add caching, reconnection, and proper …
def Feb 9, 2026
e718e3c
Merge pull request #284 from coroot/dbus_refactoring
def Feb 9, 2026
5b2cb5f
cgroup: add podruntime.slice to recognized systemd slices
Alegrowin Mar 3, 2026
9e229ea
Merge pull request #287 from Alegrowin/fix/podruntime-slice-cgroup
def Mar 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
FROM golang:1.23.8-bullseye AS builder
RUN apt update && apt install -y libsystemd-dev
FROM debian:bullseye AS builder
# Using debian:bullseye instead of the official golang image because the golang image is based
# on newer OS versions with a newer glibc, which causes compatibility issues.

RUN apt-get update && apt-get install -y \
curl git build-essential pkg-config libsystemd-dev

ARG GO_VERSION=1.24.9
RUN curl -fsSL https://go.dev/dl/go${GO_VERSION}.linux-$(dpkg --print-architecture).tar.gz -o go.tar.gz && \
tar -C /usr/local -xzf go.tar.gz && rm go.tar.gz
ENV PATH="/usr/local/go/bin:${PATH}"

WORKDIR /tmp/src
COPY go.mod .
COPY go.sum .
Expand Down
9 changes: 6 additions & 3 deletions cgroup/cgroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ var (
crioIdRegexp = regexp.MustCompile(`crio-([a-z0-9]{64})`)
containerdIdRegexp = regexp.MustCompile(`cri-containerd[-:]([a-z0-9]{64})`)
lxcIdRegexp = regexp.MustCompile(`/lxc/([^/]+)`)
systemSliceIdRegexp = regexp.MustCompile(`(/(system|runtime|reserved)\.slice/([^/]+))`)
systemSliceIdRegexp = regexp.MustCompile(`(/(system|runtime|reserved|kube|azure|podruntime)\.slice/([^/]+))`)
talosIdRegexp = regexp.MustCompile(`/(system|podruntime)/([^/]+)`)
lxcPayloadRegexp = regexp.MustCompile(`/lxc\.payload\.([^/]+)`)
)
Expand Down Expand Up @@ -162,7 +162,7 @@ func containerByCgroup(cgroupPath string) (ContainerType, string, error) {
switch {
case cgroupPath == "/init":
return ContainerTypeTalosRuntime, "/talos/init", nil
case prefix == "user.slice" || prefix == "init.scope":
case prefix == "user.slice" || prefix == "init.scope" || prefix == "systemd":
return ContainerTypeStandaloneProcess, "", nil
case prefix == "docker" || (prefix == "system.slice" && len(parts) > 1 && strings.HasPrefix(parts[1], "docker-")):
matches := dockerIdRegexp.FindStringSubmatch(cgroupPath)
Expand Down Expand Up @@ -193,7 +193,10 @@ func containerByCgroup(cgroupPath string) (ContainerType, string, error) {
return ContainerTypeUnknown, "", fmt.Errorf("invalid talos runtime cgroup %s", cgroupPath)
}
return ContainerTypeTalosRuntime, path.Join("/talos/", matches[2]), nil
case prefix == "system.slice" || prefix == "runtime.slice" || prefix == "reserved.slice":
case prefix == "system.slice" || prefix == "runtime.slice" || prefix == "reserved.slice" || prefix == "kube.slice" || prefix == "azure.slice" || prefix == "podruntime.slice":
if strings.HasSuffix(cgroupPath, ".scope") {
return ContainerTypeStandaloneProcess, "", nil
}
matches := systemSliceIdRegexp.FindStringSubmatch(cgroupPath)
if matches == nil {
return ContainerTypeUnknown, "", fmt.Errorf("invalid systemd cgroup %s", cgroupPath)
Expand Down
40 changes: 40 additions & 0 deletions cgroup/cgroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,26 @@ func TestContainerByCgroup(t *testing.T) {
as.Equal("/reserved.slice/kubelet.service", id)
as.Nil(err)

typ, id, err = containerByCgroup("/kube.slice/kubelet.service")
as.Equal(typ, ContainerTypeSystemdService)
as.Equal("/kube.slice/kubelet.service", id)
as.Nil(err)

typ, id, err = containerByCgroup("/azure.slice/walinuxagent.service")
as.Equal(typ, ContainerTypeSystemdService)
as.Equal("/azure.slice/walinuxagent.service", id)
as.Nil(err)

typ, id, err = containerByCgroup("/podruntime.slice/containerd.service")
as.Equal(typ, ContainerTypeSystemdService)
as.Equal("/podruntime.slice/containerd.service", id)
as.Nil(err)

typ, id, err = containerByCgroup("/podruntime.slice/kubelet.service")
as.Equal(typ, ContainerTypeSystemdService)
as.Equal("/podruntime.slice/kubelet.service", id)
as.Nil(err)

typ, id, err = containerByCgroup("/system.slice/system-postgresql.slice/postgresql@9.4-main.service")
as.Equal(typ, ContainerTypeSystemdService)
as.Equal("/system.slice/system-postgresql.slice", id)
Expand Down Expand Up @@ -195,4 +215,24 @@ func TestContainerByCgroup(t *testing.T) {
as.Equal(ContainerTypeStandaloneProcess, typ)
as.Equal("", id)
as.Nil(err)

typ, id, err = containerByCgroup("/systemd/system.slice")
as.Equal(ContainerTypeStandaloneProcess, typ)
as.Equal("", id)
as.Nil(err)

typ, id, err = containerByCgroup("/system.slice/cri-containerd-69e8ded3c33c9d5e2b93acd74787b17a8629f74d6707bc5bb9b2e095337d0263.scope")
as.Equal(ContainerTypeStandaloneProcess, typ)
as.Equal("", id)
as.Nil(err)

typ, id, err = containerByCgroup("/system.slice/run-ra2ddf9594bbf4a1986439b594f89eb0f.scope")
as.Equal(ContainerTypeStandaloneProcess, typ)
as.Equal("", id)
as.Nil(err)

typ, id, err = containerByCgroup("/system.slice/docker-ba7b10d15d16e10e3de7a2dcd408a3d971169ae303f46cfad4c5453c6326fee2.scope")
as.Equal(ContainerTypeDocker, typ)
as.Equal("ba7b10d15d16e10e3de7a2dcd408a3d971169ae303f46cfad4c5453c6326fee2", id)
as.Nil(err)
}
6 changes: 3 additions & 3 deletions cgroup/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ type CPUStat struct {
LimitCores float64
}

func (cg Cgroup) CpuStat() *CPUStat {
func (cg *Cgroup) CpuStat() *CPUStat {
cpu, cpuacct := cg.subsystems["cpu"], cg.subsystems["cpuacct"]
if cpu == "" || cpuacct == "" {
st, _ := cg.cpuStatV2()
Expand All @@ -26,7 +26,7 @@ func (cg Cgroup) CpuStat() *CPUStat {
return st
}

func (cg Cgroup) cpuStatV1() (*CPUStat, error) {
func (cg *Cgroup) cpuStatV1() (*CPUStat, error) {
if cg.subsystems["cpu"] == "" || cg.subsystems["cpuacct"] == "" {
return nil, nil
}
Expand Down Expand Up @@ -56,7 +56,7 @@ func (cg Cgroup) cpuStatV1() (*CPUStat, error) {
return res, nil
}

func (cg Cgroup) cpuStatV2() (*CPUStat, error) {
func (cg *Cgroup) cpuStatV2() (*CPUStat, error) {
if cg.subsystems[""] == "" {
return nil, nil
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
some avg10=0.00 avg60=0.00 avg300=0.00 total=465907442
full avg10=0.00 avg60=0.00 avg300=0.00 total=463529433
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
some avg10=0.00 avg60=0.00 avg300=0.05 total=17657662684
full avg10=0.00 avg60=0.00 avg300=0.05 total=17636951020
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
some avg10=0.00 avg60=0.00 avg300=0.00 total=6937313991
full avg10=0.00 avg60=0.00 avg300=0.00 total=6934649214
4 changes: 2 additions & 2 deletions cgroup/memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ func (cg *Cgroup) memoryStatV1() (*MemoryStat, error) {
// mapped_file is accounted only when the memory cgroup is owner of page
// cache.)
return &MemoryStat{
RSS: vars["rss"] + vars["mapped_file"],
Cache: vars["cache"],
RSS: vars["total_rss"] + vars["total_mapped_file"],
Cache: vars["total_cache"],
Limit: limit,
}, nil
}
Expand Down
85 changes: 85 additions & 0 deletions cgroup/psi.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package cgroup

import (
"os"
"path"
"strconv"
"strings"

"github.com/coroot/coroot-node-agent/common"
"k8s.io/klog/v2"
)

// PSIStats holds the cumulative Pressure Stall Information (PSI) stall times
// of a cgroup, in seconds, for the cpu, memory and io controllers. The "Some"
// and "Full" fields carry the totals of the "some" and "full" lines of the
// corresponding <controller>.pressure file.
type PSIStats struct {
	// Totals read from cpu.pressure.
	CPUSecondsSome, CPUSecondsFull float64
	// Totals read from memory.pressure.
	MemorySecondsSome, MemorySecondsFull float64
	// Totals read from io.pressure.
	IOSecondsSome, IOSecondsFull float64
}

// PressureTotals is the parsed content of a single <controller>.pressure
// file: the "total" counters of its "some" and "full" lines, converted from
// microseconds to seconds.
type PressureTotals struct {
	SomeSecondsTotal, FullSecondsTotal float64
}

// PSI reads the cpu, memory and io pressure files of the cgroup, in that
// order, and returns their cumulative stall times in seconds.
//
// It returns nil when the cgroup has no entry under the "" subsystem key
// (the key readPressure resolves against cg2Root), or as soon as any of the
// three pressure files cannot be read or parsed. Errors that
// common.IsNotExist does not recognize are logged as warnings before
// returning.
func (cg *Cgroup) PSI() *PSIStats {
	if cg.subsystems[""] == "" {
		return nil
	}

	res := &PSIStats{}
	// Each controller writes its totals straight into the matching pair of
	// result fields.
	targets := []struct {
		controller string
		some, full *float64
	}{
		{"cpu", &res.CPUSecondsSome, &res.CPUSecondsFull},
		{"memory", &res.MemorySecondsSome, &res.MemorySecondsFull},
		{"io", &res.IOSecondsSome, &res.IOSecondsFull},
	}
	for _, target := range targets {
		totals, err := cg.readPressure(target.controller)
		if err == nil {
			*target.some = totals.SomeSecondsTotal
			*target.full = totals.FullSecondsTotal
			continue
		}
		if !common.IsNotExist(err) {
			klog.Warningln(err)
		}
		// A single unreadable controller invalidates the whole result.
		return nil
	}
	return res
}

// readPressure parses the file <cg2Root>/<cg.subsystems[""]>/<controller>.pressure
// and returns the "total" values of its "some" and "full" lines in seconds.
//
// Lines starting with any other word contribute nothing to the result, and a
// line without a "total=" field leaves the corresponding value at zero. An
// error is returned when the file cannot be read or when a "total=" value is
// not an unsigned decimal integer.
func (cg *Cgroup) readPressure(controller string) (*PressureTotals, error) {
	const totalPrefix = "total="

	file := path.Join(cg2Root, cg.subsystems[""], controller+".pressure")
	raw, err := os.ReadFile(file)
	if err != nil {
		return nil, err
	}

	totals := &PressureTotals{}
	for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
		fields := strings.Fields(line)
		if len(fields) == 0 {
			continue
		}
		// Only the first "total=" field of a line is considered.
		for _, field := range fields[1:] {
			if !strings.HasPrefix(field, totalPrefix) {
				continue
			}
			usec, err := strconv.ParseUint(field[len(totalPrefix):], 10, 64)
			if err != nil {
				return nil, err
			}
			seconds := float64(usec) / 1e6 // microseconds to seconds
			switch fields[0] {
			case "some":
				totals.SomeSecondsTotal = seconds
			case "full":
				totals.FullSecondsTotal = seconds
			}
			break
		}
	}
	return totals, nil
}
27 changes: 27 additions & 0 deletions cgroup/psi_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package cgroup

import (
"path"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestCgroupPSI verifies PSI() against the on-disk fixtures.
//
// The process-400 fixture must yield the totals stored in the fixture
// cpu/memory/io pressure files, converted from microseconds to seconds.
// The process-100 fixture must yield nil (presumably because it has no
// cgroup v2 entry — confirm against the fixture).
func TestCgroupPSI(t *testing.T) {
	cgRoot = "fixtures/cgroup"
	cg2Root = "fixtures/cgroup"

	// Fail fast with a readable message if a fixture cannot be loaded,
	// instead of panicking on a nil *Cgroup inside PSI().
	cg, err := NewFromProcessCgroupFile(path.Join("fixtures/proc/400/cgroup"))
	require.NoError(t, err)
	require.NotNil(t, cg)

	stat := cg.PSI()
	require.NotNil(t, stat)
	assert.Equal(t, float64(465907442)/1e6, stat.CPUSecondsSome)
	assert.Equal(t, float64(463529433)/1e6, stat.CPUSecondsFull)
	assert.Equal(t, float64(6937313991)/1e6, stat.MemorySecondsSome)
	assert.Equal(t, float64(6934649214)/1e6, stat.MemorySecondsFull)
	assert.Equal(t, float64(17657662684)/1e6, stat.IOSecondsSome)
	assert.Equal(t, float64(17636951020)/1e6, stat.IOSecondsFull)

	cg, err = NewFromProcessCgroupFile(path.Join("fixtures/proc/100/cgroup"))
	require.NoError(t, err)
	require.NotNil(t, cg)
	assert.Nil(t, cg.PSI())
}
2 changes: 2 additions & 0 deletions containers/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ func guessApplicationTypeByCmdline(cmdline []byte) string {
return "java"
case bytes.HasSuffix(cmd, []byte("ollama")):
return "ollama"
case bytes.HasSuffix(cmd, []byte("fdbserver")):
return "foundationdb"
case bytes.Contains(cmd, []byte("victoria-metrics")) ||
bytes.Contains(cmd, []byte("vmstorage")) ||
bytes.Contains(cmd, []byte("vminsert")) ||
Expand Down
Loading