diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..3ee9c8d --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,23 @@ +name: Test + +on: + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: stable + + - name: Run unit tests + run: make test + + - name: Run integration tests + run: make test-it diff --git a/app/services/metrics/metrics.go b/app/services/metrics/metrics.go index 287f2c6..1d68d7b 100644 --- a/app/services/metrics/metrics.go +++ b/app/services/metrics/metrics.go @@ -14,6 +14,7 @@ import ( "hostlink/internal/crypto" "hostlink/internal/networkmetrics" "hostlink/internal/pgmetrics" + "hostlink/internal/storagemetrics" "hostlink/internal/sysmetrics" ) @@ -26,13 +27,14 @@ type Pusher interface { } type metricspusher struct { - apiserver apiserver.MetricsOperations - agentstate agentstate.Operations - metricscollector pgmetrics.Collector - syscollector sysmetrics.Collector - netcollector networkmetrics.Collector - crypto crypto.Service - privateKeyPath string + apiserver apiserver.MetricsOperations + agentstate agentstate.Operations + metricscollector pgmetrics.Collector + syscollector sysmetrics.Collector + netcollector networkmetrics.Collector + storagecollector storagemetrics.Collector + crypto crypto.Service + privateKeyPath string } func NewWithConf() (*metricspusher, error) { @@ -51,6 +53,7 @@ func NewWithConf() (*metricspusher, error) { metricscollector: pgmetrics.New(), syscollector: sysmetrics.New(), netcollector: networkmetrics.New(), + storagecollector: storagemetrics.New(), crypto: crypto.NewService(), privateKeyPath: appconf.AgentPrivateKeyPath(), }, nil @@ -67,6 +70,7 @@ func NewWithDependencies( pgcollector pgmetrics.Collector, syscollector sysmetrics.Collector, netcollector networkmetrics.Collector, + storagecollector 
storagemetrics.Collector, crypto crypto.Service, privateKeyPath string, ) *metricspusher { @@ -76,6 +80,7 @@ func NewWithDependencies( metricscollector: pgcollector, syscollector: syscollector, netcollector: netcollector, + storagecollector: storagecollector, crypto: crypto, privateKeyPath: privateKeyPath, } @@ -157,6 +162,24 @@ func (mp *metricspusher) Push(cred credential.Credential) error { }) } + storageMetrics, err := mp.storagecollector.Collect(ctx) + if err != nil { + collectionErrors = append(collectionErrors, fmt.Errorf("storage metrics: %w", err)) + } else { + for _, sm := range storageMetrics { + metricSets = append(metricSets, domainmetrics.MetricSet{ + Type: domainmetrics.MetricTypeStorage, + Attributes: map[string]any{ + "mount_point": sm.Attributes.MountPoint, + "device": sm.Attributes.Device, + "filesystem_type": sm.Attributes.FilesystemType, + "is_read_only": sm.Attributes.IsReadOnly, + }, + Metrics: sm.Metrics, + }) + } + } + if len(metricSets) == 0 { return fmt.Errorf("all metrics collection failed: %v", collectionErrors) } diff --git a/app/services/metrics/metrics_test.go b/app/services/metrics/metrics_test.go index 8babdce..5f797ee 100644 --- a/app/services/metrics/metrics_test.go +++ b/app/services/metrics/metrics_test.go @@ -9,6 +9,7 @@ import ( "hostlink/domain/credential" domainmetrics "hostlink/domain/metrics" + "hostlink/internal/storagemetrics" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -90,6 +91,18 @@ func (m *MockNetCollector) Collect(ctx context.Context) (domainmetrics.NetworkMe return args.Get(0).(domainmetrics.NetworkMetrics), args.Error(1) } +type MockStorageCollector struct { + mock.Mock +} + +func (m *MockStorageCollector) Collect(ctx context.Context) ([]storagemetrics.StorageMetricSet, error) { + args := m.Called(ctx) + if args.Get(0) == nil { + return nil, args.Error(1) + } + return args.Get(0).([]storagemetrics.StorageMetricSet), args.Error(1) +} + type MockCrypto struct { mock.Mock } @@ -169,22 
+182,24 @@ func (m *MockCrypto) DecryptWithPrivateKey(ciphertextBase64 string, privateKey * // Test helpers type testMocks struct { - apiserver *MockAPIServer - agentstate *MockAgentState - collector *MockCollector - syscollector *MockSysCollector - netcollector *MockNetCollector - crypto *MockCrypto + apiserver *MockAPIServer + agentstate *MockAgentState + collector *MockCollector + syscollector *MockSysCollector + netcollector *MockNetCollector + storagecollector *MockStorageCollector + crypto *MockCrypto } func setupTestMetricsPusher() (*metricspusher, *testMocks) { mocks := &testMocks{ - apiserver: new(MockAPIServer), - agentstate: new(MockAgentState), - collector: new(MockCollector), - syscollector: new(MockSysCollector), - netcollector: new(MockNetCollector), - crypto: new(MockCrypto), + apiserver: new(MockAPIServer), + agentstate: new(MockAgentState), + collector: new(MockCollector), + syscollector: new(MockSysCollector), + netcollector: new(MockNetCollector), + storagecollector: new(MockStorageCollector), + crypto: new(MockCrypto), } mp := NewWithDependencies( @@ -193,6 +208,7 @@ func setupTestMetricsPusher() (*metricspusher, *testMocks) { mocks.collector, mocks.syscollector, mocks.netcollector, + mocks.storagecollector, mocks.crypto, "/test/key/path", ) @@ -441,11 +457,13 @@ func TestPush_SystemMetricsFailure_StillPushesDbMetrics(t *testing.T) { mocks.syscollector.On("Collect", mock.Anything). Return(domainmetrics.SystemMetrics{}, errors.New("collection failed")) setupNetCollectorMocks(mocks.netcollector) + setupStorageCollectorMocks(mocks.storagecollector) mocks.collector.On("Collect", testCred). 
Return(domainmetrics.PostgreSQLDatabaseMetrics{ConnectionsTotal: 5}, nil) mocks.apiserver.On("PushMetrics", mock.Anything, mock.MatchedBy(func(p domainmetrics.MetricPayload) bool { hasNetwork := false hasDb := false + hasStorage := false for _, ms := range p.MetricSets { if ms.Type == domainmetrics.MetricTypeNetwork { hasNetwork = true @@ -453,8 +471,11 @@ func TestPush_SystemMetricsFailure_StillPushesDbMetrics(t *testing.T) { if ms.Type == domainmetrics.MetricTypePostgreSQLDatabase { hasDb = true } + if ms.Type == domainmetrics.MetricTypeStorage { + hasStorage = true + } } - return len(p.MetricSets) == 2 && hasNetwork && hasDb + return hasNetwork && hasDb && hasStorage })).Return(nil) err := mp.Push(testCred) @@ -473,11 +494,13 @@ func TestPush_DatabaseMetricsFailure_StillPushesSystemMetrics(t *testing.T) { mocks.agentstate.On("GetAgentID").Return("agent-123") setupSysCollectorMocks(mocks.syscollector) setupNetCollectorMocks(mocks.netcollector) + setupStorageCollectorMocks(mocks.storagecollector) mocks.collector.On("Collect", testCred). Return(domainmetrics.PostgreSQLDatabaseMetrics{}, collectErr) mocks.apiserver.On("PushMetrics", mock.Anything, mock.MatchedBy(func(p domainmetrics.MetricPayload) bool { hasSys := false hasNetwork := false + hasStorage := false for _, ms := range p.MetricSets { if ms.Type == domainmetrics.MetricTypeSystem { hasSys = true @@ -485,8 +508,11 @@ func TestPush_DatabaseMetricsFailure_StillPushesSystemMetrics(t *testing.T) { if ms.Type == domainmetrics.MetricTypeNetwork { hasNetwork = true } + if ms.Type == domainmetrics.MetricTypeStorage { + hasStorage = true + } } - return len(p.MetricSets) == 2 && hasSys && hasNetwork + return hasSys && hasNetwork && hasStorage })).Return(nil) err := mp.Push(testCred) @@ -506,6 +532,8 @@ func TestPush_AllCollectionsFail(t *testing.T) { Return(domainmetrics.SystemMetrics{}, errors.New("collection failed")) mocks.netcollector.On("Collect", mock.Anything). 
Return(domainmetrics.NetworkMetrics{}, errors.New("network failed")) + mocks.storagecollector.On("Collect", mock.Anything). + Return(nil, errors.New("storage failed")) mocks.collector.On("Collect", testCred). Return(domainmetrics.PostgreSQLDatabaseMetrics{}, errors.New("connection refused")) @@ -524,6 +552,7 @@ func TestPush_APIServerPushFailure(t *testing.T) { mocks.agentstate.On("GetAgentID").Return("agent-123") setupSysCollectorMocks(mocks.syscollector) setupNetCollectorMocks(mocks.netcollector) + setupStorageCollectorMocks(mocks.storagecollector) mocks.collector.On("Collect", testCred). Return(domainmetrics.PostgreSQLDatabaseMetrics{ConnectionsTotal: 5}, nil) mocks.apiserver.On("PushMetrics", mock.Anything, mock.Anything). @@ -547,6 +576,7 @@ func TestPush_Success_ValidatesPayloadSchema(t *testing.T) { mocks.agentstate.On("GetAgentID").Return("agent-123") setupSysCollectorMocks(mocks.syscollector) setupNetCollectorMocks(mocks.netcollector) + setupStorageCollectorMocks(mocks.storagecollector) mocks.collector.On("Collect", testCred). Return(domainmetrics.PostgreSQLDatabaseMetrics{ ConnectionsTotal: 10, @@ -571,7 +601,7 @@ func TestPush_Success_ValidatesPayloadSchema(t *testing.T) { if p.Resource.HostName == "" { return false } - if len(p.MetricSets) != 3 { + if len(p.MetricSets) != 4 { return false } @@ -663,6 +693,7 @@ func TestPush_ContextPropagation(t *testing.T) { mocks.agentstate.On("GetAgentID").Return("agent-123") setupSysCollectorMocks(mocks.syscollector) setupNetCollectorMocks(mocks.netcollector) + setupStorageCollectorMocks(mocks.storagecollector) mocks.collector.On("Collect", testCred). 
Return(domainmetrics.PostgreSQLDatabaseMetrics{}, nil) mocks.apiserver.On("PushMetrics", mock.MatchedBy(func(ctx context.Context) bool { @@ -705,6 +736,7 @@ func TestPush_CredentialPassedCorrectly(t *testing.T) { mocks.agentstate.On("GetAgentID").Return("agent-456") setupSysCollectorMocks(mocks.syscollector) setupNetCollectorMocks(mocks.netcollector) + setupStorageCollectorMocks(mocks.storagecollector) mocks.collector.On("Collect", mock.MatchedBy(func(c credential.Credential) bool { return c.Host == testCred.Host && c.Port == testCred.Port && @@ -739,3 +771,173 @@ func setupNetCollectorMocks(collector *MockNetCollector) { SentBytesPerSec: 500.0, }, nil) } + +func setupStorageCollectorMocks(collector *MockStorageCollector) { + collector.On("Collect", mock.Anything).Return([]storagemetrics.StorageMetricSet{ + { + Attributes: domainmetrics.StorageAttributes{ + MountPoint: "/", + Device: "/dev/sda1", + FilesystemType: "ext4", + IsReadOnly: false, + }, + Metrics: domainmetrics.StorageMetrics{ + DiskTotalBytes: 100000000000, + DiskUsedBytes: 60000000000, + DiskFreeBytes: 40000000000, + DiskUsedPercent: 60.0, + DiskFreePercent: 40.0, + TotalUtilizationPercent: 25.0, + }, + }, + }, nil) +} + +// Verifies storage metrics are included in the payload +func TestPush_IncludesStorageMetrics(t *testing.T) { + mp, mocks := setupTestMetricsPusher() + testCred := credential.Credential{DataDirectory: "/data"} + + mocks.agentstate.On("GetAgentID").Return("agent-123") + setupSysCollectorMocks(mocks.syscollector) + setupNetCollectorMocks(mocks.netcollector) + setupStorageCollectorMocks(mocks.storagecollector) + mocks.collector.On("Collect", testCred). 
+ Return(domainmetrics.PostgreSQLDatabaseMetrics{}, nil) + mocks.apiserver.On("PushMetrics", mock.Anything, mock.MatchedBy(func(p domainmetrics.MetricPayload) bool { + hasStorage := false + for _, ms := range p.MetricSets { + if ms.Type == domainmetrics.MetricTypeStorage { + hasStorage = true + } + } + return hasStorage + })).Return(nil) + + err := mp.Push(testCred) + + assert.NoError(t, err) + mocks.storagecollector.AssertExpectations(t) + mocks.apiserver.AssertExpectations(t) +} + +// Verifies when storage collection fails, other metrics still pushed +func TestPush_StorageMetricsFailure_StillPushesOtherMetrics(t *testing.T) { + mp, mocks := setupTestMetricsPusher() + testCred := credential.Credential{DataDirectory: "/data"} + + mocks.agentstate.On("GetAgentID").Return("agent-123") + setupSysCollectorMocks(mocks.syscollector) + setupNetCollectorMocks(mocks.netcollector) + mocks.storagecollector.On("Collect", mock.Anything). + Return(nil, errors.New("storage collection failed")) + mocks.collector.On("Collect", testCred). 
+ Return(domainmetrics.PostgreSQLDatabaseMetrics{}, nil) + mocks.apiserver.On("PushMetrics", mock.Anything, mock.MatchedBy(func(p domainmetrics.MetricPayload) bool { + hasSys := false + hasNet := false + hasDb := false + for _, ms := range p.MetricSets { + if ms.Type == domainmetrics.MetricTypeSystem { + hasSys = true + } + if ms.Type == domainmetrics.MetricTypeNetwork { + hasNet = true + } + if ms.Type == domainmetrics.MetricTypePostgreSQLDatabase { + hasDb = true + } + } + return hasSys && hasNet && hasDb + })).Return(nil) + + err := mp.Push(testCred) + + assert.NoError(t, err) + mocks.apiserver.AssertExpectations(t) +} + +// Verifies each mount becomes a separate MetricSet +func TestPush_StorageMetricsMultipleMounts(t *testing.T) { + mp, mocks := setupTestMetricsPusher() + testCred := credential.Credential{DataDirectory: "/data"} + + mocks.agentstate.On("GetAgentID").Return("agent-123") + setupSysCollectorMocks(mocks.syscollector) + setupNetCollectorMocks(mocks.netcollector) + mocks.storagecollector.On("Collect", mock.Anything).Return([]storagemetrics.StorageMetricSet{ + { + Attributes: domainmetrics.StorageAttributes{MountPoint: "/", Device: "/dev/sda1"}, + Metrics: domainmetrics.StorageMetrics{DiskUsedPercent: 50.0}, + }, + { + Attributes: domainmetrics.StorageAttributes{MountPoint: "/home", Device: "/dev/sda2"}, + Metrics: domainmetrics.StorageMetrics{DiskUsedPercent: 75.0}, + }, + }, nil) + mocks.collector.On("Collect", testCred). 
+ Return(domainmetrics.PostgreSQLDatabaseMetrics{}, nil) + mocks.apiserver.On("PushMetrics", mock.Anything, mock.MatchedBy(func(p domainmetrics.MetricPayload) bool { + storageCount := 0 + for _, ms := range p.MetricSets { + if ms.Type == domainmetrics.MetricTypeStorage { + storageCount++ + } + } + return storageCount == 2 + })).Return(nil) + + err := mp.Push(testCred) + + assert.NoError(t, err) + mocks.apiserver.AssertExpectations(t) +} + +// Verifies attributes are set correctly +func TestPush_StorageMetricsWithAttributes(t *testing.T) { + mp, mocks := setupTestMetricsPusher() + testCred := credential.Credential{DataDirectory: "/data"} + + mocks.agentstate.On("GetAgentID").Return("agent-123") + setupSysCollectorMocks(mocks.syscollector) + setupNetCollectorMocks(mocks.netcollector) + mocks.storagecollector.On("Collect", mock.Anything).Return([]storagemetrics.StorageMetricSet{ + { + Attributes: domainmetrics.StorageAttributes{ + MountPoint: "/data", + Device: "/dev/nvme0n1p1", + FilesystemType: "xfs", + IsReadOnly: true, + }, + Metrics: domainmetrics.StorageMetrics{DiskUsedPercent: 80.0}, + }, + }, nil) + mocks.collector.On("Collect", testCred). 
+ Return(domainmetrics.PostgreSQLDatabaseMetrics{}, nil) + mocks.apiserver.On("PushMetrics", mock.Anything, mock.MatchedBy(func(p domainmetrics.MetricPayload) bool { + for _, ms := range p.MetricSets { + if ms.Type == domainmetrics.MetricTypeStorage { + attrs := ms.Attributes + if attrs["mount_point"] != "/data" { + return false + } + if attrs["device"] != "/dev/nvme0n1p1" { + return false + } + if attrs["filesystem_type"] != "xfs" { + return false + } + if attrs["is_read_only"] != true { + return false + } + return true + } + } + return false + })).Return(nil) + + err := mp.Push(testCred) + + assert.NoError(t, err) + mocks.apiserver.AssertExpectations(t) +} diff --git a/docs/spec-storage-metrics.md b/docs/spec-storage-metrics.md new file mode 100644 index 0000000..8cfd4cf --- /dev/null +++ b/docs/spec-storage-metrics.md @@ -0,0 +1,1002 @@ +# Storage Metrics Collector - Technical Specification + +## Overview + +The storage metrics collector gathers disk and filesystem metrics from Linux systems and sends them to an API endpoint. Each mount point generates a separate metric set entry. 
+ +## Data Sources + +### Primary Sources + +| Source | Purpose | +|--------|---------| +| `/proc/self/mountinfo` | Mount point enumeration (preferred) | +| `/proc/self/mounts` | Fallback mount enumeration | +| `/etc/mtab` | Secondary fallback mount enumeration | +| `/proc/diskstats` | I/O statistics (utilization, throughput, IOPS) | +| `statfs()` syscall | Space and inode metrics | +| `/proc/partitions` | Device major:minor resolution | + +### Mount Info File Formats + +**`/proc/self/mountinfo` format:** +``` +36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue +``` +- Field 5 (index 4): mount point (`/mnt2`) +- After `-` separator: filesystem type (`ext3`), device (`/dev/root`) +- Mount options in field 6 + +**`/proc/self/mounts` and `/etc/mtab` format:** +``` +/dev/sda1 / ext4 rw,relatime 0 0 +``` +- Field 1 (index 0): device +- Field 2 (index 1): mount point +- Field 3 (index 2): filesystem type +- Field 4 (index 3): mount options + +### `/proc/diskstats` Format + +``` + 8 0 sda 12345 1234 567890 12345 ... 
+``` + +| Field | Index | Description | Unit | +|-------|-------|-------------|------| +| 1 | 0 | Major number | - | +| 2 | 1 | Minor number | - | +| 3 | 2 | Device name | string | +| 4 | 3 | Reads completed | count | +| 5 | 4 | Reads merged | count | +| 6 | 5 | Sectors read | sectors | +| 7 | 6 | Read time | ms | +| 8 | 7 | Writes completed | count | +| 9 | 8 | Writes merged | count | +| 10 | 9 | Sectors written | sectors | +| 11 | 10 | Write time | ms | +| 12 | 11 | I/Os in progress | count | +| 13 | 12 | I/O time | ms | +| 14 | 13 | Weighted I/O time | ms | + +--- + +## Field Definitions + +### Static Labels (per mount point) + +| Field | Type | Source | Extraction Method | +|-------|------|--------|-------------------| +| `mount_point` | string | mountinfo/mounts | Field index 4 (mountinfo) or 1 (mounts) | +| `device` | string | mountinfo/mounts | After `-` separator (mountinfo) or field 0 (mounts) | +| `filesystem_type` | string | mountinfo/mounts | After device in mountinfo, or field 2 in mounts | +| `is_read_only` | bool | mount options | `true` if "ro" present in options, else `false` | + +**Supported Filesystem Types:** +Only collect metrics for: `xfs`, `btrfs`, `ext`, `ext2`, `ext3`, `ext4`, `hfs`, `vxfs`, `reiserfs` + +Skip mount points with unsupported filesystem types. 
+ +**Skipped Filesystem Types:** +- Virtual filesystems: `proc`, `sysfs`, `devtmpfs`, `tmpfs`, `devpts`, `cgroup`, `cgroup2`, `securityfs`, `debugfs`, `tracefs`, `configfs`, `fusectl`, `mqueue`, `hugetlbfs`, `pstore`, `bpf` +- Container filesystems: `overlay`, `overlayfs`, `aufs`, `shiftfs` +- Network filesystems (if not explicitly enabled): `nfs`, `nfs4`, `cifs`, `smb`, `fuse.sshfs` + +### Space Metrics (from statfs syscall) + +| Field | Type | Formula | +|-------|------|---------| +| `disk_total_bytes` | float64 | `f_blocks * f_bsize` | +| `disk_free_bytes` | float64 | `f_bavail * f_bsize` (available to non-root) | +| `disk_used_bytes` | float64 | `(f_blocks - f_bfree) * f_bsize` | +| `disk_used_percent` | float64 | `disk_used_bytes / (disk_used_bytes + disk_free_bytes) * 100` | +| `disk_free_percent` | float64 | `100 - disk_used_percent` | + +**Note:** `disk_used_percent` uses `disk_free_bytes` (available space) not total free space. This excludes reserved blocks and accurately reflects what users can actually use. 
+ +### Inode Metrics (from statfs syscall) + +| Field | Type | Formula | +|-------|------|---------| +| `inodes_total` | uint64 | `f_files` | +| `inodes_free` | uint64 | `f_ffree` | +| `inodes_used` | uint64 | `f_files - f_ffree` | +| `inodes_used_percent` | float64 | `inodes_used / inodes_total * 100` | + +### I/O Utilization Metrics (from /proc/diskstats) + +| Field | Type | Formula | +|-------|------|---------| +| `total_utilization_percent` | float64 | `(current_io_time - previous_io_time) / elapsed_ms * 100` (capped at 100%) | +| `read_utilization_percent` | float64 | `total_utilization_percent * (read_time_delta / (read_time_delta + write_time_delta))` | +| `write_utilization_percent` | float64 | `total_utilization_percent * (write_time_delta / (read_time_delta + write_time_delta))` | + +Where: +- `io_time`: diskstats field 13 (index 12) +- `read_time`: diskstats field 7 (index 6) +- `write_time`: diskstats field 11 (index 10) +- `elapsed_ms`: time since last collection in milliseconds + +### Throughput Metrics (from /proc/diskstats) + +| Field | Type | Formula | +|-------|------|---------| +| `read_bytes_per_second` | float64 | `(current_sectors_read - previous_sectors_read) * 512 / elapsed_seconds` | +| `write_bytes_per_second` | float64 | `(current_sectors_written - previous_sectors_written) * 512 / elapsed_seconds` | +| `read_write_bytes_per_second` | float64 | `read_bytes_per_second + write_bytes_per_second` | + +Where: +- `sectors_read`: diskstats field 6 (index 5) +- `sectors_written`: diskstats field 10 (index 9) +- Sector size: 512 bytes (Linux standard) + +### IOPS Metrics (from /proc/diskstats) + +| Field | Type | Formula | +|-------|------|---------| +| `read_io_per_second` | float64 | `(current_reads_completed - previous_reads_completed) / elapsed_seconds` | +| `write_io_per_second` | float64 | `(current_writes_completed - previous_writes_completed) / elapsed_seconds` | + +Where: +- `reads_completed`: diskstats field 4 (index 3) +- 
`writes_completed`: diskstats field 8 (index 7) + +--- + +## Device Mapping Rules + +### Device Resolution Strategy + +| Device Type | Mount File Pattern | Diskstats Key | Resolution Method | +|-------------|-------------------|---------------|-------------------| +| Regular | `/dev/sda1` | `sda1` | Strip `/dev/` prefix | +| LVM | `/dev/mapper/vg-lv` | `dm-X` | Resolve via major:minor from mountinfo | +| Root pseudo | `/dev/root` | varies | Resolve via `/proc/partitions` using major:minor | + +### LVM Device Resolution + +1. Parse major:minor from mountinfo (field 3, format `major:minor`) +2. Read `/proc/diskstats` to find matching `dm-X` device +3. Use the `dm-X` name for diskstats lookups + +### /dev/root Resolution + +1. Get major:minor for `/dev/root` from mountinfo +2. Read `/proc/partitions` to find device name matching major:minor +3. Use resolved device name for diskstats lookups + +--- + +## API Contract + +### Endpoint + +``` +POST /api/v1/agents/{agent_id}/metrics +``` + +### Headers + +``` +Content-Type: application/json +``` + +### Payload Structure + +```json +{ + "version": "1.0", + "timestamp_ms": 1703001234567, + "resource": { + "agent_id": "agent-123-abc" + }, + "metric_sets": [ + { + "type": "storage", + "attributes": { + "mount_point": "/", + "device": "/dev/sda1", + "filesystem_type": "ext4", + "is_read_only": false + }, + "metrics": { + "disk_used_bytes": 50000000000.0, + "disk_free_bytes": 50000000000.0, + "disk_total_bytes": 100000000000.0, + "disk_used_percent": 50.0, + "disk_free_percent": 50.0, + "total_utilization_percent": 25.5, + "read_utilization_percent": 15.2, + "write_utilization_percent": 10.3, + "read_bytes_per_second": 5242880.0, + "write_bytes_per_second": 2097152.0, + "read_write_bytes_per_second": 7340032.0, + "read_io_per_second": 150.5, + "write_io_per_second": 80.3, + "inodes_used": 500000, + "inodes_free": 9500000, + "inodes_total": 10000000, + "inodes_used_percent": 5.0 + } + }, + { + "type": "storage", + "attributes": { 
+ "mount_point": "/data", + "device": "/dev/sdb1", + "filesystem_type": "xfs", + "is_read_only": false + }, + "metrics": { ... } + } + ] +} +``` + +### Type Constant + +Add to `domain/metrics/metrics.go`: +```go +MetricTypeStorage = "storage" +``` + +--- + +## Edge Cases and Error Handling + +### Bind Mounts and Shared Devices + +Report each mount point as a separate metric set, with shared I/O metrics across mounts of the same device. + +**Rationale:** + +| Concern | Decision | +|---------|----------| +| Capacity metrics | Unique per mount — users monitor specific paths for disk space alerts (e.g., /var/log at 90%) | +| I/O metrics | Shared across mounts — block-level operations occur at the device layer, not per path | +| Deduplication | Not performed — would lose visibility into individual mount point usage | + +**Metric Classification:** + +| Category | Metrics | Source | Unique Per Mount? | +|----------|---------|--------|-------------------| +| Identity | mount_point, device, filesystem_type, is_read_only | Mount info | Yes | +| Capacity | disk_used/free/total_bytes, disk_used/free_percent | statfs() | Yes | +| Inodes | inodes_used/free/total, inodes_used_percent | statfs() | Yes | +| Throughput | read/write/read_write_bytes_per_second | /proc/diskstats | No (shared) | +| IOPS | read/write_io_per_second | /proc/diskstats | No (shared) | +| Utilization | total/read/write_utilization_percent | /proc/diskstats | No (shared) | + +**Algorithm:** + +1. Parse mount info and group mount points by underlying device +2. For each mount point, query statfs() independently (capacity + inodes) +3. For each unique device, read /proc/diskstats and calculate deltas +4. 
For each mount point: attach capacity metrics (unique) + I/O metrics from parent device (shared) + +**Bind Mount Edge Cases:** + +| Case | Behavior | +|------|----------| +| Bind mount of subdirectory | Report as separate mount; capacity reflects subdirectory view | +| Read-only remount | Report with is_read_only: true; I/O metrics still shared with writable mount | +| Device unmounted mid-collection | Skip mount point; log warning | +| No previous I/O sample | Return 0.0 for I/O metrics on first collection cycle | + +### Mount Disappear/Reappear + +A mount can disappear between samples (unmounted, failed, container stopped) and later reappear. Delta calculations must handle stale previous values, counter resets after remount, and time gaps. + +**Decision:** Track per-device "last seen" timestamp. Skip delta calculation if device was absent in previous sample. + +**Algorithm:** + +1. Maintain two maps: + - `lastStats[device]` → I/O counters + - `lastSeen[device]` → sample timestamp + +2. On each sample, for each device: + - If device NOT in lastSeen OR lastSeen[device] != previousSampleTime: + - Device is new or was missing last sample + - Emit capacity metrics only (no I/O deltas) + - Else: + - Device was present in previous sample + - Calculate deltas normally with counter wrap check + - Update lastSeen[device] = currentTime + - Update lastStats[device] = currentCounters + +3. Cleanup: Remove stale entries to prevent unbounded memory growth + +**State Cleanup:** + +Devices that disappear leave orphaned entries in `lastStats` and `lastSeen` maps. Periodic cleanup removes entries not seen for N consecutive samples. 
+ +```go +const staleThreshold = 3 // Remove after missing 3 samples + +func cleanupStaleEntries(currentDevices map[string]bool) { + for device := range lastSeen { + if !currentDevices[device] { + missedSamples[device]++ + if missedSamples[device] >= staleThreshold { + delete(lastStats, device) + delete(lastSeen, device) + delete(missedSamples, device) + } + } else { + missedSamples[device] = 0 + } + } +} +``` + +**Why not immediate removal?** +- Device might be temporarily unavailable (slow NFS mount) +- Brief unmount/remount cycles shouldn't lose state +- 3-sample threshold balances memory vs resilience + +**Output Behavior:** + +| Scenario | Capacity Metrics | I/O Metrics | +|----------|------------------|-------------| +| Normal sample | ✓ Reported | ✓ Reported | +| First sample ever | ✓ Reported | Return 0.0 | +| Device new this sample | ✓ Reported | Return 0.0 | +| Device missing last sample, back now | ✓ Reported | Return 0.0 | +| Device present consecutively | ✓ Reported | ✓ Reported | +| Counter reset detected | ✓ Reported | Return 0.0 | + +**Example Timeline:** + +``` +T0: /dev/sdb1 at /data → { capacity: ✓, io: 0.0 (first sample) } +T1: /dev/sdb1 at /data → { capacity: ✓, io: ✓ } +T2: /data unmounted → (nothing for /data) +T3: /data remounted → { capacity: ✓, io: 0.0 (gap detected) } +T4: /dev/sdb1 at /data → { capacity: ✓, io: ✓ } +``` + +**Edge Cases:** + +| Case | Behavior | +|------|----------| +| Device remounted with different name | Treated as new device | +| Same device, different mount point | Both mounts share I/O stats | +| Very long unmount (hours) | Cleanup removes stale entries; treated as new on return | +| Rapid mount/unmount flapping | Each reappearance returns 0.0 for I/O for one sample | + +### Partition vs Whole Disk Stats + +`/proc/diskstats` contains entries for both whole disks (sda) and partitions (sda1, sda2). When `/dev/sda1` is mounted, the collector must decide which stats to use. 
+**Decision:** Always use partition-level stats that match the mounted device. + +**Rationale:** + +| Concern | Decision | +|---------|----------| +| Accuracy | Partition stats reflect only that partition's I/O | +| Consistency | Matches the capacity metrics which are partition-scoped | +| Whole disk mounted | Rare case; use sda stats (correct for that scenario) | +| Aggregation | Users can sum partitions if they need disk-level totals | + +**Mapping Rules:** + +| Mount Path Pattern | Diskstats Key | Example | +|--------------------|---------------|---------| +| /dev/sdXN | sdXN | /dev/sda1 → sda1 | +| /dev/sdX (whole disk) | sdX | /dev/sdb → sdb | +| /dev/nvme* | Full name | /dev/nvme0n1p1 → nvme0n1p1 | +| /dev/mapper/* | dm-N | Resolve via major:minor | +| /dev/root | Resolved name | Lookup in /proc/partitions | + +**Algorithm:** + +``` +func getDiskstatsKey(mountDevice): + if isLVM(mountDevice): + # /dev/mapper/vg-lv -> dm-X + majorMinor = lookupMajorMinor(mountDevice) + return "dm-" + minor + + if isRootDevice(mountDevice): + # /dev/root -> resolve actual device + majorMinor = lookupMajorMinor(mountDevice) + return lookupDeviceName(majorMinor) + + # /dev/sda1 -> sda1, /dev/nvme0n1p1 -> nvme0n1p1 + return strings.TrimPrefix(mountDevice, "/dev/") +``` + +**Edge Cases:** + +| Case | Behavior | +|------|----------| +| Whole disk mounted (/dev/sda) | Use sda stats (unusual but valid) | +| NVMe partition (/dev/nvme0n1p1) | Use nvme0n1p1 stats | +| Software RAID (/dev/md0) | Use md0 stats | +| Loop device (/dev/loop0) | Use loop0 stats (if supported FS) | +| No matching diskstats entry | Return 0.0 for I/O metrics, log warning | + +**Example:** + +System with `/dev/sda1` at `/` and `/dev/sda2` at `/home`: + +``` +/proc/diskstats: + sda - 1000 reads, 800 writes (aggregate) + sda1 - 600 reads, 500 writes + sda2 - 400 reads, 300 writes + +Output: + { mount_point: "/", device: "/dev/sda1", read_io: <derived from sda1 deltas> } + { mount_point: "/home", device: "/dev/sda2", read_io: <derived from sda2 deltas> } +``` + 
+Note: `sda` aggregate stats are never used unless `/dev/sda` itself is mounted as a filesystem (uncommon). + +### Symlink Device Paths + +Devices can be mounted using stable symlinks: +- `/dev/disk/by-uuid/` +- `/dev/disk/by-label/