Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion task-runner/task_runner/system_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,58 @@
from typing import List, Literal, Optional, Tuple
from uuid import UUID

import GPUtil
import psutil

from task_runner import BaseEventLogger, events, utils


def _get_gpu_utilization_percent() -> Optional[float]:
    """Return the mean GPU compute utilization as a percentage.

    Reads ``load`` (a fraction in [0, 1]) from every GPU returned by
    ``GPUtil.getGPUs()`` and averages the usable readings. GPUs whose load
    is NaN are left out of the average so that a single unreadable device
    does not turn the whole metric into NaN.

    Returns:
        The average utilization in [0, 100], or ``None`` when the query
        fails, no GPU is present, or no GPU reports a usable reading.
    """
    # Function-scope import keeps this block self-contained.
    import math

    try:
        gpus = GPUtil.getGPUs()
    except Exception:  # noqa: BLE001
        # Best effort: a failing GPU query is treated as "no GPU data".
        return None

    if not gpus:
        return None

    # GPUtil's gpu.load is in [0, 1]; it is NaN when the underlying
    # nvidia-smi field could not be parsed as a number.
    loads = [gpu.load for gpu in gpus if not math.isnan(gpu.load)]
    if not loads:
        return None

    return sum(loads) / len(loads) * 100.0


def _get_gpu_memory_utilization_percent() -> Optional[float]:
    """Return the mean GPU memory utilization as a percentage.

    Reads ``memoryUtil`` (a fraction in [0, 1]) from every GPU returned by
    ``GPUtil.getGPUs()`` and averages the usable readings. GPUs whose
    memory utilization is NaN are left out of the average so that a single
    unreadable device does not turn the whole metric into NaN.

    Returns:
        The average memory utilization in [0, 100], or ``None`` when the
        query fails, no GPU is present, or no GPU reports a usable reading.
    """
    # Function-scope import keeps this block self-contained.
    import math

    try:
        gpus = GPUtil.getGPUs()
    except Exception:  # noqa: BLE001
        # Best effort: a failing GPU query is treated as "no GPU data".
        return None

    if not gpus:
        return None

    # GPUtil's gpu.memoryUtil is in [0, 1]; it is NaN when the memory
    # figures it is derived from could not be parsed as numbers.
    mem_utils = [
        gpu.memoryUtil for gpu in gpus if not math.isnan(gpu.memoryUtil)
    ]
    if not mem_utils:
        return None

    return sum(mem_utils) / len(mem_utils) * 100.0


class SystemMetrics(enum.Enum):
    """Enumeration of the system metrics that can be sampled.

    Each member's value is the string label associated with that metric.
    """

    # Host CPU and RAM.
    CPU_USAGE = "cpu-usage"
    MEMORY_USAGE = "memory"

    # Disk reads and writes.
    DISK_INPUT = "disk-input"
    DISK_OUTPUT = "disk-output"

    # GPU compute and GPU memory.
    GPU_USAGE = "gpu-usage"
    GPU_MEMORY = "gpu-memory"


# Maps each metric to a zero-argument callable that samples its current value.
SYSTEM_METRICS_TO_FUNC = {
    # Host CPU and RAM, as reported by psutil.
    SystemMetrics.CPU_USAGE: psutil.cpu_percent,
    SystemMetrics.MEMORY_USAGE: lambda: psutil.virtual_memory().percent,
    # Disk samplers yield psutil's byte counters, not percentages.
    # NOTE(review): psutil.disk_io_counters() may return None on a machine
    # with no disks, which would make these two raise AttributeError;
    # confirm whether the caller needs that handled.
    SystemMetrics.DISK_INPUT: lambda: psutil.disk_io_counters().read_bytes,
    SystemMetrics.DISK_OUTPUT: lambda: psutil.disk_io_counters().write_bytes,
    # GPU samplers return None when no GPU reading is available.
    SystemMetrics.GPU_USAGE: _get_gpu_utilization_percent,
    SystemMetrics.GPU_MEMORY: _get_gpu_memory_utilization_percent,
}


Expand Down