Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 50 additions & 17 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,24 @@ def _relative_percent(numerator: float, denominator: float) -> Optional[float]:
return round((numerator / denominator) * 100, 1)


# Allowlist of machine-readable diagnostic codes. A string error that
# normalizes to one of these is passed through by the sanitizer; anything
# else is replaced by the caller-supplied fallback code.
_SAFE_DIAGNOSTIC_ERROR_CODES = {
    "queue_runtime_unavailable",
    "memory_snapshot_unavailable",
    "gpu_runtime_unavailable",
    "cpu_load_unavailable",
}


def _sanitize_diagnostic_error(raw_error: Any, fallback: str) -> Optional[str]:
    """Reduce an arbitrary error value to a code that is safe to expose.

    Args:
        raw_error: The error as produced upstream. May be ``None``, a string
            (either an allowlisted code or free-form text), an exception
            instance, or any other object.
        fallback: The code returned whenever ``raw_error`` signals an error
            but is not itself an allowlisted code.

    Returns:
        ``None`` when there is no error (``raw_error`` is ``None`` or a blank
        string); the normalized (stripped, lower-cased) code when
        ``raw_error`` is a string found in ``_SAFE_DIAGNOSTIC_ERROR_CODES``;
        otherwise ``fallback``. The original text of ``raw_error`` is never
        returned.
    """
    if raw_error is None:
        return None

    if isinstance(raw_error, str):
        normalized = raw_error.strip().lower()
        if not normalized:
            # A blank string carries no error information; treat it like
            # "no error" instead of inventing a fallback error code.
            return None
        if normalized in _SAFE_DIAGNOSTIC_ERROR_CODES:
            return normalized

    # Exceptions, free-form messages and any other objects are redacted.
    return fallback


def _linux_memory_snapshot() -> Optional[Dict[str, Any]]:
meminfo_path = "/proc/meminfo"
if not os.path.exists(meminfo_path):
Expand All @@ -739,7 +757,12 @@ def _linux_memory_snapshot() -> Optional[Dict[str, Any]]:
if number.isdigit():
values[key.strip()] = int(number)
except Exception as exc:
return {"error": str(exc)}
return {
"error": _sanitize_diagnostic_error(
exc,
"memory_snapshot_unavailable",
)
}

total_kb = values.get("MemTotal", 0)
available_kb = values.get("MemAvailable", values.get("MemFree", 0))
Expand All @@ -761,7 +784,12 @@ def _windows_memory_snapshot() -> Optional[Dict[str, Any]]:
if not kernel32.GlobalMemoryStatusEx(ctypes.byref(status)):
return None
except Exception as exc:
return {"error": str(exc)}
return {
"error": _sanitize_diagnostic_error(
exc,
"memory_snapshot_unavailable",
)
}

total_bytes = int(status.ullTotalPhys)
available_bytes = int(status.ullAvailPhys)
Expand All @@ -778,12 +806,18 @@ def _memory_snapshot() -> Dict[str, Any]:
snapshot = _linux_memory_snapshot()
if snapshot is None and os.name == "nt":
snapshot = _windows_memory_snapshot()
if not snapshot:
return {
if not snapshot or snapshot.get("error"):
payload: Dict[str, Any] = {
"available": False,
"state": "warning",
"note": "메모리 사용량을 수집하지 못했습니다.",
}
if snapshot:
payload["error"] = _sanitize_diagnostic_error(
snapshot.get("error"),
"memory_snapshot_unavailable",
)
return payload

usage_percent = snapshot.get("usage_percent")
critical_percent = max(
Expand Down Expand Up @@ -868,7 +902,7 @@ def _cpu_snapshot() -> Dict[str, Any]:
usage_percent: Optional[float] = None
note = "CPU 부하가 정상 범위입니다."
state = "ok"
error_message = ""
error_code: Optional[str] = None
warning_percent = min(
SAFE_COMPUTE_USAGE_LIMIT_PERCENT,
int(os.getenv("RUNTIME_CPU_WARNING_PERCENT", str(SAFE_MEMORY_OCCUPANCY_LIMIT_PERCENT)) or SAFE_MEMORY_OCCUPANCY_LIMIT_PERCENT),
Expand All @@ -883,7 +917,7 @@ def _cpu_snapshot() -> Dict[str, Any]:
getloadavg = cast(Any, getattr(os, "getloadavg"))
load_1m = round(float(getloadavg()[0]), 2)
except Exception as exc:
error_message = str(exc)
error_code = _sanitize_diagnostic_error(exc, "cpu_load_unavailable")

if load_1m is not None and cpu_count > 0:
load_ratio_percent = _relative_percent(load_1m, cpu_count)
Expand Down Expand Up @@ -911,29 +945,28 @@ def _cpu_snapshot() -> Dict[str, Any]:
"load_ratio_percent": load_ratio_percent,
"usage_percent": usage_percent,
}
if error_message:
payload["error"] = error_message
if error_code:
payload["error"] = error_code
return payload


def _gpu_snapshot() -> Dict[str, Any]:
gpu_runtime = get_gpu_runtime_info()
gpu_runtime_data = gpu_runtime if isinstance(gpu_runtime, dict) else {}
gpu_error = _sanitize_diagnostic_error(
gpu_runtime_data.get("error"),
"gpu_runtime_unavailable",
)
devices = (
gpu_runtime.get("devices", [])
if isinstance(gpu_runtime, dict)
else []
gpu_runtime_data.get("devices", [])
)
if not gpu_runtime.get("available"):
if not gpu_runtime_data.get("available"):
return {
"available": False,
"state": "warning",
"note": "GPU 런타임이 감지되지 않았습니다. CPU fallback 또는 드라이버 상태를 확인하세요.",
"devices": [],
"error": (
gpu_runtime.get("error")
if isinstance(gpu_runtime, dict)
else None
),
"error": gpu_error or "gpu_runtime_unavailable",
}

peak_usage = 0.0
Expand Down
105 changes: 105 additions & 0 deletions tests/test_health_diagnostics_sanitization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import ast
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, cast


# Location of the backend module whose functions are loaded by the tests:
# <directory two levels above this file>/backend/main.py
MAIN_PATH = Path(__file__).resolve().parent.parent.joinpath("backend", "main.py")

# Test-local copy of the allowlist, injected as the
# `_SAFE_DIAGNOSTIC_ERROR_CODES` global when executing extracted functions.
SAFE_DIAGNOSTIC_ERROR_CODES = {
    "queue_runtime_unavailable",
    "memory_snapshot_unavailable",
    "gpu_runtime_unavailable",
    "cpu_load_unavailable",
}


def _load_functions(*names: str, extra_globals: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Execute selected top-level functions of ``MAIN_PATH`` in isolation.

    The file at ``MAIN_PATH`` is parsed, but only the top-level ``def``
    statements whose name is listed in ``names`` are compiled and executed,
    so the rest of that module never runs.

    Args:
        *names: Names of the top-level functions to extract.
        extra_globals: Optional additional globals (stubs, constants) made
            visible to the extracted functions; they override the defaults.

    Returns:
        The namespace dict the functions were executed in; each extracted
        function is available under its own name.
    """
    source_text = MAIN_PATH.read_text(encoding="utf-8-sig")
    module_ast = ast.parse(source_text, filename=str(MAIN_PATH))
    wanted_defs = [
        stmt
        for stmt in module_ast.body
        if isinstance(stmt, ast.FunctionDef) and stmt.name in names
    ]

    # Minimal set of globals the extracted functions may reference.
    exec_globals: Dict[str, Any] = dict(
        Any=Any,
        Dict=Dict,
        List=List,
        Optional=Optional,
        Path=Path,
        cast=cast,
        os=os,
    )
    if extra_globals:
        exec_globals.update(extra_globals)

    trimmed_module = ast.Module(body=wanted_defs, type_ignores=[])
    code_object = compile(trimmed_module, str(MAIN_PATH), "exec")
    exec(code_object, exec_globals)
    return exec_globals


def test_sanitize_diagnostic_error_redacts_exception_text():
    """The sanitizer returns only allowlisted codes, the fallback, or None."""
    loaded = _load_functions(
        "_sanitize_diagnostic_error",
        extra_globals={"_SAFE_DIAGNOSTIC_ERROR_CODES": SAFE_DIAGNOSTIC_ERROR_CODES},
    )
    sanitize = loaded["_sanitize_diagnostic_error"]

    # An exception object is replaced by the fallback code.
    redacted = sanitize(
        PermissionError("cannot open /proc/meminfo"),
        "memory_snapshot_unavailable",
    )
    assert redacted == "memory_snapshot_unavailable"

    # An allowlisted code passes through, also after strip/lower normalization.
    passthrough = sanitize("gpu_runtime_unavailable", "memory_snapshot_unavailable")
    assert passthrough == "gpu_runtime_unavailable"
    normalized = sanitize(" GPU_Runtime_Unavailable ", "memory_snapshot_unavailable")
    assert normalized == "gpu_runtime_unavailable"

    # No error in, no error out.
    assert sanitize(None, "memory_snapshot_unavailable") is None


def test_memory_snapshot_error_becomes_warning_payload():
    """A failing platform snapshot yields a warning payload with a safe code."""
    stubs = {
        "_SAFE_DIAGNOSTIC_ERROR_CODES": SAFE_DIAGNOSTIC_ERROR_CODES,
        # The Linux collector reports a raw error message containing a path.
        "_linux_memory_snapshot": lambda: {"error": "permission denied: /proc/meminfo"},
        "_windows_memory_snapshot": lambda: None,
        "SAFE_COMPUTE_USAGE_LIMIT_PERCENT": 90,
        "SAFE_MEMORY_OCCUPANCY_LIMIT_PERCENT": 75,
    }
    loaded = _load_functions(
        "_sanitize_diagnostic_error",
        "_memory_snapshot",
        extra_globals=stubs,
    )
    memory_snapshot = loaded["_memory_snapshot"]

    result = memory_snapshot()

    assert result["available"] is False
    assert result["state"] == "warning"
    reported_error = result["error"]
    assert reported_error == "memory_snapshot_unavailable"
    assert "/proc/meminfo" not in reported_error


def test_cpu_and_gpu_snapshots_expose_only_safe_error_codes(monkeypatch):
    """CPU and GPU snapshots report safe error codes, never raw messages.

    ``os.getloadavg`` is forced to fail and the GPU runtime stub reports a
    raw driver message; both payloads must carry only allowlisted codes.
    """
    namespace = _load_functions(
        "_sanitize_diagnostic_error",
        "_cpu_snapshot",
        "_gpu_snapshot",
        extra_globals={
            "_SAFE_DIAGNOSTIC_ERROR_CODES": SAFE_DIAGNOSTIC_ERROR_CODES,
            "SAFE_COMPUTE_USAGE_LIMIT_PERCENT": 90,
            "SAFE_MEMORY_OCCUPANCY_LIMIT_PERCENT": 75,
            "_relative_percent": lambda numerator, denominator: round((numerator / denominator) * 100, 1) if denominator > 0 else None,
            "_linux_cpu_usage_percent": lambda: None,
            "get_gpu_runtime_info": lambda: {
                "available": False,
                "error": "driver init failed for /dev/nvidia0",
                "devices": [],
            },
        },
    )

    def _raise_loadavg_error():
        raise OSError("cannot read /proc/loadavg")

    # `os.getloadavg` does not exist on every platform (e.g. Windows).
    # raising=False lets monkeypatch install the failing stub there instead
    # of aborting the test with AttributeError.
    monkeypatch.setattr(os, "getloadavg", _raise_loadavg_error, raising=False)

    cpu_payload = namespace["_cpu_snapshot"]()
    gpu_payload = namespace["_gpu_snapshot"]()

    assert cpu_payload["error"] == "cpu_load_unavailable"
    assert "/proc/loadavg" not in cpu_payload["error"]
    assert gpu_payload["error"] == "gpu_runtime_unavailable"
    assert "/dev/nvidia0" not in gpu_payload["error"]
Loading