From fc467f0cd2c9d87307471f52fe73e12fb64abf82 Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 16:30:04 +0530 Subject: [PATCH 01/12] feat(monitoring): Implement system health monitoring and alert management - Added a background daemon to monitor system metrics including CPU, memory, and disk usage, with configurable thresholds for alert generation. - Introduced new CLI commands for checking system health and managing alerts, including acknowledgment and dismissal of alerts. - Implemented SQLite-based persistence for alerts, allowing for filtering by severity and category. - Enhanced the README and documentation to reflect new features and usage instructions for system monitoring and alert management. --- README.md | 16 + cortex/cli.py | 251 ++++++ cortex/daemon_client.py | 66 ++ daemon/CMakeLists.txt | 12 +- daemon/README.md | 189 +++- daemon/config/cortexd.yaml.example | 16 +- daemon/include/cortexd/alerts/alert_manager.h | 234 +++++ daemon/include/cortexd/config.h | 9 + daemon/include/cortexd/core/daemon.h | 5 +- daemon/include/cortexd/ipc/handlers.h | 22 +- daemon/include/cortexd/ipc/protocol.h | 23 +- daemon/include/cortexd/ipc/server.h | 72 +- .../include/cortexd/monitor/system_monitor.h | 210 +++++ daemon/scripts/setup_daemon.py | 61 +- daemon/src/alerts/alert_manager.cpp | 834 ++++++++++++++++++ daemon/src/config/config.cpp | 76 ++ daemon/src/core/daemon.cpp | 72 +- daemon/src/ipc/handlers.cpp | 170 +++- daemon/src/ipc/server.cpp | 167 ++-- daemon/src/main.cpp | 69 +- daemon/src/monitor/system_monitor.cpp | 551 ++++++++++++ daemon/tests/CMakeLists.txt | 14 +- daemon/tests/integration/test_handlers.cpp | 193 +++- daemon/tests/unit/test_alert_manager.cpp | 268 ++++++ daemon/tests/unit/test_common.cpp | 12 +- daemon/tests/unit/test_protocol.cpp | 11 +- 26 files changed, 3416 insertions(+), 207 deletions(-) create mode 100644 daemon/include/cortexd/alerts/alert_manager.h create mode 100644 daemon/include/cortexd/monitor/system_monitor.h create mode 100644 daemon/src/alerts/alert_manager.cpp create mode 100644 daemon/src/monitor/system_monitor.cpp create mode 100644 daemon/tests/unit/test_alert_manager.cpp diff --git a/README.md b/README.md index 24db1c17..550158cf 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,7 @@ cortex install "tools for video compression" | **Audit Trail** | Complete history in `~/.cortex/history.db` | | **Hardware-Aware** | Detects GPU, CPU, memory for optimized packages | | **Multi-LLM Support** | Works with Claude, GPT-4, or local Ollama models | +| **System Monitoring** | Background daemon monitors CPU, memory, disk, and services with alerts | --- @@ -203,6 +204,13 @@ cortex role set | `cortex daemon version` | Show daemon version | | `cortex daemon config` | Show daemon configuration | | `cortex daemon reload-config` | Reload daemon configuration | +| `cortex daemon health` | Get system health metrics (CPU, memory, disk, services) | +| `cortex daemon alerts` | List and manage alerts | +| `cortex daemon alerts --severity ` | Filter alerts by severity (info/warning/error/critical) | +| `cortex daemon alerts --category ` | Filter alerts by category (cpu/memory/disk/apt/cve/service/system) | +| `cortex daemon alerts --acknowledge-all` | Acknowledge all active alerts | +| `cortex daemon alerts --dismiss ` | Dismiss a specific alert by UUID | +| `cortex daemon shutdown` | Request daemon shutdown | ### Configuration @@ -300,6 +308,14 @@ cortex daemon install --execute cortex daemon ping cortex daemon version +# Monitor system health +cortex daemon 
health + +# View and manage alerts +cortex daemon alerts +cortex daemon alerts --severity warning +cortex daemon alerts --acknowledge-all + # Run daemon tests (no installation required) cortex daemon run-tests ``` diff --git a/cortex/cli.py b/cortex/cli.py index 7a4b734c..2680fb77 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -2050,6 +2050,10 @@ def daemon(self, args: argparse.Namespace) -> int: return self._daemon_ping() elif action == "shutdown": return self._daemon_shutdown() + elif action == "health": + return self._daemon_health() + elif action == "alerts": + return self._daemon_alerts(args) elif action == "run-tests": return self._daemon_run_tests(args) else: @@ -2063,6 +2067,8 @@ def daemon(self, args: argparse.Namespace) -> int: cx_print(" version Show daemon version", "info") cx_print(" ping Test daemon connectivity", "info") cx_print(" shutdown Request daemon shutdown", "info") + cx_print(" health Check system health", "info") + cx_print(" alerts List and manage alerts", "info") cx_print(" run-tests Run daemon test suite", "info") return 0 @@ -2547,6 +2553,222 @@ def _daemon_shutdown(self) -> int: cx_print(f"Failed to request shutdown: {response.error}", "error") return 1 + def _daemon_health(self) -> int: + """Check system health via IPC.""" + cx_header("System Health Check") + + success, response = self._daemon_ipc_call("health", lambda c: c.health()) + if not success: + return 1 + + if not response.success: + cx_print(f"Failed to get health: {response.error}", "error") + return 1 + + result = response.result + if not result: + return 1 + + # Display health metrics in a box + from rich.panel import Panel + from rich.table import Table + + # Create health metrics table + health_table = Table(show_header=False, box=None, padding=(0, 2)) + health_table.add_column("Metric", style="bold") + health_table.add_column("Value", style="") + + # CPU + if "cpu" in result: + cpu = result["cpu"] + usage = cpu.get("usage_percent", 0) + color = "red" if usage >= 95 else "yellow" if usage >= 80 else "green" + health_table.add_row( + "CPU Usage", + f"[{color}]{usage:.1f}%[/{color}] ({cpu.get('cores', 0)} cores)" + ) + + # Memory + if "memory" in result: + mem = result["memory"] + usage = mem.get("usage_percent", 0) + color = "red" if usage >= 95 else "yellow" if usage >= 80 else "green" + mem_gb = mem.get("used_bytes", 0) / (1024**3) + mem_total_gb = mem.get("total_bytes", 0) / (1024**3) + health_table.add_row( + "Memory Usage", + f"[{color}]{usage:.1f}%[/{color}] ({mem_gb:.2f}GB / {mem_total_gb:.2f}GB)" + ) + + # Disk + if "disk" in result: + disk = result["disk"] + usage = disk.get("usage_percent", 0) + color = "red" if usage >= 95 else "yellow" if usage >= 80 else "green" + disk_gb = disk.get("used_bytes", 0) / (1024**3) + disk_total_gb = disk.get("total_bytes", 0) / (1024**3) + mount_point = disk.get("mount_point", "/") + health_table.add_row( + f"Disk Usage ({mount_point})", + f"[{color}]{usage:.1f}%[/{color}] ({disk_gb:.2f}GB / {disk_total_gb:.2f}GB)" + ) + + # System info + if "system" in result: + sys_info = result["system"] + uptime_hours = sys_info.get("uptime_seconds", 0) / 3600 + health_table.add_row("System Uptime", f"{uptime_hours:.1f} hours") + + failed = sys_info.get("failed_services_count", 0) + if failed > 0: + health_table.add_row("Failed Services", f"[red]{failed}[/red]") + else: + health_table.add_row("Failed Services", "[green]0[/green]") + + # Display health panel + console.print() + console.print(Panel(health_table, title="[bold cyan]System Health Metrics[/bold cyan]", 
border_style="cyan")) + + # Display thresholds in a separate panel + if "thresholds" in result: + thresholds = result["thresholds"] + threshold_table = Table(show_header=True, header_style="bold yellow", box=None, padding=(0, 2)) + threshold_table.add_column("Resource", style="bold") + threshold_table.add_column("Warning", style="yellow") + threshold_table.add_column("Critical", style="red") + + if "cpu" in thresholds: + cpu_th = thresholds["cpu"] + threshold_table.add_row( + "CPU", + f"{cpu_th.get('warning', 80)}%", + f"{cpu_th.get('critical', 95)}%" + ) + if "memory" in thresholds: + mem_th = thresholds["memory"] + threshold_table.add_row( + "Memory", + f"{mem_th.get('warning', 80)}%", + f"{mem_th.get('critical', 95)}%" + ) + if "disk" in thresholds: + disk_th = thresholds["disk"] + threshold_table.add_row( + "Disk", + f"{disk_th.get('warning', 80)}%", + f"{disk_th.get('critical', 95)}%" + ) + + console.print() + console.print(Panel(threshold_table, title="[bold yellow]Monitoring Thresholds[/bold yellow]", border_style="yellow")) + + return 0 + + def _daemon_alerts(self, args: argparse.Namespace) -> int: + """Manage alerts via IPC.""" + # Handle acknowledge-all + if getattr(args, "acknowledge_all", False): + cx_header("Acknowledging All Alerts") + success, response = self._daemon_ipc_call( + "alerts_acknowledge_all", lambda c: c.alerts_acknowledge_all() + ) + if not success: + return 1 + if response.success: + count = response.result.get("acknowledged", 0) if response.result else 0 + cx_print(f"Acknowledged {count} alert(s)", "success") + return 0 + cx_print(f"Failed: {response.error}", "error") + return 1 + + # Handle dismiss + dismiss_uuid = getattr(args, "dismiss", None) + if dismiss_uuid: + cx_header("Dismissing Alert") + success, response = self._daemon_ipc_call( + "alerts_dismiss", lambda c: c.alerts_dismiss(dismiss_uuid) + ) + if not success: + return 1 + if response.success: + cx_print(f"Alert {dismiss_uuid} dismissed", "success") + return 0 + cx_print(f"Failed: {response.error}", "error") + return 1 + + # List alerts + cx_header("Alerts") + + severity = getattr(args, "severity", None) + category = getattr(args, "category", None) + + success, response = self._daemon_ipc_call( + "alerts_get", + lambda c: c.alerts_get(severity=severity, category=category), + ) + if not success: + return 1 + + if not response.success: + cx_print(f"Failed to get alerts: {response.error}", "error") + return 1 + + result = response.result + if not result: + return 1 + + alerts = result.get("alerts", []) + counts = result.get("counts", {}) + + # Display summary + console.print(f"[bold]Total Alerts:[/bold] {result.get('count', 0)}") + if counts: + console.print( + f" Info: {counts.get('info', 0)}, " + f"Warning: {counts.get('warning', 0)}, " + f"Error: {counts.get('error', 0)}, " + f"Critical: {counts.get('critical', 0)}" + ) + + if not alerts: + console.print("[dim]No alerts found[/dim]") + return 0 + + console.print() + # Display alerts table + from rich.table import Table + + table = Table(show_header=True, header_style="bold cyan") + table.add_column("UUID", style="dim") + table.add_column("Severity", style="bold") + table.add_column("Category") + table.add_column("Source") + table.add_column("Message") + table.add_column("Status") + table.add_column("Timestamp") + + for alert in alerts: + severity_name = alert.get("severity_name", "unknown") + severity_color = { + "info": "blue", + "warning": "yellow", + "error": "red", + "critical": "bold red", + }.get(severity_name, "white") + + table.add_row( + 
alert.get("uuid", "")[:8] + "...", + f"[{severity_color}]{severity_name.upper()}[/{severity_color}]", + alert.get("category_name", "unknown"), + alert.get("source", "unknown"), + alert.get("message", ""), + alert.get("status_name", "unknown"), + alert.get("timestamp", ""), + ) + + console.print(table) + return 0 + def _daemon_run_tests(self, args: argparse.Namespace) -> int: """Run the daemon test suite.""" import subprocess @@ -2582,6 +2804,7 @@ def _daemon_run_tests(self, args: argparse.Namespace) -> int: "test_rate_limiter", "test_logger", "test_common", + "test_alert_manager", ] integration_tests = ["test_ipc_server", "test_handlers", "test_daemon"] all_tests = unit_tests + integration_tests @@ -4336,6 +4559,34 @@ def main(): # daemon shutdown - uses shutdown IPC handler daemon_subs.add_parser("shutdown", help="Request daemon shutdown") + # daemon health - uses health IPC handler + daemon_subs.add_parser("health", help="Check system health") + + # daemon alerts - uses alerts IPC handlers + daemon_alerts_parser = daemon_subs.add_parser( + "alerts", help="Manage alerts" + ) + daemon_alerts_parser.add_argument( + "--severity", + choices=["info", "warning", "error", "critical"], + help="Filter alerts by severity", + ) + daemon_alerts_parser.add_argument( + "--category", + choices=["cpu", "memory", "disk", "apt", "cve", "service", "system"], + help="Filter alerts by category", + ) + daemon_alerts_parser.add_argument( + "--acknowledge-all", + action="store_true", + help="Acknowledge all active alerts", + ) + daemon_alerts_parser.add_argument( + "--dismiss", + metavar="UUID", + help="Dismiss a specific alert by UUID", + ) + # daemon run-tests - run daemon test suite daemon_run_tests_parser = daemon_subs.add_parser( "run-tests", diff --git a/cortex/daemon_client.py b/cortex/daemon_client.py index f1fb3927..77a9a4ba 100644 --- a/cortex/daemon_client.py +++ b/cortex/daemon_client.py @@ -242,6 +242,72 @@ def shutdown(self) -> DaemonResponse: """ return self._send_request("shutdown") + # ========================================================================= + # PR2 IPC Methods: Monitoring and Alerts + # ========================================================================= + + def health(self) -> DaemonResponse: + """ + Get detailed system health metrics. + + Returns: + DaemonResponse with health metrics (CPU, memory, disk, etc.). + """ + return self._send_request("health") + + def alerts_get( + self, + severity: str | None = None, + category: str | None = None, + status: str | None = None, + include_dismissed: bool = False, + ) -> DaemonResponse: + """ + Get alerts matching filter criteria. + + Args: + severity: Filter by severity (info, warning, error, critical). + category: Filter by category (cpu, memory, disk, apt, cve, service, system). + status: Filter by status (active, acknowledged, dismissed). + include_dismissed: If True, include dismissed alerts. + + Returns: + DaemonResponse with alerts list and counts. + """ + params: dict[str, Any] = {} + if severity: + params["severity"] = severity + if category: + params["category"] = category + if status: + params["status"] = status + if include_dismissed: + params["include_dismissed"] = True + + # Use "alerts" as the primary method (maps to alerts.get internally) + return self._send_request("alerts", params if params else None) + + def alerts_acknowledge_all(self) -> DaemonResponse: + """ + Acknowledge all active alerts. + + Returns: + DaemonResponse with count of acknowledged alerts. 
+ """ + return self._send_request("alerts.acknowledge", {"all": True}) + + def alerts_dismiss(self, uuid: str) -> DaemonResponse: + """ + Dismiss a specific alert by UUID. + + Args: + uuid: Alert UUID to dismiss. + + Returns: + DaemonResponse with dismissal confirmation. + """ + return self._send_request("alerts.dismiss", {"uuid": uuid}) + class DaemonNotInstalledError(Exception): """Raised when the daemon is not installed.""" diff --git a/daemon/CMakeLists.txt b/daemon/CMakeLists.txt index 9919b3ad..ed76cc54 100644 --- a/daemon/CMakeLists.txt +++ b/daemon/CMakeLists.txt @@ -62,8 +62,8 @@ endif() # Find required packages find_package(PkgConfig REQUIRED) pkg_check_modules(SYSTEMD REQUIRED libsystemd) -pkg_check_modules(OPENSSL REQUIRED openssl) pkg_check_modules(UUID REQUIRED uuid) +pkg_check_modules(SQLITE3 REQUIRED sqlite3) # Fetch nlohmann/json include(FetchContent) @@ -89,8 +89,8 @@ FetchContent_MakeAvailable(yaml-cpp) include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/include ${SYSTEMD_INCLUDE_DIRS} - ${OPENSSL_INCLUDE_DIRS} ${UUID_INCLUDE_DIRS} + ${SQLITE3_INCLUDE_DIRS} ) # Source files @@ -107,6 +107,12 @@ set(DAEMON_SOURCES src/ipc/protocol.cpp src/ipc/handlers.cpp + # Monitoring + src/monitor/system_monitor.cpp + + # Alerts + src/alerts/alert_manager.cpp + # Utils src/utils/logger.cpp ) @@ -123,8 +129,8 @@ target_compile_definitions(cortexd PRIVATE target_link_libraries(cortexd PRIVATE ${SYSTEMD_LIBRARIES} - ${OPENSSL_LIBRARIES} ${UUID_LIBRARIES} + ${SQLITE3_LIBRARIES} nlohmann_json::nlohmann_json yaml-cpp::yaml-cpp pthread diff --git a/daemon/README.md b/daemon/README.md index 350095ec..75bc0719 100644 --- a/daemon/README.md +++ b/daemon/README.md @@ -9,7 +9,9 @@ - 🔌 **Unix Socket IPC**: JSON-RPC protocol at `/run/cortex/cortex.sock` - ⚙️ **systemd Integration**: Type=notify, watchdog, journald logging - 📝 **Configuration Management**: YAML-based configuration with hot reload -- 🔧 **Basic IPC Handlers**: ping, version, config, shutdown +- 🔧 **IPC Handlers**: ping, version, config, shutdown, health, alerts +- 📊 **System Monitoring**: Continuous monitoring of CPU, memory, disk, and system services +- 🚨 **Alert Management**: SQLite-based alert persistence with severity levels and filtering ## Quick Start @@ -86,13 +88,31 @@ sudo systemctl restart cortexd && sleep 1 && journalctl -u cortexd -n 10 | grep │ │ IPC Server │ │ │ │ ─────────── │ │ │ │ JSON-RPC Protocol │ │ -│ │ Basic Handlers: ping, version, config, shutdown │ │ +│ │ Handlers: ping, version, config, shutdown, │ │ +│ │ health, alerts │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ System Monitor │ │ +│ │ ───────────── │ │ +│ │ • CPU/Memory/Disk monitoring │ │ +│ │ • System uptime & failed services │ │ +│ │ • Threshold-based alert generation │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Alert Manager │ │ +│ │ ──────────── │ │ +│ │ • SQLite persistence │ │ +│ │ • Severity levels (INFO/WARNING/ERROR/CRITICAL) │ │ +│ │ • Categories (CPU/MEMORY/DISK/APT/CVE/SERVICE/SYSTEM) │ │ +│ │ • Filtering & querying │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ Config Manager (YAML) │ Logger │ Daemon Lifecycle │ │ │ └─────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────┘ 
+└──────────────────────────────────────────────────────────────┘ ``` ## Directory Structure @@ -106,14 +126,20 @@ daemon/ │ ├── core/ # Daemon core │ │ ├── daemon.h │ │ └── service.h -│ └── ipc/ # IPC layer -│ ├── server.h -│ ├── protocol.h -│ └── handlers.h # Basic handlers only +│ ├── ipc/ # IPC layer +│ │ ├── server.h +│ │ ├── protocol.h +│ │ └── handlers.h +│ ├── monitor/ # System monitoring +│ │ └── system_monitor.h +│ └── alerts/ # Alert management +│ └── alert_manager.h ├── src/ # Implementation │ ├── core/ # Daemon lifecycle │ ├── config/ # Configuration management │ ├── ipc/ # IPC server and handlers +│ ├── monitor/ # System monitoring implementation +│ ├── alerts/ # Alert management implementation │ └── utils/ # Logging utilities ├── systemd/ # Service files ├── config/ # Config templates @@ -132,14 +158,22 @@ cortex daemon config # Show configuration cortex daemon reload-config # Reload configuration cortex daemon shutdown # Request daemon shutdown +# System monitoring +cortex daemon health # Get system health metrics + +# Alert management +cortex daemon alerts # List all active alerts +cortex daemon alerts --severity warning # Filter by severity +cortex daemon alerts --category cpu # Filter by category +cortex daemon alerts --acknowledge-all # Acknowledge all alerts +cortex daemon alerts --dismiss # Dismiss specific alert + # Install/uninstall daemon cortex daemon install cortex daemon install --execute cortex daemon uninstall ``` -``` - ## IPC API ### Available Methods @@ -151,6 +185,10 @@ cortex daemon uninstall | `config.get` | Get configuration | | `config.reload` | Reload config file | | `shutdown` | Request shutdown | +| `health` | Get system health metrics (CPU, memory, disk, services) | +| `alerts` / `alerts.get` | Get alerts with optional filtering | +| `alerts.acknowledge` | Acknowledge alerts (all or by UUID) | +| `alerts.dismiss` | Dismiss a specific alert by UUID | ### Example @@ -176,8 +214,30 @@ echo '{"method":"version"}' | socat - UNIX-CONNECT:/run/cortex/cortex.sock # } # } -# Get configuration -echo '{"method":"config.get"}' | socat - UNIX-CONNECT:/run/cortex/cortex.sock +# Get system health +echo '{"method":"health"}' | socat - UNIX-CONNECT:/run/cortex/cortex.sock + +# Response: +# { +# "success": true, +# "result": { +# "cpu_usage_percent": 45.2, +# "memory_usage_percent": 62.1, +# "disk_usage_percent": 78.5, +# "uptime_seconds": 86400, +# "failed_services_count": 0, +# "thresholds": { ... 
} +# } +# } + +# Get alerts +echo '{"method":"alerts"}' | socat - UNIX-CONNECT:/run/cortex/cortex.sock + +# Get alerts filtered by severity +echo '{"method":"alerts","params":{"severity":"warning"}}' | socat - UNIX-CONNECT:/run/cortex/cortex.sock + +# Acknowledge all alerts +echo '{"method":"alerts.acknowledge","params":{"all":true}}' | socat - UNIX-CONNECT:/run/cortex/cortex.sock ``` ## Configuration @@ -190,8 +250,115 @@ socket: timeout_ms: 5000 log_level: 1 # 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR + +# Monitoring thresholds (optional - uses defaults if not specified) +monitoring: + cpu: + warning_threshold: 80.0 # CPU usage % to trigger warning alert + critical_threshold: 95.0 # CPU usage % to trigger critical alert + memory: + warning_threshold: 80.0 # Memory usage % to trigger warning alert + critical_threshold: 95.0 # Memory usage % to trigger critical alert + disk: + warning_threshold: 80.0 # Disk usage % to trigger warning alert + critical_threshold: 95.0 # Disk usage % to trigger critical alert + check_interval_seconds: 60 # How often to check system health ``` +**Note**: Thresholds can be adjusted without restarting the daemon. After editing the config file, reload it with: +```bash +cortex daemon reload-config +# or +sudo systemctl reload cortexd +``` + +## System Monitoring + +The daemon includes a built-in system monitor that continuously tracks system health metrics and generates alerts when thresholds are exceeded. + +### Monitored Metrics + +- **CPU Usage**: Percentage of CPU utilization across all cores +- **Memory Usage**: Total, used, and available memory +- **Disk Usage**: Total, used, and available disk space (primary mount point) +- **System Uptime**: System uptime in seconds +- **Failed Services**: Count of failed systemd services + +### Monitoring Thresholds + +The monitor uses configurable thresholds to determine when to generate alerts. These can be configured in `/etc/cortex/daemon.yaml`: + +- **Warning Threshold**: Default 80% (CPU, memory, disk) - configurable via `monitoring.*.warning_threshold` +- **Critical Threshold**: Default 95% (CPU, memory, disk) - configurable via `monitoring.*.critical_threshold` +- **Check Interval**: Default 60 seconds - configurable via `monitoring.check_interval_seconds` + +Thresholds can be updated without restarting the daemon by editing the config file and reloading: +```bash +sudo systemctl reload cortexd +# or +cortex daemon reload-config +``` + +### Check Interval + +The monitor performs health checks every 60 seconds by default. This interval can be configured when creating the `SystemMonitor` instance. + +### Alert Generation + +When a metric exceeds a threshold, the monitor automatically creates an alert with: +- **Severity**: `WARNING` for threshold violations, `CRITICAL` for critical violations +- **Category**: `CPU`, `MEMORY`, `DISK`, or `SERVICE` (for failed services) +- **Source**: `SystemMonitor` +- **Message**: Brief description of the issue +- **Description**: Detailed information including current values and thresholds + +Alerts are persisted to SQLite and can be queried via the IPC API or CLI commands. + +## Alert Management + +The daemon includes a comprehensive alert management system with SQLite persistence. + +### Alert Database + +Alerts are stored in `/var/lib/cortex/alerts.db` (SQLite database). The database is automatically created and initialized on first use. 
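+
+For ad-hoc inspection, the database can also be queried directly with the `sqlite3` command-line tool (the daemon itself accesses it through the IPC API, which is the supported interface). A minimal sketch, assuming the default database path and the alert fields listed below:
+
+```bash
+# Show the ten most recent non-dismissed alerts.
+# Severity and status are stored as integers (severity 0=INFO..3=CRITICAL, status 2=DISMISSED).
+sudo sqlite3 -readonly /var/lib/cortex/alerts.db \
+  "SELECT uuid, severity, status, message, timestamp FROM alerts WHERE status != 2 ORDER BY timestamp DESC LIMIT 10;"
+```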
+ +### Alert Properties + +- **UUID**: Unique identifier for each alert +- **Severity**: `INFO`, `WARNING`, `ERROR`, or `CRITICAL` +- **Category**: `CPU`, `MEMORY`, `DISK`, `APT`, `CVE`, `SERVICE`, or `SYSTEM` +- **Source**: Origin of the alert (e.g., `SystemMonitor`) +- **Message**: Brief alert message +- **Description**: Detailed alert description +- **Timestamp**: When the alert was created +- **Status**: `ACTIVE`, `ACKNOWLEDGED`, or `DISMISSED` +- **Acknowledged At**: Optional timestamp when alert was acknowledged +- **Dismissed At**: Optional timestamp when alert was dismissed + +### Alert Lifecycle + +1. **Created**: Alert is created and stored with `ACTIVE` status +2. **Acknowledged**: User acknowledges the alert (status changes to `ACKNOWLEDGED`) +3. **Dismissed**: User dismisses the alert (status changes to `DISMISSED`) + +Dismissed alerts are excluded from default queries unless `include_dismissed=true` is specified. + +### Filtering Alerts + +Alerts can be filtered by: +- **Severity**: `info`, `warning`, `error`, `critical` +- **Category**: `cpu`, `memory`, `disk`, `apt`, `cve`, `service`, `system` +- **Status**: `active`, `acknowledged`, `dismissed` +- **Include Dismissed**: Include dismissed alerts in results (default: false) + +### Alert Counts + +The alert manager maintains real-time counts of alerts by severity: +- Total count +- Count by severity (INFO, WARNING, ERROR, CRITICAL) + +These counts are updated atomically and returned with alert queries for quick status overview. + ## Building from Source diff --git a/daemon/config/cortexd.yaml.example b/daemon/config/cortexd.yaml.example index ed0ada2a..9f3cfd53 100644 --- a/daemon/config/cortexd.yaml.example +++ b/daemon/config/cortexd.yaml.example @@ -2,7 +2,7 @@ # Copy this file to /etc/cortex/daemon.yaml or ~/.cortex/daemon.yaml # # - Socket and logging config -# - Additional configuration options may be added in future versions +# - Monitoring thresholds for alert generation # Socket configuration socket: @@ -18,3 +18,17 @@ rate_limit: # Logging level # 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR log_level: 1 + +# Monitoring thresholds for alert generation +# Alerts are generated when system metrics exceed these thresholds +monitoring: + cpu: + warning_threshold: 80.0 # CPU usage % to trigger warning alert + critical_threshold: 95.0 # CPU usage % to trigger critical alert + memory: + warning_threshold: 80.0 # Memory usage % to trigger warning alert + critical_threshold: 95.0 # Memory usage % to trigger critical alert + disk: + warning_threshold: 80.0 # Disk usage % to trigger warning alert + critical_threshold: 95.0 # Disk usage % to trigger critical alert + check_interval_seconds: 60 # How often to check system health (in seconds) diff --git a/daemon/include/cortexd/alerts/alert_manager.h b/daemon/include/cortexd/alerts/alert_manager.h new file mode 100644 index 00000000..82d538c2 --- /dev/null +++ b/daemon/include/cortexd/alerts/alert_manager.h @@ -0,0 +1,234 @@ +/** + * @file alert_manager.h + * @brief Alert management with SQLite persistence + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "cortexd/common.h" + +namespace cortexd { + +/** + * @brief Alert severity levels + */ +enum class AlertSeverity { + INFO = 0, + WARNING = 1, + ERROR = 2, + CRITICAL = 3 +}; + +/** + * @brief Alert status + */ +enum class AlertStatus { + ACTIVE = 0, + ACKNOWLEDGED = 1, + DISMISSED = 2 +}; + +/** + * @brief Alert category + */ +enum class AlertCategory { + CPU = 0, + MEMORY = 1, + DISK = 2, 
+ APT = 3, + CVE = 4, + SERVICE = 5, + SYSTEM = 6 +}; + +/** + * @brief Alert structure + */ +struct Alert { + std::string uuid; + AlertSeverity severity; + AlertCategory category; + std::string source; + std::string message; + std::string description; + std::chrono::system_clock::time_point timestamp; + AlertStatus status; + std::optional acknowledged_at; + std::optional dismissed_at; + + /** + * @brief Convert alert to JSON + */ + json to_json() const; + + /** + * @brief Create alert from JSON + */ + static Alert from_json(const json& j); +}; + +/** + * @brief Alert filter for queries + */ +struct AlertFilter { + std::optional severity; + std::optional category; + std::optional status; + std::optional source; + bool include_dismissed = false; +}; + +/** + * @brief Alert manager with SQLite persistence + */ +class AlertManager { +public: + /** + * @brief Construct alert manager + * @param db_path Path to SQLite database + */ + explicit AlertManager(const std::string& db_path = "/var/lib/cortex/alerts.db"); + + ~AlertManager(); + + /** + * @brief Initialize database schema + */ + bool initialize(); + + /** + * @brief Create a new alert + * @param alert Alert to create (UUID will be generated if empty) + * @return Created alert with UUID + */ + std::optional create_alert(const Alert& alert); + + /** + * @brief Get alert by UUID + */ + std::optional get_alert(const std::string& uuid); + + /** + * @brief Get all alerts matching filter + */ + std::vector get_alerts(const AlertFilter& filter = AlertFilter()); + + /** + * @brief Acknowledge an alert + */ + bool acknowledge_alert(const std::string& uuid); + + /** + * @brief Acknowledge all active alerts + * @return Number of alerts acknowledged + */ + size_t acknowledge_all(); + + /** + * @brief Dismiss an alert + */ + bool dismiss_alert(const std::string& uuid); + + /** + * @brief Get alert counts by severity + */ + json get_alert_counts(); + + /** + * @brief Generate UUID for alert + */ + static std::string generate_uuid(); + + /** + * @brief Convert severity to string + */ + static std::string severity_to_string(AlertSeverity severity); + + /** + * @brief Convert string to severity + */ + static AlertSeverity string_to_severity(const std::string& str); + + /** + * @brief Convert category to string + */ + static std::string category_to_string(AlertCategory category); + + /** + * @brief Convert string to category + */ + static AlertCategory string_to_category(const std::string& str); + + /** + * @brief Convert status to string + */ + static std::string status_to_string(AlertStatus status); + + /** + * @brief Convert string to status + */ + static AlertStatus string_to_status(const std::string& str); + +private: + std::string db_path_; + void* db_handle_; // sqlite3* (opaque pointer to avoid including sqlite3.h in header) + + // Prepared statement cache + // NOTE: SQLite prepared statements are NOT thread-safe - must be protected by mutex + void* stmt_insert_; // sqlite3_stmt* + void* stmt_select_; // sqlite3_stmt* + void* stmt_select_all_; // sqlite3_stmt* + void* stmt_update_ack_; // sqlite3_stmt* + void* stmt_update_ack_all_; // sqlite3_stmt* + void* stmt_update_dismiss_; // sqlite3_stmt* + void* stmt_count_; // sqlite3_stmt* + + // Mutex to protect prepared statement usage (SQLite statements are NOT thread-safe) + mutable std::mutex stmt_mutex_; + + // In-memory alert counters (updated atomically) + std::atomic count_info_{0}; + std::atomic count_warning_{0}; + std::atomic count_error_{0}; + std::atomic count_critical_{0}; + std::atomic 
count_total_{0}; + + /** + * @brief Ensure database directory exists + */ + bool ensure_db_directory(); + + /** + * @brief Create database schema + */ + bool create_schema(); + + /** + * @brief Prepare and cache all statements + */ + bool prepare_statements(); + + /** + * @brief Finalize all cached statements + */ + void finalize_statements(); + + /** + * @brief Update in-memory counters based on severity + */ + void update_counters(AlertSeverity severity, int delta); + + /** + * @brief Load initial counters from database + */ + void load_initial_counters(); +}; + +} // namespace cortexd diff --git a/daemon/include/cortexd/config.h b/daemon/include/cortexd/config.h index 40b18820..b1584669 100644 --- a/daemon/include/cortexd/config.h +++ b/daemon/include/cortexd/config.h @@ -46,6 +46,15 @@ struct Config { // Logging int log_level = 1; // 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=CRITICAL + // Monitoring thresholds + double cpu_warning_threshold = 80.0; + double cpu_critical_threshold = 95.0; + double memory_warning_threshold = 80.0; + double memory_critical_threshold = 95.0; + double disk_warning_threshold = 80.0; + double disk_critical_threshold = 95.0; + int monitor_check_interval_seconds = 60; + /** * @brief Load configuration from YAML file * @param path Path to configuration file diff --git a/daemon/include/cortexd/core/daemon.h b/daemon/include/cortexd/core/daemon.h index e7004881..1bbe799b 100644 --- a/daemon/include/cortexd/core/daemon.h +++ b/daemon/include/cortexd/core/daemon.h @@ -13,6 +13,7 @@ #include #include #include + #include namespace cortexd { @@ -74,7 +75,8 @@ class IPCServer; */ template T* get_service() { - for (auto& svc : services_) { + std::shared_lock lock(services_mutex_); + for (const auto& svc : services_) { if (auto* ptr = dynamic_cast(svc.get())) { return ptr; } @@ -129,6 +131,7 @@ class IPCServer; Daemon() = default; std::vector> services_; + mutable std::shared_mutex services_mutex_; // Protect services_ vector access std::atomic running_{false}; std::atomic shutdown_requested_{false}; std::chrono::steady_clock::time_point start_time_; diff --git a/daemon/include/cortexd/ipc/handlers.h b/daemon/include/cortexd/ipc/handlers.h index 0bf3eebb..e7f6c7c4 100644 --- a/daemon/include/cortexd/ipc/handlers.h +++ b/daemon/include/cortexd/ipc/handlers.h @@ -7,9 +7,14 @@ #include "cortexd/ipc/server.h" #include "cortexd/ipc/protocol.h" +#include namespace cortexd { +// Forward declarations +class SystemMonitor; +class AlertManager; + /** * @brief IPC request handlers */ @@ -17,8 +22,15 @@ class Handlers { public: /** * @brief Register all handlers with IPC server + * @param server IPC server instance + * @param monitor System monitor instance (optional, for status/health handlers) + * @param alerts Alert manager instance (optional, for alert handlers) */ - static void register_all(IPCServer& server); + static void register_all( + IPCServer& server, + SystemMonitor* monitor = nullptr, + std::shared_ptr alerts = nullptr + ); private: // Handler implementations @@ -31,6 +43,14 @@ class Handlers { // Daemon control static Response handle_shutdown(const Request& req); + + // Monitoring handlers + static Response handle_health(const Request& req, SystemMonitor* monitor); + + // Alert handlers + static Response handle_alerts_get(const Request& req, std::shared_ptr alerts); + static Response handle_alerts_acknowledge(const Request& req, std::shared_ptr alerts); + static Response handle_alerts_dismiss(const Request& req, std::shared_ptr alerts); }; } // namespace cortexd \ No newline 
at end of file diff --git a/daemon/include/cortexd/ipc/protocol.h b/daemon/include/cortexd/ipc/protocol.h index a35fdd27..54919bb4 100644 --- a/daemon/include/cortexd/ipc/protocol.h +++ b/daemon/include/cortexd/ipc/protocol.h @@ -62,7 +62,6 @@ */ namespace Methods { // Status and health - constexpr const char* STATUS = "status"; constexpr const char* HEALTH = "health"; constexpr const char* VERSION = "version"; @@ -72,17 +71,11 @@ constexpr const char* ALERTS_ACK = "alerts.acknowledge"; constexpr const char* ALERTS_DISMISS = "alerts.dismiss"; - // Configuration - constexpr const char* CONFIG_GET = "config.get"; - constexpr const char* CONFIG_RELOAD = "config.reload"; - - // LLM operations - constexpr const char* LLM_STATUS = "llm.status"; - constexpr const char* LLM_LOAD = "llm.load"; - constexpr const char* LLM_UNLOAD = "llm.unload"; - constexpr const char* LLM_INFER = "llm.infer"; - - // Daemon control + // Configuration + constexpr const char* CONFIG_GET = "config.get"; + constexpr const char* CONFIG_RELOAD = "config.reload"; + + // Daemon control constexpr const char* SHUTDOWN = "shutdown"; constexpr const char* PING = "ping"; } @@ -101,10 +94,8 @@ constexpr int INVALID_PARAMS = -32602; constexpr int INTERNAL_ERROR = -32603; - // Custom application errors (non-reserved range: 1-999) - constexpr int LLM_NOT_LOADED = 100; - constexpr int LLM_BUSY = 101; - constexpr int RATE_LIMITED = 102; + // Custom application errors (non-reserved range: 1-999) + constexpr int RATE_LIMITED = 102; constexpr int ALERT_NOT_FOUND = 103; constexpr int CONFIG_ERROR = 104; } diff --git a/daemon/include/cortexd/ipc/server.h b/daemon/include/cortexd/ipc/server.h index 049b2f63..f2fa3f1f 100644 --- a/daemon/include/cortexd/ipc/server.h +++ b/daemon/include/cortexd/ipc/server.h @@ -7,14 +7,15 @@ #include "cortexd/core/service.h" #include "cortexd/ipc/protocol.h" - #include - #include - #include - #include - #include - #include - #include - #include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace cortexd { @@ -23,30 +24,33 @@ */ using RequestHandler = std::function; - /** - * @brief Rate limiter for request throttling - */ - class RateLimiter { - public: - explicit RateLimiter(int max_per_second); - - /** - * @brief Check if request is allowed - * @return true if allowed, false if rate limited - */ - bool allow(); - - /** - * @brief Reset the rate limiter - */ - void reset(); - - private: - int max_per_second_; - int count_ = 0; - std::chrono::steady_clock::time_point window_start_; - std::mutex mutex_; - }; +/** + * @brief Lock-free rate limiter for request throttling + * Uses atomic operations for better performance under high concurrency. + * + * SECURITY: Uses compare-and-swap to ensure the limit is never exceeded, + * even under high concurrency. This prevents rate limit bypass attacks. 
+ */ +class RateLimiter { +public: + explicit RateLimiter(int max_per_second); + + /** + * @brief Check if request is allowed (lock-free) + * @return true if allowed, false if rate limited + */ + bool allow(); + + /** + * @brief Reset the rate limiter + */ + void reset(); + +private: + int max_per_second_; + std::atomic count_{0}; + std::atomic window_start_rep_{0}; +}; /** * @brief Unix socket IPC server @@ -92,8 +96,8 @@ std::atomic running_{false}; std::unique_ptr accept_thread_; - std::unordered_map handlers_; - std::mutex handlers_mutex_; + std::unordered_map handlers_; + mutable std::shared_mutex handlers_mutex_; // Read-write lock for concurrent reads RateLimiter rate_limiter_; diff --git a/daemon/include/cortexd/monitor/system_monitor.h b/daemon/include/cortexd/monitor/system_monitor.h new file mode 100644 index 00000000..ea7ed1f5 --- /dev/null +++ b/daemon/include/cortexd/monitor/system_monitor.h @@ -0,0 +1,210 @@ +/** + * @file system_monitor.h + * @brief System health monitoring service + */ + +#pragma once + +#include "cortexd/core/service.h" +#include "cortexd/alerts/alert_manager.h" +#include +#include +#include +#include +#include +#include +#include + +namespace cortexd { + +/** + * @brief System health metrics + */ +struct SystemHealth { + // CPU metrics + double cpu_usage_percent; + int cpu_cores; + + // Memory metrics + double memory_usage_percent; + uint64_t memory_total_bytes; + uint64_t memory_used_bytes; + uint64_t memory_available_bytes; + + // Disk metrics + double disk_usage_percent; + uint64_t disk_total_bytes; + uint64_t disk_used_bytes; + uint64_t disk_available_bytes; + std::string disk_mount_point; // Primary mount point monitored + + // System metrics + uint64_t uptime_seconds; + int failed_services_count; + + /** + * @brief Convert to JSON + */ + json to_json() const; +}; + +/** + * @brief Monitoring thresholds + */ +struct MonitoringThresholds { + double cpu_warning; + double cpu_critical; + double memory_warning; + double memory_critical; + double disk_warning; + double disk_critical; +}; + +/** + * @brief System monitoring service + * + * Monitors system health (CPU, memory, disk, services) and creates alerts + * when thresholds are exceeded. 
+ */
+class SystemMonitor : public Service {
+public:
+    /**
+     * @brief Construct system monitor
+     * @param alert_manager Shared pointer to alert manager
+     * @param check_interval_seconds Interval between health checks, in seconds
+     * @param thresholds Monitoring thresholds (required)
+     */
+    explicit SystemMonitor(
+        std::shared_ptr<AlertManager> alert_manager,
+        int check_interval_seconds,
+        const MonitoringThresholds& thresholds
+    );
+
+    ~SystemMonitor() override;
+
+    // Service interface
+    bool start() override;
+    void stop() override;
+    const char* name() const override { return "SystemMonitor"; }
+    int priority() const override { return 50; } // Start after IPC server
+    bool is_running() const override;
+    bool is_healthy() const override;
+
+    /**
+     * @brief Get current system health
+     */
+    SystemHealth get_health() const;
+
+    /**
+     * @brief Get monitoring thresholds
+     */
+    MonitoringThresholds get_thresholds() const { return thresholds_; }
+
+    /**
+     * @brief Set monitoring thresholds
+     */
+    void set_thresholds(const MonitoringThresholds& thresholds) { thresholds_ = thresholds; }
+
+private:
+    std::shared_ptr<AlertManager> alert_manager_;
+    std::atomic<bool> running_{false};
+    std::unique_ptr<std::thread> monitor_thread_;
+    int check_interval_seconds_;
+    MonitoringThresholds thresholds_;
+
+    mutable std::mutex health_mutex_;
+    SystemHealth current_health_;
+
+    // CPU usage calculation state (thread-safe)
+    mutable std::mutex cpu_state_mutex_;
+    mutable double last_cpu_idle_ = 0.0;
+    mutable double last_cpu_total_ = 0.0;
+    mutable std::chrono::steady_clock::time_point last_cpu_time_;
+    mutable bool cpu_first_call_ = true;
+
+    // Persistent systemd bus connection (reused across calls)
+    mutable void* systemd_bus_; // sd_bus* (opaque to avoid including systemd headers)
+    mutable std::mutex bus_mutex_;
+
+    // /proc file cache (reduces I/O overhead)
+    struct ProcFileCache {
+        std::string content;
+        std::chrono::steady_clock::time_point timestamp;
+        static constexpr std::chrono::milliseconds ttl{1000}; // 1 second TTL
+    };
+    mutable std::mutex proc_cache_mutex_;
+    mutable ProcFileCache proc_stat_cache_;
+    mutable ProcFileCache proc_meminfo_cache_;
+    mutable ProcFileCache proc_uptime_cache_;
+
+    // Hash set for O(1) alert deduplication (tracks active alerts by key)
+    mutable std::unordered_set<std::string> active_alert_keys_;
+    mutable std::mutex alert_keys_mutex_;
+
+    /**
+     * @brief Get or create systemd bus connection
+     * @return sd_bus* or nullptr on error
+     */
+    void* get_systemd_bus() const;
+
+    /**
+     * @brief Read /proc file with caching
+     * @param path File path (e.g., "/proc/stat")
+     * @param cache Cache entry to use
+     * @return File content or empty string on error
+     */
+    std::string read_proc_file_cached(const std::string& path, ProcFileCache& cache) const;
+
+    /**
+     * @brief Monitoring thread function
+     */
+    void monitor_loop();
+
+    /**
+     * @brief Perform health check
+     */
+    SystemHealth check_health();
+
+    /**
+     * @brief Check CPU usage
+     */
+    double get_cpu_usage() const;
+
+    /**
+     * @brief Check memory usage
+     */
+    void get_memory_usage(uint64_t& total, uint64_t& used, uint64_t& available) const;
+
+    /**
+     * @brief Check disk usage
+     */
+    void get_disk_usage(uint64_t& total, uint64_t& used, uint64_t& available, std::string& mount_point) const;
+
+    /**
+     * @brief Get system uptime
+     */
+    uint64_t get_uptime() const;
+
+    /**
+     * @brief Check for failed systemd services
+     */
+    int get_failed_services_count() const;
+
+    /**
+     * @brief Check thresholds and create alerts if needed
+     */
+    void check_thresholds(const SystemHealth& health);
+ + /** + * @brief Create basic alert (non-AI version for PR 2) + */ + void create_basic_alert( + AlertSeverity severity, + AlertCategory category, + const std::string& source, + const std::string& message, + const std::string& description + ); +}; + +} // namespace cortexd diff --git a/daemon/scripts/setup_daemon.py b/daemon/scripts/setup_daemon.py index b7899197..42164133 100755 --- a/daemon/scripts/setup_daemon.py +++ b/daemon/scripts/setup_daemon.py @@ -391,17 +391,76 @@ def check_tests_built() -> bool: return (DAEMON_DIR / "build" / "tests" / "test_config").exists() +def ensure_config_file() -> bool: + """ + Ensure the daemon config file exists, creating it from template if needed. + + Creates /etc/cortex/daemon.yaml from the template if it doesn't exist. + This is a safety measure in case install.sh doesn't create it. + + Returns: + bool: True if config file exists or was created successfully, False otherwise. + """ + config_path = Path(CONFIG_FILE) + + # If config already exists, we're done + if config_path.exists(): + return True + + # Check if template exists + if not CONFIG_EXAMPLE.exists(): + console.print( + f"[yellow]Warning: Config template not found at {CONFIG_EXAMPLE}[/yellow]" + ) + return False + + try: + # Create /etc/cortex directory if needed + config_path.parent.mkdir(parents=True, exist_ok=True) + + # Copy template to config file (requires sudo) + result = subprocess.run( + ["sudo", "cp", str(CONFIG_EXAMPLE), CONFIG_FILE], + check=False, + capture_output=True, + text=True, + ) + + if result.returncode == 0: + # Set proper permissions + subprocess.run( + ["sudo", "chmod", "0644", CONFIG_FILE], + check=False, + ) + console.print(f"[green]Created config file: {CONFIG_FILE}[/green]") + log_audit_event("create_config", f"Created config from template") + return True + else: + console.print( + f"[red]Failed to create config file: {result.stderr}[/red]" + ) + return False + + except Exception as e: + console.print(f"[red]Error creating config file: {e}[/red]") + return False + + def install_daemon() -> bool: """ Install the cortexd daemon system-wide. Runs the INSTALL_SCRIPT (daemon/scripts/install.sh) with sudo using - subprocess.run. + subprocess.run. The install script will create the config file if it + doesn't exist, but we also ensure it exists as a safety measure. Returns: bool: True if the installation completed successfully (exit code 0), False otherwise. 
""" + # Ensure config file exists before installation + ensure_config_file() + console.print("[cyan]Installing the daemon...[/cyan]") result = subprocess.run(["sudo", str(INSTALL_SCRIPT)], check=False) success = result.returncode == 0 diff --git a/daemon/src/alerts/alert_manager.cpp b/daemon/src/alerts/alert_manager.cpp new file mode 100644 index 00000000..80762f1a --- /dev/null +++ b/daemon/src/alerts/alert_manager.cpp @@ -0,0 +1,834 @@ +/** + * @file alert_manager.cpp + * @brief Alert management implementation with SQLite persistence + */ + +#include "cortexd/alerts/alert_manager.h" +#include "cortexd/logger.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cortexd { + +// Alert JSON conversion +json Alert::to_json() const { + json j; + j["uuid"] = uuid; + j["severity"] = static_cast(severity); + j["severity_name"] = AlertManager::severity_to_string(severity); + j["category"] = static_cast(category); + j["category_name"] = AlertManager::category_to_string(category); + j["source"] = source; + j["message"] = message; + j["description"] = description; + + // Convert timestamps to ISO 8601 strings + auto time_t = std::chrono::system_clock::to_time_t(timestamp); + std::stringstream ss; + ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); + j["timestamp"] = ss.str(); + + j["status"] = static_cast(status); + j["status_name"] = AlertManager::status_to_string(status); + + if (acknowledged_at.has_value()) { + auto ack_time_t = std::chrono::system_clock::to_time_t(acknowledged_at.value()); + std::stringstream ack_ss; + ack_ss << std::put_time(std::gmtime(&ack_time_t), "%Y-%m-%dT%H:%M:%SZ"); + j["acknowledged_at"] = ack_ss.str(); + } + + if (dismissed_at.has_value()) { + auto dis_time_t = std::chrono::system_clock::to_time_t(dismissed_at.value()); + std::stringstream dis_ss; + dis_ss << std::put_time(std::gmtime(&dis_time_t), "%Y-%m-%dT%H:%M:%SZ"); + j["dismissed_at"] = dis_ss.str(); + } + + return j; +} + +Alert Alert::from_json(const json& j) { + Alert alert; + alert.uuid = j.value("uuid", ""); + alert.severity = static_cast(j.value("severity", 0)); + alert.category = static_cast(j.value("category", 0)); + alert.source = j.value("source", ""); + alert.message = j.value("message", ""); + alert.description = j.value("description", ""); + + // Parse timestamp + std::string timestamp_str = j.value("timestamp", ""); + if (!timestamp_str.empty()) { + std::tm tm = {}; + std::istringstream ss(timestamp_str); + ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!ss.fail()) { + alert.timestamp = std::chrono::system_clock::from_time_t(std::mktime(&tm)); + } else { + alert.timestamp = std::chrono::system_clock::now(); + } + } else { + alert.timestamp = std::chrono::system_clock::now(); + } + + alert.status = static_cast(j.value("status", 0)); + + // Parse optional timestamps + if (j.contains("acknowledged_at") && !j["acknowledged_at"].is_null()) { + std::string ack_str = j["acknowledged_at"]; + std::tm ack_tm = {}; + std::istringstream ack_ss(ack_str); + ack_ss >> std::get_time(&ack_tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!ack_ss.fail()) { + alert.acknowledged_at = std::chrono::system_clock::from_time_t(std::mktime(&ack_tm)); + } + } + + if (j.contains("dismissed_at") && !j["dismissed_at"].is_null()) { + std::string dis_str = j["dismissed_at"]; + std::tm dis_tm = {}; + std::istringstream dis_ss(dis_str); + dis_ss >> std::get_time(&dis_tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!dis_ss.fail()) { + alert.dismissed_at = 
std::chrono::system_clock::from_time_t(std::mktime(&dis_tm)); + } + } + + return alert; +} + +// AlertManager implementation +AlertManager::AlertManager(const std::string& db_path) + : db_path_(db_path), db_handle_(nullptr), + stmt_insert_(nullptr), stmt_select_(nullptr), stmt_select_all_(nullptr), + stmt_update_ack_(nullptr), stmt_update_ack_all_(nullptr), + stmt_update_dismiss_(nullptr), stmt_count_(nullptr) { +} + +AlertManager::~AlertManager() { + finalize_statements(); + if (db_handle_) { + sqlite3_close(static_cast(db_handle_)); + } +} + +bool AlertManager::ensure_db_directory() { + std::filesystem::path db_file(db_path_); + std::filesystem::path db_dir = db_file.parent_path(); + + try { + std::filesystem::create_directories(db_dir); + + // Check write permission + if (!std::filesystem::exists(db_dir) || + access(db_dir.c_str(), W_OK) != 0) { + // Fallback to user directory + const char* home = getenv("HOME"); + if (home) { + std::filesystem::path home_dir = std::filesystem::path(home); + db_dir = home_dir / ".cortex"; + std::filesystem::create_directories(db_dir); + db_path_ = (db_dir / "alerts.db").string(); + LOG_WARN("AlertManager", "Using user directory for alerts database: " + db_path_); + } else { + LOG_ERROR("AlertManager", "Cannot determine home directory for fallback"); + return false; + } + } + + return true; + } catch (const std::exception& e) { + LOG_ERROR("AlertManager", "Failed to create database directory: " + std::string(e.what())); + return false; + } +} + +bool AlertManager::create_schema() { + sqlite3* db = static_cast(db_handle_); + + const char* schema_sql = R"( + CREATE TABLE IF NOT EXISTS alerts ( + uuid TEXT PRIMARY KEY, + severity INTEGER NOT NULL, + category INTEGER NOT NULL, + source TEXT NOT NULL, + message TEXT NOT NULL, + description TEXT, + timestamp TEXT NOT NULL, + status INTEGER NOT NULL DEFAULT 0, + acknowledged_at TEXT, + dismissed_at TEXT + ); + + CREATE INDEX IF NOT EXISTS idx_alerts_timestamp ON alerts(timestamp DESC); + CREATE INDEX IF NOT EXISTS idx_alerts_severity ON alerts(severity); + CREATE INDEX IF NOT EXISTS idx_alerts_category ON alerts(category); + CREATE INDEX IF NOT EXISTS idx_alerts_status ON alerts(status); + )"; + + char* err_msg = nullptr; + int rc = sqlite3_exec(db, schema_sql, nullptr, nullptr, &err_msg); + + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to create schema: " + std::string(err_msg ? 
err_msg : "unknown error")); + sqlite3_free(err_msg); + return false; + } + + return true; +} + +bool AlertManager::initialize() { + if (!ensure_db_directory()) { + return false; + } + + sqlite3* db = nullptr; + int rc = sqlite3_open(db_path_.c_str(), &db); + + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to open database: " + std::string(sqlite3_errmsg(db))); + if (db) { + sqlite3_close(db); + } + return false; + } + + db_handle_ = db; + + // Enable WAL mode for better concurrency + sqlite3_exec(db, "PRAGMA journal_mode=WAL", nullptr, nullptr, nullptr); + sqlite3_exec(db, "PRAGMA synchronous=NORMAL", nullptr, nullptr, nullptr); + + if (!create_schema()) { + sqlite3_close(db); + db_handle_ = nullptr; + return false; + } + + // Prepare and cache all statements + if (!prepare_statements()) { + sqlite3_close(db); + db_handle_ = nullptr; + return false; + } + + // Load initial counters from database + load_initial_counters(); + + LOG_INFO("AlertManager", "Initialized alerts database at " + db_path_); + return true; +} + +bool AlertManager::prepare_statements() { + sqlite3* db = static_cast(db_handle_); + + const char* insert_sql = R"( + INSERT INTO alerts (uuid, severity, category, source, message, description, timestamp, status, acknowledged_at, dismissed_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + )"; + + const char* select_sql = "SELECT uuid, severity, category, source, message, description, timestamp, status, acknowledged_at, dismissed_at FROM alerts WHERE uuid = ?"; + + const char* select_all_sql = "SELECT uuid, severity, category, source, message, description, timestamp, status, acknowledged_at, dismissed_at FROM alerts WHERE 1=1"; + + const char* update_ack_sql = "UPDATE alerts SET status = ?, acknowledged_at = ? WHERE uuid = ?"; + + const char* update_ack_all_sql = "UPDATE alerts SET status = ?, acknowledged_at = ? WHERE status = ?"; + + const char* update_dismiss_sql = "UPDATE alerts SET status = ?, dismissed_at = ? WHERE uuid = ?"; + + const char* count_sql = "SELECT severity, COUNT(*) FROM alerts WHERE status != ? 
GROUP BY severity"; + + int rc; + + rc = sqlite3_prepare_v2(db, insert_sql, -1, reinterpret_cast(&stmt_insert_), nullptr); + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to prepare insert statement: " + std::string(sqlite3_errmsg(db))); + return false; + } + + rc = sqlite3_prepare_v2(db, select_sql, -1, reinterpret_cast(&stmt_select_), nullptr); + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to prepare select statement: " + std::string(sqlite3_errmsg(db))); + return false; + } + + rc = sqlite3_prepare_v2(db, select_all_sql, -1, reinterpret_cast(&stmt_select_all_), nullptr); + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to prepare select_all statement: " + std::string(sqlite3_errmsg(db))); + return false; + } + + rc = sqlite3_prepare_v2(db, update_ack_sql, -1, reinterpret_cast(&stmt_update_ack_), nullptr); + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to prepare update_ack statement: " + std::string(sqlite3_errmsg(db))); + return false; + } + + rc = sqlite3_prepare_v2(db, update_ack_all_sql, -1, reinterpret_cast(&stmt_update_ack_all_), nullptr); + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to prepare update_ack_all statement: " + std::string(sqlite3_errmsg(db))); + return false; + } + + rc = sqlite3_prepare_v2(db, update_dismiss_sql, -1, reinterpret_cast(&stmt_update_dismiss_), nullptr); + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to prepare update_dismiss statement: " + std::string(sqlite3_errmsg(db))); + return false; + } + + rc = sqlite3_prepare_v2(db, count_sql, -1, reinterpret_cast(&stmt_count_), nullptr); + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to prepare count statement: " + std::string(sqlite3_errmsg(db))); + return false; + } + + return true; +} + +void AlertManager::finalize_statements() { + if (stmt_insert_) { + sqlite3_finalize(static_cast(stmt_insert_)); + stmt_insert_ = nullptr; + } + if (stmt_select_) { + sqlite3_finalize(static_cast(stmt_select_)); + stmt_select_ = nullptr; + } + if (stmt_select_all_) { + sqlite3_finalize(static_cast(stmt_select_all_)); + stmt_select_all_ = nullptr; + } + if (stmt_update_ack_) { + sqlite3_finalize(static_cast(stmt_update_ack_)); + stmt_update_ack_ = nullptr; + } + if (stmt_update_ack_all_) { + sqlite3_finalize(static_cast(stmt_update_ack_all_)); + stmt_update_ack_all_ = nullptr; + } + if (stmt_update_dismiss_) { + sqlite3_finalize(static_cast(stmt_update_dismiss_)); + stmt_update_dismiss_ = nullptr; + } + if (stmt_count_) { + sqlite3_finalize(static_cast(stmt_count_)); + stmt_count_ = nullptr; + } +} + +void AlertManager::update_counters(AlertSeverity severity, int delta) { + switch (severity) { + case AlertSeverity::INFO: + count_info_.fetch_add(delta, std::memory_order_relaxed); + break; + case AlertSeverity::WARNING: + count_warning_.fetch_add(delta, std::memory_order_relaxed); + break; + case AlertSeverity::ERROR: + count_error_.fetch_add(delta, std::memory_order_relaxed); + break; + case AlertSeverity::CRITICAL: + count_critical_.fetch_add(delta, std::memory_order_relaxed); + break; + } + count_total_.fetch_add(delta, std::memory_order_relaxed); +} + +void AlertManager::load_initial_counters() { + if (!db_handle_ || !stmt_count_) { + return; + } + + { + // SQLite prepared statements are NOT thread-safe - protect with mutex + std::lock_guard lock(stmt_mutex_); + + sqlite3_stmt* stmt = static_cast(stmt_count_); + sqlite3_reset(stmt); + sqlite3_bind_int(stmt, 1, static_cast(AlertStatus::DISMISSED)); + + int rc; + while ((rc = 
sqlite3_step(stmt)) == SQLITE_ROW) { + int severity = sqlite3_column_int(stmt, 0); + int count = sqlite3_column_int(stmt, 1); + + switch (static_cast(severity)) { + case AlertSeverity::INFO: + count_info_.store(count, std::memory_order_relaxed); + break; + case AlertSeverity::WARNING: + count_warning_.store(count, std::memory_order_relaxed); + break; + case AlertSeverity::ERROR: + count_error_.store(count, std::memory_order_relaxed); + break; + case AlertSeverity::CRITICAL: + count_critical_.store(count, std::memory_order_relaxed); + break; + } + count_total_.fetch_add(count, std::memory_order_relaxed); + } // End of while loop + } // Lock released +} + +std::string AlertManager::generate_uuid() { + uuid_t uuid; + uuid_generate(uuid); + char uuid_str[37]; + uuid_unparse(uuid, uuid_str); + return std::string(uuid_str); +} + +std::optional AlertManager::create_alert(const Alert& alert) { + if (!db_handle_) { + LOG_ERROR("AlertManager", "Database not initialized"); + return std::nullopt; + } + + sqlite3* db = static_cast(db_handle_); + Alert new_alert = alert; + + // Generate UUID if not provided + if (new_alert.uuid.empty()) { + new_alert.uuid = generate_uuid(); + } + + // Set timestamp if not set + if (new_alert.timestamp.time_since_epoch().count() == 0) { + new_alert.timestamp = std::chrono::system_clock::now(); + } + + // Convert timestamp to ISO 8601 string + auto time_t = std::chrono::system_clock::to_time_t(new_alert.timestamp); + std::stringstream ss; + ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); + std::string timestamp_str = ss.str(); + + int rc; + { + // SQLite prepared statements are NOT thread-safe - protect with mutex + std::lock_guard lock(stmt_mutex_); + + sqlite3_stmt* stmt = static_cast(stmt_insert_); + sqlite3_reset(stmt); + + sqlite3_bind_text(stmt, 1, new_alert.uuid.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_int(stmt, 2, static_cast(new_alert.severity)); + sqlite3_bind_int(stmt, 3, static_cast(new_alert.category)); + sqlite3_bind_text(stmt, 4, new_alert.source.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_text(stmt, 5, new_alert.message.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_text(stmt, 6, new_alert.description.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_text(stmt, 7, timestamp_str.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_int(stmt, 8, static_cast(new_alert.status)); + + if (new_alert.acknowledged_at.has_value()) { + auto ack_time_t = std::chrono::system_clock::to_time_t(new_alert.acknowledged_at.value()); + std::stringstream ack_ss; + ack_ss << std::put_time(std::gmtime(&ack_time_t), "%Y-%m-%dT%H:%M:%SZ"); + sqlite3_bind_text(stmt, 9, ack_ss.str().c_str(), -1, SQLITE_STATIC); + } else { + sqlite3_bind_null(stmt, 9); + } + + if (new_alert.dismissed_at.has_value()) { + auto dis_time_t = std::chrono::system_clock::to_time_t(new_alert.dismissed_at.value()); + std::stringstream dis_ss; + dis_ss << std::put_time(std::gmtime(&dis_time_t), "%Y-%m-%dT%H:%M:%SZ"); + sqlite3_bind_text(stmt, 10, dis_ss.str().c_str(), -1, SQLITE_STATIC); + } else { + sqlite3_bind_null(stmt, 10); + } + + rc = sqlite3_step(stmt); + } // Lock released here + + if (rc != SQLITE_DONE) { + LOG_ERROR("AlertManager", "Failed to insert alert: " + std::string(sqlite3_errmsg(db))); + return std::nullopt; + } + + // Update counters (only for active alerts) - atomics are thread-safe + if (new_alert.status == AlertStatus::ACTIVE) { + update_counters(new_alert.severity, 1); + } + + LOG_DEBUG("AlertManager", "Created alert: " + new_alert.uuid); + return new_alert; +} + +std::optional 
AlertManager::get_alert(const std::string& uuid) { + if (!db_handle_ || !stmt_select_) { + return std::nullopt; + } + + Alert alert; + { + // SQLite prepared statements are NOT thread-safe - protect with mutex + std::lock_guard lock(stmt_mutex_); + + sqlite3_stmt* stmt = static_cast(stmt_select_); + sqlite3_reset(stmt); + sqlite3_bind_text(stmt, 1, uuid.c_str(), -1, SQLITE_STATIC); + + int rc = sqlite3_step(stmt); + if (rc != SQLITE_ROW) { + return std::nullopt; + } + + // Read all columns while lock is held (stmt is only valid during lock) + alert.uuid = reinterpret_cast(sqlite3_column_text(stmt, 0)); + alert.severity = static_cast(sqlite3_column_int(stmt, 1)); + alert.category = static_cast(sqlite3_column_int(stmt, 2)); + alert.source = reinterpret_cast(sqlite3_column_text(stmt, 3)); + alert.message = reinterpret_cast(sqlite3_column_text(stmt, 4)); + alert.description = reinterpret_cast(sqlite3_column_text(stmt, 5)); + + // Parse timestamp + std::string timestamp_str = reinterpret_cast(sqlite3_column_text(stmt, 6)); + std::tm tm = {}; + std::istringstream ss(timestamp_str); + ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!ss.fail()) { + alert.timestamp = std::chrono::system_clock::from_time_t(std::mktime(&tm)); + } else { + alert.timestamp = std::chrono::system_clock::now(); + } + + alert.status = static_cast(sqlite3_column_int(stmt, 7)); + + // Parse optional timestamps + if (sqlite3_column_type(stmt, 8) != SQLITE_NULL) { + std::string ack_str = reinterpret_cast(sqlite3_column_text(stmt, 8)); + std::tm ack_tm = {}; + std::istringstream ack_ss(ack_str); + ack_ss >> std::get_time(&ack_tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!ack_ss.fail()) { + alert.acknowledged_at = std::chrono::system_clock::from_time_t(std::mktime(&ack_tm)); + } + } + + if (sqlite3_column_type(stmt, 9) != SQLITE_NULL) { + std::string dis_str = reinterpret_cast(sqlite3_column_text(stmt, 9)); + std::tm dis_tm = {}; + std::istringstream dis_ss(dis_str); + dis_ss >> std::get_time(&dis_tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!dis_ss.fail()) { + alert.dismissed_at = std::chrono::system_clock::from_time_t(std::mktime(&dis_tm)); + } + } + } // Lock released - alert data is now copied + + return alert; +} + +std::vector AlertManager::get_alerts(const AlertFilter& filter) { + std::vector alerts; + + if (!db_handle_) { + return alerts; + } + + sqlite3* db = static_cast(db_handle_); + + std::string select_sql = "SELECT uuid, severity, category, source, message, description, timestamp, status, acknowledged_at, dismissed_at FROM alerts WHERE 1=1"; + + if (filter.severity.has_value()) { + select_sql += " AND severity = " + std::to_string(static_cast(filter.severity.value())); + } + + if (filter.category.has_value()) { + select_sql += " AND category = " + std::to_string(static_cast(filter.category.value())); + } + + if (filter.status.has_value()) { + select_sql += " AND status = " + std::to_string(static_cast(filter.status.value())); + } else if (!filter.include_dismissed) { + select_sql += " AND status != " + std::to_string(static_cast(AlertStatus::DISMISSED)); + } + + int param_index = 1; + if (filter.source.has_value()) { + select_sql += " AND source = ?"; + } + + select_sql += " ORDER BY timestamp DESC"; + + sqlite3_stmt* stmt; + int rc = sqlite3_prepare_v2(db, select_sql.c_str(), -1, &stmt, nullptr); + + if (rc != SQLITE_OK) { + return alerts; + } + + if (filter.source.has_value()) { + sqlite3_bind_text(stmt, param_index++, filter.source.value().c_str(), -1, SQLITE_STATIC); + } + + while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) { + 
Alert alert; + alert.uuid = reinterpret_cast(sqlite3_column_text(stmt, 0)); + alert.severity = static_cast(sqlite3_column_int(stmt, 1)); + alert.category = static_cast(sqlite3_column_int(stmt, 2)); + alert.source = reinterpret_cast(sqlite3_column_text(stmt, 3)); + alert.message = reinterpret_cast(sqlite3_column_text(stmt, 4)); + alert.description = reinterpret_cast(sqlite3_column_text(stmt, 5)); + + // Parse timestamp + std::string timestamp_str = reinterpret_cast(sqlite3_column_text(stmt, 6)); + std::tm tm = {}; + std::istringstream ss(timestamp_str); + ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!ss.fail()) { + alert.timestamp = std::chrono::system_clock::from_time_t(std::mktime(&tm)); + } else { + alert.timestamp = std::chrono::system_clock::now(); + } + + alert.status = static_cast(sqlite3_column_int(stmt, 7)); + + // Parse optional timestamps + if (sqlite3_column_type(stmt, 8) != SQLITE_NULL) { + std::string ack_str = reinterpret_cast(sqlite3_column_text(stmt, 8)); + std::tm ack_tm = {}; + std::istringstream ack_ss(ack_str); + ack_ss >> std::get_time(&ack_tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!ack_ss.fail()) { + alert.acknowledged_at = std::chrono::system_clock::from_time_t(std::mktime(&ack_tm)); + } + } + + if (sqlite3_column_type(stmt, 9) != SQLITE_NULL) { + std::string dis_str = reinterpret_cast(sqlite3_column_text(stmt, 9)); + std::tm dis_tm = {}; + std::istringstream dis_ss(dis_str); + dis_ss >> std::get_time(&dis_tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!dis_ss.fail()) { + alert.dismissed_at = std::chrono::system_clock::from_time_t(std::mktime(&dis_tm)); + } + } + + alerts.push_back(alert); + } + + sqlite3_finalize(stmt); + return alerts; +} + +bool AlertManager::acknowledge_alert(const std::string& uuid) { + if (!db_handle_ || !stmt_update_ack_) { + return false; + } + + // Get alert first to know its severity for counter update + auto alert_opt = get_alert(uuid); + if (!alert_opt || alert_opt->status != AlertStatus::ACTIVE) { + return false; + } + + sqlite3* db = static_cast(db_handle_); + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + std::stringstream ss; + ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); + std::string timestamp_str = ss.str(); + + int rc; + int changes = 0; + { + // SQLite prepared statements are NOT thread-safe - protect with mutex + std::lock_guard lock(stmt_mutex_); + + sqlite3_stmt* stmt = static_cast(stmt_update_ack_); + sqlite3_reset(stmt); + + sqlite3_bind_int(stmt, 1, static_cast(AlertStatus::ACKNOWLEDGED)); + sqlite3_bind_text(stmt, 2, timestamp_str.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_text(stmt, 3, uuid.c_str(), -1, SQLITE_STATIC); + + rc = sqlite3_step(stmt); + changes = (rc == SQLITE_DONE) ? 
sqlite3_changes(db) : 0; + } // Lock released + + if (rc == SQLITE_DONE && changes > 0) { + // Update counters (decrement active count) - atomics are thread-safe + update_counters(alert_opt->severity, -1); + return true; + } + + return false; +} + +size_t AlertManager::acknowledge_all() { + if (!db_handle_ || !stmt_update_ack_all_) { + return 0; + } + + sqlite3* db = static_cast(db_handle_); + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + std::stringstream ss; + ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); + std::string timestamp_str = ss.str(); + + int rc; + int changes = 0; + { + // SQLite prepared statements are NOT thread-safe - protect with mutex + std::lock_guard lock(stmt_mutex_); + + sqlite3_stmt* stmt = static_cast(stmt_update_ack_all_); + sqlite3_reset(stmt); + + sqlite3_bind_int(stmt, 1, static_cast(AlertStatus::ACKNOWLEDGED)); + sqlite3_bind_text(stmt, 2, timestamp_str.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_int(stmt, 3, static_cast(AlertStatus::ACTIVE)); + + rc = sqlite3_step(stmt); + changes = (rc == SQLITE_DONE) ? sqlite3_changes(db) : 0; + } // Lock released + + // Update counters - reset all to 0 since all active alerts are now acknowledged + // Note: This is approximate - for exact counts we'd need to query by severity + // But for acknowledge_all, we typically want to clear all counters anyway + if (changes > 0) { + count_info_.store(0, std::memory_order_relaxed); + count_warning_.store(0, std::memory_order_relaxed); + count_error_.store(0, std::memory_order_relaxed); + count_critical_.store(0, std::memory_order_relaxed); + count_total_.store(0, std::memory_order_relaxed); + } + + return changes; +} + +bool AlertManager::dismiss_alert(const std::string& uuid) { + if (!db_handle_ || !stmt_update_dismiss_) { + return false; + } + + // Get alert first to know its severity and status for counter update + auto alert_opt = get_alert(uuid); + if (!alert_opt) { + return false; + } + + // Only update counters if alert was active or acknowledged (not already dismissed) + bool should_update_counters = (alert_opt->status == AlertStatus::ACTIVE || + alert_opt->status == AlertStatus::ACKNOWLEDGED); + + sqlite3* db = static_cast(db_handle_); + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + std::stringstream ss; + ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); + std::string timestamp_str = ss.str(); + + int rc; + int changes = 0; + { + // SQLite prepared statements are NOT thread-safe - protect with mutex + std::lock_guard lock(stmt_mutex_); + + sqlite3_stmt* stmt = static_cast(stmt_update_dismiss_); + sqlite3_reset(stmt); + + sqlite3_bind_int(stmt, 1, static_cast(AlertStatus::DISMISSED)); + sqlite3_bind_text(stmt, 2, timestamp_str.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_text(stmt, 3, uuid.c_str(), -1, SQLITE_STATIC); + + rc = sqlite3_step(stmt); + changes = (rc == SQLITE_DONE) ? 
sqlite3_changes(db) : 0; + } // Lock released + + if (rc == SQLITE_DONE && changes > 0) { + // Update counters if alert was active - atomics are thread-safe + if (should_update_counters && alert_opt->status == AlertStatus::ACTIVE) { + update_counters(alert_opt->severity, -1); + } + return true; + } + + return false; +} + +json AlertManager::get_alert_counts() { + // Use in-memory counters for O(1) performance + json counts; + counts["info"] = count_info_.load(std::memory_order_relaxed); + counts["warning"] = count_warning_.load(std::memory_order_relaxed); + counts["error"] = count_error_.load(std::memory_order_relaxed); + counts["critical"] = count_critical_.load(std::memory_order_relaxed); + counts["total"] = count_total_.load(std::memory_order_relaxed); + + return counts; +} + +// Static helper methods +std::string AlertManager::severity_to_string(AlertSeverity severity) { + switch (severity) { + case AlertSeverity::INFO: return "info"; + case AlertSeverity::WARNING: return "warning"; + case AlertSeverity::ERROR: return "error"; + case AlertSeverity::CRITICAL: return "critical"; + default: return "unknown"; + } +} + +AlertSeverity AlertManager::string_to_severity(const std::string& str) { + if (str == "info") return AlertSeverity::INFO; + if (str == "warning") return AlertSeverity::WARNING; + if (str == "error") return AlertSeverity::ERROR; + if (str == "critical") return AlertSeverity::CRITICAL; + return AlertSeverity::INFO; +} + +std::string AlertManager::category_to_string(AlertCategory category) { + switch (category) { + case AlertCategory::CPU: return "cpu"; + case AlertCategory::MEMORY: return "memory"; + case AlertCategory::DISK: return "disk"; + case AlertCategory::APT: return "apt"; + case AlertCategory::CVE: return "cve"; + case AlertCategory::SERVICE: return "service"; + case AlertCategory::SYSTEM: return "system"; + default: return "unknown"; + } +} + +AlertCategory AlertManager::string_to_category(const std::string& str) { + if (str == "cpu") return AlertCategory::CPU; + if (str == "memory") return AlertCategory::MEMORY; + if (str == "disk") return AlertCategory::DISK; + if (str == "apt") return AlertCategory::APT; + if (str == "cve") return AlertCategory::CVE; + if (str == "service") return AlertCategory::SERVICE; + if (str == "system") return AlertCategory::SYSTEM; + return AlertCategory::SYSTEM; +} + +std::string AlertManager::status_to_string(AlertStatus status) { + switch (status) { + case AlertStatus::ACTIVE: return "active"; + case AlertStatus::ACKNOWLEDGED: return "acknowledged"; + case AlertStatus::DISMISSED: return "dismissed"; + default: return "unknown"; + } +} + +AlertStatus AlertManager::string_to_status(const std::string& str) { + if (str == "active") return AlertStatus::ACTIVE; + if (str == "acknowledged") return AlertStatus::ACKNOWLEDGED; + if (str == "dismissed") return AlertStatus::DISMISSED; + return AlertStatus::ACTIVE; +} + +} // namespace cortexd diff --git a/daemon/src/config/config.cpp b/daemon/src/config/config.cpp index 2bda89db..bb9a0e50 100644 --- a/daemon/src/config/config.cpp +++ b/daemon/src/config/config.cpp @@ -44,6 +44,41 @@ std::optional Config::load(const std::string& path) { config.log_level = yaml["log_level"].as(); } + // Monitoring thresholds + if (yaml["monitoring"]) { + auto monitoring = yaml["monitoring"]; + if (monitoring["cpu"]) { + auto cpu = monitoring["cpu"]; + if (cpu["warning_threshold"]) { + config.cpu_warning_threshold = cpu["warning_threshold"].as(); + } + if (cpu["critical_threshold"]) { + config.cpu_critical_threshold = 
cpu["critical_threshold"].as(); + } + } + if (monitoring["memory"]) { + auto memory = monitoring["memory"]; + if (memory["warning_threshold"]) { + config.memory_warning_threshold = memory["warning_threshold"].as(); + } + if (memory["critical_threshold"]) { + config.memory_critical_threshold = memory["critical_threshold"].as(); + } + } + if (monitoring["disk"]) { + auto disk = monitoring["disk"]; + if (disk["warning_threshold"]) { + config.disk_warning_threshold = disk["warning_threshold"].as(); + } + if (disk["critical_threshold"]) { + config.disk_critical_threshold = disk["critical_threshold"].as(); + } + } + if (monitoring["check_interval_seconds"]) { + config.monitor_check_interval_seconds = monitoring["check_interval_seconds"].as(); + } + } + // Expand paths and validate config.expand_paths(); std::string error = config.validate(); @@ -86,6 +121,23 @@ bool Config::save(const std::string& path) const { // Logging out << YAML::Key << "log_level" << YAML::Value << log_level; + // Monitoring thresholds + out << YAML::Key << "monitoring" << YAML::Value << YAML::BeginMap; + out << YAML::Key << "cpu" << YAML::Value << YAML::BeginMap; + out << YAML::Key << "warning_threshold" << YAML::Value << cpu_warning_threshold; + out << YAML::Key << "critical_threshold" << YAML::Value << cpu_critical_threshold; + out << YAML::EndMap; + out << YAML::Key << "memory" << YAML::Value << YAML::BeginMap; + out << YAML::Key << "warning_threshold" << YAML::Value << memory_warning_threshold; + out << YAML::Key << "critical_threshold" << YAML::Value << memory_critical_threshold; + out << YAML::EndMap; + out << YAML::Key << "disk" << YAML::Value << YAML::BeginMap; + out << YAML::Key << "warning_threshold" << YAML::Value << disk_warning_threshold; + out << YAML::Key << "critical_threshold" << YAML::Value << disk_critical_threshold; + out << YAML::EndMap; + out << YAML::Key << "check_interval_seconds" << YAML::Value << monitor_check_interval_seconds; + out << YAML::EndMap; + out << YAML::EndMap; std::ofstream file(expanded_path); @@ -121,6 +173,30 @@ std::string Config::validate() const { if (log_level < 0 || log_level > 4) { return "log_level must be between 0 and 4"; } + if (cpu_warning_threshold < 0 || cpu_warning_threshold > 100 || + cpu_critical_threshold < 0 || cpu_critical_threshold > 100) { + return "CPU thresholds must be between 0 and 100"; + } + if (cpu_warning_threshold >= cpu_critical_threshold) { + return "CPU warning threshold must be less than critical threshold"; + } + if (memory_warning_threshold < 0 || memory_warning_threshold > 100 || + memory_critical_threshold < 0 || memory_critical_threshold > 100) { + return "Memory thresholds must be between 0 and 100"; + } + if (memory_warning_threshold >= memory_critical_threshold) { + return "Memory warning threshold must be less than critical threshold"; + } + if (disk_warning_threshold < 0 || disk_warning_threshold > 100 || + disk_critical_threshold < 0 || disk_critical_threshold > 100) { + return "Disk thresholds must be between 0 and 100"; + } + if (disk_warning_threshold >= disk_critical_threshold) { + return "Disk warning threshold must be less than critical threshold"; + } + if (monitor_check_interval_seconds <= 0) { + return "monitor_check_interval_seconds must be positive"; + } return ""; // Valid } diff --git a/daemon/src/core/daemon.cpp b/daemon/src/core/daemon.cpp index d2d25fe1..6fdb29d9 100644 --- a/daemon/src/core/daemon.cpp +++ b/daemon/src/core/daemon.cpp @@ -119,6 +119,7 @@ void Daemon::request_shutdown() { void 
Daemon::register_service(std::unique_ptr service) { LOG_DEBUG("Daemon", "Registering service: " + std::string(service->name())); + std::unique_lock lock(services_mutex_); services_.push_back(std::move(service)); } @@ -164,18 +165,20 @@ bool Daemon::reload_config() { return false; } -void Daemon::reset() { - // Reset all singleton state for test isolation - // This ensures each test starts with a clean daemon state - // WARNING: This function has no synchronization and should ONLY be called - // when the daemon is stopped and no other threads are accessing services_. - // For production builds, consider using #ifdef TESTING guards. - - // Stop any running services first - stop_services(); - - // Clear all registered services - services_.clear(); + void Daemon::reset() { + // Reset all singleton state for test isolation + // This ensures each test starts with a clean daemon state + // WARNING: This function has no synchronization and should ONLY be called + // when the daemon is stopped and no other threads are accessing services_. + // For production builds, consider using #ifdef TESTING guards. + + // Stop any running services first + stop_services(); + + // Clear all registered services (exclusive lock for write) + std::unique_lock lock(services_mutex_); + services_.clear(); + lock.unlock(); // Reset state flags shutdown_requested_.store(false, std::memory_order_relaxed); @@ -204,13 +207,27 @@ void Daemon::reset() { } bool Daemon::start_services() { + std::unique_lock lock(services_mutex_); + // Sort services by priority (higher first) std::sort(services_.begin(), services_.end(), [](const auto& a, const auto& b) { return a->priority() > b->priority(); }); + // Release lock before starting services (start() may take time) + lock.unlock(); + for (auto& service : services_) { + // Re-acquire lock to access services_ safely + std::shared_lock read_lock(services_mutex_); + auto it = std::find_if(services_.begin(), services_.end(), + [&service](const auto& s) { return s.get() == service.get(); }); + if (it == services_.end()) { + read_lock.unlock(); + continue; // Service was removed + } + read_lock.unlock(); LOG_INFO("Daemon", "Starting service: " + std::string(service->name())); if (!service->start()) { @@ -227,13 +244,21 @@ void Daemon::reset() { } void Daemon::stop_services() { - // Stop services in reverse order (lower priority first) + std::shared_lock lock(services_mutex_); + + // Copy service pointers to avoid holding lock during stop() + std::vector service_ptrs; for (auto it = services_.rbegin(); it != services_.rend(); ++it) { - auto& service = *it; - if (service->is_running()) { - LOG_INFO("Daemon", "Stopping service: " + std::string(service->name())); - service->stop(); - LOG_INFO("Daemon", "Service stopped: " + std::string(service->name())); + service_ptrs.push_back(it->get()); + } + lock.unlock(); // Release lock before stopping services + + // Stop services in reverse order (lower priority first) + for (auto* service_ptr : service_ptrs) { + if (service_ptr->is_running()) { + LOG_INFO("Daemon", "Stopping service: " + std::string(service_ptr->name())); + service_ptr->stop(); + LOG_INFO("Daemon", "Service stopped: " + std::string(service_ptr->name())); } } } @@ -254,10 +279,13 @@ void Daemon::reset() { reload_config(); } - // Check service health - for (auto& service : services_) { - if (service->is_running() && !service->is_healthy()) { - LOG_WARN("Daemon", "Service unhealthy: " + std::string(service->name())); + // Check service health (read-only access - use shared lock) + { + 
std::shared_lock lock(services_mutex_); + for (const auto& service : services_) { + if (service->is_running() && !service->is_healthy()) { + LOG_WARN("Daemon", "Service unhealthy: " + std::string(service->name())); + } } } diff --git a/daemon/src/ipc/handlers.cpp b/daemon/src/ipc/handlers.cpp index a789064b..48caab41 100644 --- a/daemon/src/ipc/handlers.cpp +++ b/daemon/src/ipc/handlers.cpp @@ -7,11 +7,18 @@ #include "cortexd/core/daemon.h" #include "cortexd/config.h" #include "cortexd/logger.h" +#include "cortexd/monitor/system_monitor.h" +#include "cortexd/alerts/alert_manager.h" +#include "cortexd/ipc/server.h" namespace cortexd { -void Handlers::register_all(IPCServer& server) { - // Basic handlers only +void Handlers::register_all( + IPCServer& server, + SystemMonitor* monitor, + std::shared_ptr alerts +) { + // Basic handlers server.register_handler(Methods::PING, [](const Request& req) { return handle_ping(req); }); @@ -34,7 +41,38 @@ void Handlers::register_all(IPCServer& server) { return handle_shutdown(req); }); - LOG_INFO("Handlers", "Registered 5 core IPC handlers"); + // Monitoring handlers (if monitor is available) + if (monitor) { + server.register_handler(Methods::HEALTH, [monitor](const Request& req) { + return handle_health(req, monitor); + }); + } + + // Alert handlers (if alerts is available) + if (alerts) { + // Both "alerts" and "alerts.get" map to the same handler + server.register_handler(Methods::ALERTS, [alerts](const Request& req) { + return handle_alerts_get(req, alerts); + }); + + server.register_handler(Methods::ALERTS_GET, [alerts](const Request& req) { + return handle_alerts_get(req, alerts); + }); + + server.register_handler(Methods::ALERTS_ACK, [alerts](const Request& req) { + return handle_alerts_acknowledge(req, alerts); + }); + + server.register_handler(Methods::ALERTS_DISMISS, [alerts](const Request& req) { + return handle_alerts_dismiss(req, alerts); + }); + } + + int handler_count = 5; // Core handlers + if (monitor) handler_count += 1; // Health + if (alerts) handler_count += 4; // Alerts + Alerts get + ack + dismiss + + LOG_INFO("Handlers", "Registered " + std::to_string(handler_count) + " IPC handlers"); } Response Handlers::handle_ping(const Request& /*req*/) { @@ -76,4 +114,130 @@ Response Handlers::handle_shutdown(const Request& /*req*/) { return Response::ok({{"shutdown", "initiated"}}); } +Response Handlers::handle_health(const Request& /*req*/, SystemMonitor* monitor) { + if (!monitor) { + return Response::err("System monitor not available", ErrorCodes::INTERNAL_ERROR); + } + + SystemHealth health = monitor->get_health(); + MonitoringThresholds thresholds = monitor->get_thresholds(); + + json result = health.to_json(); + result["thresholds"] = { + {"cpu", { + {"warning", thresholds.cpu_warning}, + {"critical", thresholds.cpu_critical} + }}, + {"memory", { + {"warning", thresholds.memory_warning}, + {"critical", thresholds.memory_critical} + }}, + {"disk", { + {"warning", thresholds.disk_warning}, + {"critical", thresholds.disk_critical} + }} + }; + + return Response::ok(result); +} + +Response Handlers::handle_alerts_get(const Request& req, std::shared_ptr alerts) { + if (!alerts) { + return Response::err("Alert manager not available", ErrorCodes::INTERNAL_ERROR); + } + + AlertFilter filter; + filter.include_dismissed = false; // Default: don't include dismissed + + // Parse filter parameters + if (req.params.is_object()) { + if (req.params.contains("severity")) { + std::string severity_str = req.params["severity"].get(); + filter.severity = 
AlertManager::string_to_severity(severity_str); + } + + if (req.params.contains("category")) { + std::string category_str = req.params["category"].get(); + filter.category = AlertManager::string_to_category(category_str); + } + + if (req.params.contains("status")) { + std::string status_str = req.params["status"].get(); + filter.status = AlertManager::string_to_status(status_str); + } + + if (req.params.contains("include_dismissed")) { + filter.include_dismissed = req.params["include_dismissed"].get(); + } + } + + auto alert_list = alerts->get_alerts(filter); + json alerts_json = json::array(); + + for (const auto& alert : alert_list) { + alerts_json.push_back(alert.to_json()); + } + + json result; + result["alerts"] = alerts_json; + result["count"] = alert_list.size(); + result["counts"] = alerts->get_alert_counts(); + + return Response::ok(result); +} + +Response Handlers::handle_alerts_acknowledge(const Request& req, std::shared_ptr alerts) { + if (!alerts) { + return Response::err("Alert manager not available", ErrorCodes::INTERNAL_ERROR); + } + + // Check if acknowledging all or specific UUID + if (req.params.is_object() && req.params.contains("all") && req.params["all"].get()) { + size_t count = alerts->acknowledge_all(); + return Response::ok({ + {"acknowledged", count}, + {"message", "Acknowledged " + std::to_string(count) + " alert(s)"} + }); + } else if (req.params.is_object() && req.params.contains("uuid")) { + std::string uuid = req.params["uuid"].get(); + if (alerts->acknowledge_alert(uuid)) { + return Response::ok({ + {"acknowledged", true}, + {"uuid", uuid} + }); + } else { + return Response::err("Alert not found or already acknowledged", ErrorCodes::ALERT_NOT_FOUND); + } + } else { + // Default: acknowledge all + size_t count = alerts->acknowledge_all(); + return Response::ok({ + {"acknowledged", count}, + {"message", "Acknowledged " + std::to_string(count) + " alert(s)"} + }); + } +} + +Response Handlers::handle_alerts_dismiss(const Request& req, std::shared_ptr alerts) { + if (!alerts) { + return Response::err("Alert manager not available", ErrorCodes::INTERNAL_ERROR); + } + + std::string uuid; + if (req.params.is_object() && req.params.contains("uuid")) { + uuid = req.params["uuid"].get(); + } else { + return Response::err("UUID required for dismiss", ErrorCodes::INVALID_PARAMS); + } + + if (alerts->dismiss_alert(uuid)) { + return Response::ok({ + {"dismissed", true}, + {"uuid", uuid} + }); + } else { + return Response::err("Alert not found", ErrorCodes::ALERT_NOT_FOUND); + } +} + } // namespace cortexd \ No newline at end of file diff --git a/daemon/src/ipc/server.cpp b/daemon/src/ipc/server.cpp index 949bfe76..360c700b 100644 --- a/daemon/src/ipc/server.cpp +++ b/daemon/src/ipc/server.cpp @@ -15,38 +15,77 @@ namespace cortexd { - // RateLimiter implementation - - RateLimiter::RateLimiter(int max_per_second) - : max_per_second_(max_per_second) - , window_start_(std::chrono::steady_clock::now()) { - } - - bool RateLimiter::allow() { - std::lock_guard lock(mutex_); - - auto now = std::chrono::steady_clock::now(); - auto elapsed = std::chrono::duration_cast(now - window_start_); - - // Reset window every second - if (elapsed.count() >= 1000) { - count_ = 0; - window_start_ = now; - } - - if (count_ >= max_per_second_) { - return false; - } - - count_++; - return true; - } - - void RateLimiter::reset() { - std::lock_guard lock(mutex_); - count_ = 0; - window_start_ = std::chrono::steady_clock::now(); - } +// RateLimiter implementation (lock-free) + 
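+// Editor's note (illustrative sketch, not part of the original change): the limiter
+// below replaces the mutex with two atomics - window_start_rep_ (the window start
+// stored as a raw steady_clock tick count) and count_. Worked example with
+// max_per_second_ = 3: four calls arriving inside the same 1-second window observe
+// count_ = 0, 1, 2 and succeed; the fourth sees current >= 3 before its
+// compare_exchange and is rejected, so the configured limit is never exceeded.
+// Once 1000 ms have elapsed, whichever caller wins the CAS on window_start_rep_
+// stores 0 into count_ and a fresh window begins.
+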
+RateLimiter::RateLimiter(int max_per_second) + : max_per_second_(max_per_second) { + auto now = std::chrono::steady_clock::now(); + auto now_rep = now.time_since_epoch().count(); + window_start_rep_.store(now_rep, std::memory_order_relaxed); +} + +bool RateLimiter::allow() { + auto now = std::chrono::steady_clock::now(); + auto now_rep = now.time_since_epoch().count(); + auto window_start_rep = window_start_rep_.load(std::memory_order_acquire); + + std::chrono::steady_clock::time_point window_start{ + std::chrono::steady_clock::duration{window_start_rep} + }; + + auto elapsed = std::chrono::duration_cast(now - window_start); + + // Reset window every second (lock-free compare-and-swap) + if (elapsed.count() >= 1000) { + // Try to update window_start atomically + auto expected = window_start_rep; + if (window_start_rep_.compare_exchange_weak( + expected, now_rep, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + // We won the race to reset - reset count atomically + count_.store(0, std::memory_order_release); + } else { + // If we lost the race, reload window_start as another thread may have reset + window_start_rep = window_start_rep_.load(std::memory_order_acquire); + // Recalculate elapsed time with new window_start + std::chrono::steady_clock::time_point new_window_start{ + std::chrono::steady_clock::duration{window_start_rep} + }; + elapsed = std::chrono::duration_cast(now - new_window_start); + // If still in new window and count was reset, current will be 0, which is fine + } + } + + // Secure lock-free increment with check: use compare-and-swap loop + // This ensures we never exceed the limit, even under high concurrency + int current; + int next; + do { + current = count_.load(std::memory_order_acquire); + + // Check limit BEFORE incrementing (security: never exceed limit) + if (current >= max_per_second_) { + return false; // Rate limit exceeded + } + + next = current + 1; + // Atomically increment only if count hasn't changed (prevents race conditions) + } while (!count_.compare_exchange_weak( + current, next, + std::memory_order_release, + std::memory_order_acquire)); + + // Successfully incremented without exceeding limit + return true; +} + +void RateLimiter::reset() { + auto now = std::chrono::steady_clock::now(); + auto now_rep = now.time_since_epoch().count(); + count_.store(0, std::memory_order_relaxed); + window_start_rep_.store(now_rep, std::memory_order_relaxed); +} // IPCServer implementation @@ -109,11 +148,11 @@ return running_.load() && server_fd_ != -1; } - void IPCServer::register_handler(const std::string& method, RequestHandler handler) { - std::lock_guard lock(handlers_mutex_); - handlers_[method] = std::move(handler); - LOG_DEBUG("IPCServer", "Registered handler for: " + method); - } +void IPCServer::register_handler(const std::string& method, RequestHandler handler) { + std::unique_lock lock(handlers_mutex_); // Exclusive lock for write + handlers_[method] = std::move(handler); + LOG_DEBUG("IPCServer", "Registered handler for: " + method); +} bool IPCServer::create_socket() { // Create socket @@ -247,9 +286,9 @@ bool IPCServer::setup_permissions() { return; } - buffer[bytes] = '\0'; - std::string raw_request(buffer.data()); - LOG_DEBUG("IPCServer", "Received: " + raw_request); + buffer[bytes] = '\0'; + std::string raw_request(buffer.data()); + LOG_DEBUG("IPCServer", "Received request (" + std::to_string(bytes) + " bytes)"); // Check rate limit if (!rate_limiter_.allow()) { @@ -276,9 +315,9 @@ bool IPCServer::setup_permissions() { response = 
dispatch(*request); } - // Send response - std::string response_str = response.to_json(); - LOG_DEBUG("IPCServer", "Sending: " + response_str); + // Send response + std::string response_str = response.to_json(); + LOG_DEBUG("IPCServer", "Sending response (" + std::to_string(response_str.length()) + " bytes)"); if (send(client_fd, response_str.c_str(), response_str.length(), 0) == -1) { LOG_ERROR("IPCServer", "Failed to send response: " + std::string(strerror(errno))); @@ -299,28 +338,28 @@ bool IPCServer::setup_permissions() { connections_cv_.notify_all(); } - Response IPCServer::dispatch(const Request& request) { - RequestHandler handler; - { - std::lock_guard lock(handlers_mutex_); - - auto it = handlers_.find(request.method); - if (it == handlers_.end()) { - LOG_WARN("IPCServer", "Unknown method: " + request.method); - return Response::err("Method not found: " + request.method, ErrorCodes::METHOD_NOT_FOUND); - } - - // Copy handler to execute outside the lock - handler = it->second; - } +Response IPCServer::dispatch(const Request& request) { + RequestHandler handler; + { + std::shared_lock lock(handlers_mutex_); // Shared lock for read + + auto it = handlers_.find(request.method); + if (it == handlers_.end()) { + LOG_WARN("IPCServer", "Unknown method: " + request.method); + return Response::err("Method not found: " + request.method, ErrorCodes::METHOD_NOT_FOUND); + } + + // Copy handler to execute outside the lock + handler = it->second; + } - // Execute handler without holding the mutex to prevent deadlock - // if handler calls back into server (e.g., registering another handler) - LOG_INFO("IPCServer", "Handler found, invoking..."); - try { - Response resp = handler(request); - LOG_INFO("IPCServer", "Handler completed successfully"); - return resp; + // Execute handler without holding the mutex to prevent deadlock + // if handler calls back into server (e.g., registering another handler) + LOG_DEBUG("IPCServer", "Handler found, invoking: " + request.method); + try { + Response resp = handler(request); + LOG_DEBUG("IPCServer", "Handler completed: " + request.method); + return resp; } catch (const std::exception& e) { LOG_ERROR("IPCServer", "Handler error for " + request.method + ": " + e.what()); return Response::err(e.what(), ErrorCodes::INTERNAL_ERROR); diff --git a/daemon/src/main.cpp b/daemon/src/main.cpp index 98d6cd9b..206ba652 100644 --- a/daemon/src/main.cpp +++ b/daemon/src/main.cpp @@ -9,8 +9,11 @@ #include "cortexd/logger.h" #include "cortexd/config.h" #include "cortexd/common.h" +#include "cortexd/monitor/system_monitor.h" +#include "cortexd/alerts/alert_manager.h" #include #include +#include using namespace cortexd; @@ -99,20 +102,58 @@ return 1; } - // Get configuration - const auto& config = ConfigManager::instance().get(); - - // Create IPC server - auto ipc_server = std::make_unique( - config.socket_path, - config.max_requests_per_sec - ); - - // Register IPC handlers - Handlers::register_all(*ipc_server); - - // Register services with daemon - daemon.register_service(std::move(ipc_server)); + // Get configuration + const auto& config = ConfigManager::instance().get(); + + // Create alert manager (shared pointer for use by multiple components) + auto alert_manager = std::make_shared(); + if (!alert_manager->initialize()) { + LOG_ERROR("main", "Failed to initialize alert manager"); + return 1; + } + + // Create monitoring thresholds from config + MonitoringThresholds thresholds; + thresholds.cpu_warning = config.cpu_warning_threshold; + thresholds.cpu_critical = 
config.cpu_critical_threshold; + thresholds.memory_warning = config.memory_warning_threshold; + thresholds.memory_critical = config.memory_critical_threshold; + thresholds.disk_warning = config.disk_warning_threshold; + thresholds.disk_critical = config.disk_critical_threshold; + + // Create system monitor with config thresholds + auto system_monitor = std::make_unique( + alert_manager, + config.monitor_check_interval_seconds, + thresholds + ); + + // Create IPC server + auto ipc_server = std::make_unique( + config.socket_path, + config.max_requests_per_sec + ); + + // Register IPC handlers (with monitor and alerts) + Handlers::register_all(*ipc_server, system_monitor.get(), alert_manager); + + // Register config change callback to update monitor thresholds on reload + SystemMonitor* monitor_ptr = system_monitor.get(); + ConfigManager::instance().on_change([monitor_ptr](const Config& config) { + MonitoringThresholds thresholds; + thresholds.cpu_warning = config.cpu_warning_threshold; + thresholds.cpu_critical = config.cpu_critical_threshold; + thresholds.memory_warning = config.memory_warning_threshold; + thresholds.memory_critical = config.memory_critical_threshold; + thresholds.disk_warning = config.disk_warning_threshold; + thresholds.disk_critical = config.disk_critical_threshold; + monitor_ptr->set_thresholds(thresholds); + LOG_INFO("main", "Updated SystemMonitor thresholds from config"); + }); + + // Register services with daemon + daemon.register_service(std::move(ipc_server)); + daemon.register_service(std::move(system_monitor)); // Run daemon (blocks until shutdown) int exit_code = daemon.run(); diff --git a/daemon/src/monitor/system_monitor.cpp b/daemon/src/monitor/system_monitor.cpp new file mode 100644 index 00000000..e7d0534e --- /dev/null +++ b/daemon/src/monitor/system_monitor.cpp @@ -0,0 +1,551 @@ +/** + * @file system_monitor.cpp + * @brief System monitoring implementation + */ + +#include "cortexd/monitor/system_monitor.h" +#include "cortexd/logger.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cortexd { + +// SystemHealth JSON conversion +json SystemHealth::to_json() const { + json j; + j["cpu"] = { + {"usage_percent", cpu_usage_percent}, + {"cores", cpu_cores} + }; + j["memory"] = { + {"usage_percent", memory_usage_percent}, + {"total_bytes", memory_total_bytes}, + {"used_bytes", memory_used_bytes}, + {"available_bytes", memory_available_bytes} + }; + j["disk"] = { + {"usage_percent", disk_usage_percent}, + {"total_bytes", disk_total_bytes}, + {"used_bytes", disk_used_bytes}, + {"available_bytes", disk_available_bytes}, + {"mount_point", disk_mount_point} + }; + j["system"] = { + {"uptime_seconds", uptime_seconds}, + {"failed_services_count", failed_services_count} + }; + return j; +} + +// SystemMonitor implementation +SystemMonitor::SystemMonitor( + std::shared_ptr alert_manager, + int check_interval_seconds, + const MonitoringThresholds& thresholds +) : alert_manager_(alert_manager), + check_interval_seconds_(check_interval_seconds), + thresholds_(thresholds), + current_health_{}, + last_cpu_time_(std::chrono::steady_clock::now()), + systemd_bus_(nullptr) { +} + +SystemMonitor::~SystemMonitor() { + stop(); + // Clean up systemd bus connection + std::lock_guard lock(bus_mutex_); + if (systemd_bus_) { + sd_bus_unref(static_cast(systemd_bus_)); + systemd_bus_ = nullptr; + } +} + +bool SystemMonitor::start() { + if (running_.load()) { + LOG_WARN("SystemMonitor", "Already running"); + return true; + } + 
+ if (!alert_manager_) { + LOG_ERROR("SystemMonitor", "Alert manager not set"); + return false; + } + + running_.store(true); + monitor_thread_ = std::make_unique(&SystemMonitor::monitor_loop, this); + + LOG_INFO("SystemMonitor", "Started monitoring (interval: " + + std::to_string(check_interval_seconds_) + "s)"); + return true; +} + +void SystemMonitor::stop() { + if (!running_.load()) { + return; + } + + running_.store(false); + + if (monitor_thread_ && monitor_thread_->joinable()) { + monitor_thread_->join(); + } + + monitor_thread_.reset(); + LOG_INFO("SystemMonitor", "Stopped"); +} + +bool SystemMonitor::is_running() const { + return running_.load(); +} + +bool SystemMonitor::is_healthy() const { + return running_.load(); +} + +SystemHealth SystemMonitor::get_health() const { + std::lock_guard lock(health_mutex_); + return current_health_; +} + +void SystemMonitor::monitor_loop() { + while (running_.load()) { + try { + SystemHealth health = check_health(); + + { + std::lock_guard lock(health_mutex_); + current_health_ = health; + } + + check_thresholds(health); + + } catch (const std::exception& e) { + LOG_ERROR("SystemMonitor", "Error in monitoring loop: " + std::string(e.what())); + } + + // Sleep with periodic checks for shutdown + for (int i = 0; i < check_interval_seconds_ && running_.load(); ++i) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } +} + +SystemHealth SystemMonitor::check_health() { + SystemHealth health; + + // CPU + health.cpu_usage_percent = get_cpu_usage(); + long cores = sysconf(_SC_NPROCESSORS_ONLN); + // sysconf returns -1 on error, ensure at least 1 core + health.cpu_cores = (cores > 0) ? static_cast(cores) : 1; + + // Memory + get_memory_usage(health.memory_total_bytes, + health.memory_used_bytes, + health.memory_available_bytes); + if (health.memory_total_bytes > 0) { + health.memory_usage_percent = + (static_cast(health.memory_used_bytes) / health.memory_total_bytes) * 100.0; + } else { + health.memory_usage_percent = 0.0; + } + + // Disk + get_disk_usage(health.disk_total_bytes, + health.disk_used_bytes, + health.disk_available_bytes, + health.disk_mount_point); + if (health.disk_total_bytes > 0) { + health.disk_usage_percent = + (static_cast(health.disk_used_bytes) / health.disk_total_bytes) * 100.0; + } else { + health.disk_usage_percent = 0.0; + } + + // System + health.uptime_seconds = get_uptime(); + health.failed_services_count = get_failed_services_count(); + + return health; +} + +std::string SystemMonitor::read_proc_file_cached(const std::string& path, ProcFileCache& cache) const { + auto now = std::chrono::steady_clock::now(); + + std::lock_guard lock(proc_cache_mutex_); + + // Check if cache is valid + auto elapsed = std::chrono::duration_cast(now - cache.timestamp); + if (!cache.content.empty() && elapsed < ProcFileCache::ttl) { + return cache.content; + } + + // Read file + std::ifstream file(path); + if (!file.is_open()) { + return ""; + } + + std::string content; + std::string line; + while (std::getline(file, line)) { + content += line + "\n"; + } + + // Update cache + cache.content = content; + cache.timestamp = now; + + return content; +} + +double SystemMonitor::get_cpu_usage() const { + std::string stat_content = read_proc_file_cached("/proc/stat", proc_stat_cache_); + if (stat_content.empty()) { + return 0.0; + } + + std::istringstream stat_file(stat_content); + std::string line; + if (!std::getline(stat_file, line)) { + return 0.0; + } + + std::istringstream iss(line); + std::string cpu; + long user, nice, system, 
idle, iowait, irq, softirq, steal; + + iss >> cpu >> user >> nice >> system >> idle >> iowait >> irq >> softirq >> steal; + + if (iss.fail()) { + return 0.0; + } + + long total_idle = idle + iowait; + long total_non_idle = user + nice + system + irq + softirq + steal; + long total = total_idle + total_non_idle; + + auto now = std::chrono::steady_clock::now(); + + // Thread-safe access to CPU state + std::lock_guard lock(cpu_state_mutex_); + + auto elapsed = std::chrono::duration_cast(now - last_cpu_time_).count(); + + if (cpu_first_call_ || elapsed < 100) { + // First call or not enough time elapsed, initialize and return 0 + last_cpu_idle_ = static_cast(total_idle); + last_cpu_total_ = static_cast(total); + last_cpu_time_ = now; + cpu_first_call_ = false; + return 0.0; + } + + double idle_diff = static_cast(total_idle) - last_cpu_idle_; + double total_diff = static_cast(total) - last_cpu_total_; + + last_cpu_idle_ = static_cast(total_idle); + last_cpu_total_ = static_cast(total); + last_cpu_time_ = now; + + if (total_diff == 0.0) { + return 0.0; + } + + double cpu_percent = (1.0 - (idle_diff / total_diff)) * 100.0; + return std::max(0.0, std::min(100.0, cpu_percent)); +} + +void SystemMonitor::get_memory_usage(uint64_t& total, uint64_t& used, uint64_t& available) const { + std::string meminfo_content = read_proc_file_cached("/proc/meminfo", proc_meminfo_cache_); + if (meminfo_content.empty()) { + total = used = available = 0; + return; + } + + std::istringstream meminfo(meminfo_content); + uint64_t mem_total = 0, mem_free = 0, mem_available = 0, buffers = 0, cached = 0; + std::string line; + + while (std::getline(meminfo, line)) { + std::istringstream iss(line); + std::string key; + uint64_t value; + std::string unit; + + iss >> key >> value >> unit; + + if (key == "MemTotal:") { + mem_total = value * 1024; // Convert from KB to bytes + } else if (key == "MemFree:") { + mem_free = value * 1024; + } else if (key == "MemAvailable:") { + mem_available = value * 1024; + } else if (key == "Buffers:") { + buffers = value * 1024; + } else if (key == "Cached:") { + cached = value * 1024; + } + } + + total = mem_total; + + if (mem_available > 0) { + // Use MemAvailable if available (more accurate) + available = mem_available; + used = total - available; + } else { + // Fallback calculation + available = mem_free + buffers + cached; + used = total - available; + } +} + +void SystemMonitor::get_disk_usage(uint64_t& total, uint64_t& used, uint64_t& available, std::string& mount_point) const { + // Monitor root filesystem by default + mount_point = "/"; + + struct statvfs stat; + if (statvfs("/", &stat) != 0) { + total = used = available = 0; + return; + } + + total = static_cast(stat.f_blocks) * stat.f_frsize; + available = static_cast(stat.f_bavail) * stat.f_frsize; + used = total - (static_cast(stat.f_bfree) * stat.f_frsize); +} + +uint64_t SystemMonitor::get_uptime() const { + std::string uptime_content = read_proc_file_cached("/proc/uptime", proc_uptime_cache_); + if (uptime_content.empty()) { + return 0; + } + + std::istringstream uptime_file(uptime_content); + double uptime_seconds; + uptime_file >> uptime_seconds; + + return static_cast(uptime_seconds); +} + +void* SystemMonitor::get_systemd_bus() const { + std::lock_guard lock(bus_mutex_); + + // Reuse existing connection if available + if (systemd_bus_) { + return systemd_bus_; + } + + // Create new connection + sd_bus* bus = nullptr; + int r = sd_bus_default_system(&bus); + if (r < 0) { + LOG_DEBUG("SystemMonitor", "Failed to connect to 
systemd bus"); + return nullptr; + } + + systemd_bus_ = bus; + return systemd_bus_; +} + +int SystemMonitor::get_failed_services_count() const { + sd_bus* bus = static_cast(get_systemd_bus()); + if (!bus) { + return 0; + } + + sd_bus_message* reply = nullptr; + int r = sd_bus_call_method( + bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "ListUnits", + nullptr, + &reply, + "" + ); + + int failed_count = 0; + + if (r >= 0 && reply) { + r = sd_bus_message_enter_container(reply, 'a', "(ssssssouso)"); + if (r > 0) { + while (true) { + const char* name = nullptr; + const char* desc = nullptr; + const char* load = nullptr; + const char* active = nullptr; + const char* sub = nullptr; + const char* following = nullptr; + const char* object_path = nullptr; + uint32_t job_id = 0; + const char* job_type = nullptr; + const char* job_path = nullptr; + + r = sd_bus_message_read(reply, "(ssssssouso)", + &name, &desc, &load, &active, &sub, &following, + &object_path, &job_id, &job_type, &job_path); + + if (r <= 0) { + break; + } + + if (active && std::string(active) == "failed") { + failed_count++; + } + } + } + sd_bus_message_unref(reply); + } else if (r < 0) { + // Connection error - reset bus connection for next call + LOG_DEBUG("SystemMonitor", "systemd bus call failed, will reconnect next time"); + std::lock_guard lock(bus_mutex_); + if (systemd_bus_) { + sd_bus_unref(static_cast(systemd_bus_)); + systemd_bus_ = nullptr; + } + } + + return failed_count; +} + +void SystemMonitor::check_thresholds(const SystemHealth& health) { + // CPU checks + if (health.cpu_usage_percent >= thresholds_.cpu_critical) { + create_basic_alert( + AlertSeverity::CRITICAL, + AlertCategory::CPU, + "system_monitor", + "CPU usage critical", + "CPU usage is at " + std::to_string(static_cast(health.cpu_usage_percent)) + + "% (threshold: " + std::to_string(static_cast(thresholds_.cpu_critical)) + "%)" + ); + } else if (health.cpu_usage_percent >= thresholds_.cpu_warning) { + create_basic_alert( + AlertSeverity::WARNING, + AlertCategory::CPU, + "system_monitor", + "CPU usage high", + "CPU usage is at " + std::to_string(static_cast(health.cpu_usage_percent)) + + "% (threshold: " + std::to_string(static_cast(thresholds_.cpu_warning)) + "%)" + ); + } + + // Memory checks + if (health.memory_usage_percent >= thresholds_.memory_critical) { + create_basic_alert( + AlertSeverity::CRITICAL, + AlertCategory::MEMORY, + "system_monitor", + "Memory usage critical", + "Memory usage is at " + std::to_string(static_cast(health.memory_usage_percent)) + + "% (threshold: " + std::to_string(static_cast(thresholds_.memory_critical)) + "%)" + ); + } else if (health.memory_usage_percent >= thresholds_.memory_warning) { + create_basic_alert( + AlertSeverity::WARNING, + AlertCategory::MEMORY, + "system_monitor", + "Memory usage high", + "Memory usage is at " + std::to_string(static_cast(health.memory_usage_percent)) + + "% (threshold: " + std::to_string(static_cast(thresholds_.memory_warning)) + "%)" + ); + } + + // Disk checks + if (health.disk_usage_percent >= thresholds_.disk_critical) { + create_basic_alert( + AlertSeverity::CRITICAL, + AlertCategory::DISK, + "system_monitor", + "Disk usage critical", + "Disk usage on " + health.disk_mount_point + " is at " + + std::to_string(static_cast(health.disk_usage_percent)) + + "% (threshold: " + std::to_string(static_cast(thresholds_.disk_critical)) + "%)" + ); + } else if (health.disk_usage_percent >= thresholds_.disk_warning) { + create_basic_alert( 
+ AlertSeverity::WARNING, + AlertCategory::DISK, + "system_monitor", + "Disk usage high", + "Disk usage on " + health.disk_mount_point + " is at " + + std::to_string(static_cast(health.disk_usage_percent)) + + "% (threshold: " + std::to_string(static_cast(thresholds_.disk_warning)) + "%)" + ); + } + + // Failed services check + if (health.failed_services_count > 0) { + create_basic_alert( + AlertSeverity::ERROR, + AlertCategory::SERVICE, + "system_monitor", + "Failed systemd services detected", + std::to_string(health.failed_services_count) + " systemd service(s) are in failed state" + ); + } +} + +void SystemMonitor::create_basic_alert( + AlertSeverity severity, + AlertCategory category, + const std::string& source, + const std::string& message, + const std::string& description +) { + if (!alert_manager_) { + return; + } + + // Create hash key for O(1) deduplication check + std::string alert_key = std::to_string(static_cast(category)) + ":" + + std::to_string(static_cast(severity)) + ":" + + source + ":" + message; + + // Fix race condition: use atomic check-and-insert pattern + // This prevents two threads from both creating the same alert + { + std::lock_guard lock(alert_keys_mutex_); + // Check if already exists + if (active_alert_keys_.find(alert_key) != active_alert_keys_.end()) { + // Alert already exists, don't create duplicate + return; + } + + // Insert BEFORE creating alert to prevent race condition + // If creation fails, we'll remove it below + active_alert_keys_.insert(alert_key); + } + + Alert alert; + alert.severity = severity; + alert.category = category; + alert.source = source; + alert.message = message; + alert.description = description; + alert.status = AlertStatus::ACTIVE; + alert.timestamp = std::chrono::system_clock::now(); + + auto created = alert_manager_->create_alert(alert); + if (!created.has_value()) { + // Creation failed - remove from hash set + std::lock_guard lock(alert_keys_mutex_); + active_alert_keys_.erase(alert_key); + return; + } + + LOG_DEBUG("SystemMonitor", "Created alert: " + message); +} + +} // namespace cortexd diff --git a/daemon/tests/CMakeLists.txt b/daemon/tests/CMakeLists.txt index 9406865b..f4ab6a96 100644 --- a/daemon/tests/CMakeLists.txt +++ b/daemon/tests/CMakeLists.txt @@ -7,6 +7,8 @@ set(DAEMON_TEST_SOURCES ${CMAKE_SOURCE_DIR}/src/ipc/server.cpp ${CMAKE_SOURCE_DIR}/src/ipc/protocol.cpp ${CMAKE_SOURCE_DIR}/src/ipc/handlers.cpp + ${CMAKE_SOURCE_DIR}/src/monitor/system_monitor.cpp + ${CMAKE_SOURCE_DIR}/src/alerts/alert_manager.cpp ${CMAKE_SOURCE_DIR}/src/utils/logger.cpp ) @@ -15,14 +17,14 @@ add_library(cortexd_lib STATIC ${DAEMON_TEST_SOURCES}) target_include_directories(cortexd_lib PUBLIC ${CMAKE_SOURCE_DIR}/include ${SYSTEMD_INCLUDE_DIRS} - ${OPENSSL_INCLUDE_DIRS} ${UUID_INCLUDE_DIRS} + ${SQLITE3_INCLUDE_DIRS} ) target_link_libraries(cortexd_lib PUBLIC ${SYSTEMD_LIBRARIES} - ${OPENSSL_LIBRARIES} ${UUID_LIBRARIES} + ${SQLITE3_LIBRARIES} nlohmann_json::nlohmann_json yaml-cpp::yaml-cpp pthread @@ -63,6 +65,12 @@ add_executable(test_common target_link_libraries(test_common PRIVATE cortexd_lib GTest::gtest_main) add_test(NAME test_common COMMAND test_common) +add_executable(test_alert_manager + unit/test_alert_manager.cpp +) +target_link_libraries(test_alert_manager PRIVATE cortexd_lib GTest::gtest_main) +add_test(NAME test_alert_manager COMMAND test_alert_manager) + # Integration tests add_executable(test_ipc_server integration/test_ipc_server.cpp @@ -85,6 +93,6 @@ add_test(NAME test_daemon COMMAND test_daemon) # Add custom target to 
run all tests add_custom_target(run_tests COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure - DEPENDS test_config test_protocol test_rate_limiter test_logger test_common test_ipc_server test_handlers test_daemon + DEPENDS test_config test_protocol test_rate_limiter test_logger test_common test_alert_manager test_ipc_server test_handlers test_daemon COMMENT "Running all cortexd tests" ) diff --git a/daemon/tests/integration/test_handlers.cpp b/daemon/tests/integration/test_handlers.cpp index 4675eaeb..ddfff48a 100644 --- a/daemon/tests/integration/test_handlers.cpp +++ b/daemon/tests/integration/test_handlers.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include #include @@ -17,6 +19,9 @@ #include "cortexd/config.h" #include "cortexd/core/daemon.h" #include "cortexd/logger.h" +#include "cortexd/monitor/system_monitor.h" +#include "cortexd/alerts/alert_manager.h" +#include namespace fs = std::filesystem; @@ -52,11 +57,18 @@ log_level: 1 } void TearDown() override { + if (system_monitor_) { + system_monitor_->stop(); + system_monitor_.reset(); + } + if (server_) { server_->stop(); server_.reset(); } + alert_manager_.reset(); + fs::remove_all(temp_dir_); cortexd::Logger::shutdown(); } @@ -69,6 +81,37 @@ log_level: 1 std::this_thread::sleep_for(std::chrono::milliseconds(50)); } + void start_server_with_monitoring() { + auto config = cortexd::ConfigManager::instance().get(); + server_ = std::make_unique(socket_path_, config.max_requests_per_sec); + + // Create alert manager + std::string alert_db = (temp_dir_ / "alerts.db").string(); + alert_manager_ = std::make_shared(alert_db); + ASSERT_TRUE(alert_manager_->initialize()); + + // Create system monitor with explicit thresholds (matching defaults) + cortexd::MonitoringThresholds thresholds; + thresholds.cpu_warning = 80.0; + thresholds.cpu_critical = 95.0; + thresholds.memory_warning = 80.0; + thresholds.memory_critical = 95.0; + thresholds.disk_warning = 80.0; + thresholds.disk_critical = 95.0; + system_monitor_ = std::make_unique(alert_manager_, 60, thresholds); + + // Start the monitor to populate health data + ASSERT_TRUE(system_monitor_->start()); + + // Register handlers with monitoring + cortexd::Handlers::register_all(*server_, system_monitor_.get(), alert_manager_); + ASSERT_TRUE(server_->start()); + + // Wait for monitor thread to start and run at least once to populate health data + // The monitor loop calls check_health() immediately when it starts + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } + std::string send_request(const std::string& request) { int sock = socket(AF_UNIX, SOCK_STREAM, 0); if (sock == -1) return ""; @@ -112,13 +155,19 @@ log_level: 1 return cortexd::json{{"error", "empty response"}}; } - return cortexd::json::parse(response); + try { + return cortexd::json::parse(response); + } catch (const std::exception& e) { + return cortexd::json{{"error", "json parse error"}, {"message", e.what()}}; + } } fs::path temp_dir_; std::string socket_path_; std::string config_path_; std::unique_ptr server_; + std::shared_ptr alert_manager_; + std::unique_ptr system_monitor_; }; // ============================================================================ @@ -270,34 +319,144 @@ TEST_F(HandlersTest, UnknownMethodReturnsError) { EXPECT_EQ(response["error"]["code"], cortexd::ErrorCodes::METHOD_NOT_FOUND); } -TEST_F(HandlersTest, StatusMethodNotAvailableInPR1) { - start_server_with_handlers(); +// ============================================================================ +// Health 
handler tests +// ============================================================================ + +TEST_F(HandlersTest, HealthReturnsSystemMetrics) { + start_server_with_monitoring(); - // Status handler is not registered in PR 1 - auto response = send_json_request("status"); + auto response = send_json_request("health"); - EXPECT_FALSE(response["success"]); - EXPECT_EQ(response["error"]["code"], cortexd::ErrorCodes::METHOD_NOT_FOUND); + EXPECT_TRUE(response["success"]); + EXPECT_TRUE(response["result"].contains("cpu")); + EXPECT_TRUE(response["result"].contains("memory")); + EXPECT_TRUE(response["result"].contains("disk")); + EXPECT_TRUE(response["result"].contains("system")); + EXPECT_TRUE(response["result"].contains("thresholds")); } -TEST_F(HandlersTest, HealthMethodNotAvailableInPR1) { - start_server_with_handlers(); +TEST_F(HandlersTest, HealthReturnsValidCpuMetrics) { + start_server_with_monitoring(); - // Health handler is not registered in PR 1 auto response = send_json_request("health"); - EXPECT_FALSE(response["success"]); - EXPECT_EQ(response["error"]["code"], cortexd::ErrorCodes::METHOD_NOT_FOUND); + EXPECT_TRUE(response["success"]); + auto cpu = response["result"]["cpu"]; + EXPECT_TRUE(cpu.contains("usage_percent")); + EXPECT_TRUE(cpu.contains("cores")); + EXPECT_GE(cpu["usage_percent"], 0.0); + EXPECT_LE(cpu["usage_percent"], 100.0); + EXPECT_GT(cpu["cores"], 0); } -TEST_F(HandlersTest, AlertsMethodNotAvailableInPR1) { - start_server_with_handlers(); +TEST_F(HandlersTest, HealthReturnsValidMemoryMetrics) { + start_server_with_monitoring(); + + auto response = send_json_request("health"); + + EXPECT_TRUE(response["success"]); + auto memory = response["result"]["memory"]; + EXPECT_TRUE(memory.contains("usage_percent")); + EXPECT_TRUE(memory.contains("total_bytes")); + EXPECT_TRUE(memory.contains("used_bytes")); + EXPECT_TRUE(memory.contains("available_bytes")); + EXPECT_GE(memory["usage_percent"], 0.0); + EXPECT_LE(memory["usage_percent"], 100.0); +} + +// ============================================================================ +// Alerts handler tests +// ============================================================================ + +TEST_F(HandlersTest, AlertsGetReturnsAlertsList) { + start_server_with_monitoring(); - // Alerts handler is not registered in PR 1 auto response = send_json_request("alerts"); - EXPECT_FALSE(response["success"]); - EXPECT_EQ(response["error"]["code"], cortexd::ErrorCodes::METHOD_NOT_FOUND); + EXPECT_TRUE(response["success"]); + EXPECT_TRUE(response["result"].contains("alerts")); + EXPECT_TRUE(response["result"].contains("count")); + EXPECT_TRUE(response["result"].contains("counts")); + EXPECT_TRUE(response["result"]["alerts"].is_array()); +} + +TEST_F(HandlersTest, AlertsGetWithSeverityFilter) { + start_server_with_monitoring(); + + // Create a test alert + cortexd::Alert alert; + alert.severity = cortexd::AlertSeverity::WARNING; + alert.category = cortexd::AlertCategory::CPU; + alert.source = "test"; + alert.message = "Test warning"; + alert.status = cortexd::AlertStatus::ACTIVE; + alert_manager_->create_alert(alert); + + auto response = send_json_request("alerts", {{"severity", "warning"}}); + + EXPECT_TRUE(response["success"]); + auto alerts = response["result"]["alerts"]; + EXPECT_GE(alerts.size(), 1); + + // All returned alerts should be warnings + for (const auto& a : alerts) { + EXPECT_EQ(a["severity_name"], "warning"); + } +} + +TEST_F(HandlersTest, AlertsAcknowledgeAll) { + start_server_with_monitoring(); + + // Create multiple alerts 
+ for (int i = 0; i < 3; ++i) { + cortexd::Alert alert; + alert.severity = cortexd::AlertSeverity::INFO; + alert.category = cortexd::AlertCategory::SYSTEM; + alert.source = "test"; + alert.message = "Test alert " + std::to_string(i); + alert.status = cortexd::AlertStatus::ACTIVE; + alert_manager_->create_alert(alert); + } + + auto response = send_json_request("alerts.acknowledge", {{"all", true}}); + + EXPECT_TRUE(response["success"]); + EXPECT_GE(response["result"]["acknowledged"], 3); +} + +TEST_F(HandlersTest, AlertsDismiss) { + start_server_with_monitoring(); + + // Create an alert + cortexd::Alert alert; + alert.severity = cortexd::AlertSeverity::WARNING; + alert.category = cortexd::AlertCategory::CPU; + alert.source = "test"; + alert.message = "Test alert"; + alert.status = cortexd::AlertStatus::ACTIVE; + auto created = alert_manager_->create_alert(alert); + ASSERT_TRUE(created.has_value()); + + auto response = send_json_request("alerts.dismiss", {{"uuid", created->uuid}}); + + EXPECT_TRUE(response["success"]); + EXPECT_TRUE(response["result"]["dismissed"]); + EXPECT_EQ(response["result"]["uuid"], created->uuid); + + // Verify it's dismissed + auto get_response = send_json_request("alerts"); + auto alerts = get_response["result"]["alerts"]; + bool found = false; + for (const auto& a : alerts) { + if (a["uuid"] == created->uuid) { + found = true; + EXPECT_EQ(a["status_name"], "dismissed"); + break; + } + } + // Dismissed alerts are excluded by default, so it shouldn't be in the list + EXPECT_FALSE(found); } // ============================================================================ diff --git a/daemon/tests/unit/test_alert_manager.cpp b/daemon/tests/unit/test_alert_manager.cpp new file mode 100644 index 00000000..2fc6a5aa --- /dev/null +++ b/daemon/tests/unit/test_alert_manager.cpp @@ -0,0 +1,268 @@ +/** + * @file test_alert_manager.cpp + * @brief Unit tests for AlertManager + */ + +#include +#include "cortexd/alerts/alert_manager.h" +#include +#include +#include + +using namespace cortexd; + +class AlertManagerTest : public ::testing::Test { +protected: + void SetUp() override { + // Create temporary database path + test_db_path_ = "/tmp/test_alerts_" + std::to_string(getpid()) + ".db"; + + // Remove test database if it exists + if (std::filesystem::exists(test_db_path_)) { + std::filesystem::remove(test_db_path_); + } + + alert_manager_ = std::make_unique(test_db_path_); + ASSERT_TRUE(alert_manager_->initialize()); + } + + void TearDown() override { + alert_manager_.reset(); + + // Clean up test database + if (std::filesystem::exists(test_db_path_)) { + std::filesystem::remove(test_db_path_); + } + } + + std::string test_db_path_; + std::unique_ptr alert_manager_; +}; + +TEST_F(AlertManagerTest, CreateAlert) { + Alert alert; + alert.severity = AlertSeverity::WARNING; + alert.category = AlertCategory::CPU; + alert.source = "test_source"; + alert.message = "Test alert message"; + alert.description = "Test alert description"; + alert.status = AlertStatus::ACTIVE; + alert.timestamp = std::chrono::system_clock::now(); + + auto created = alert_manager_->create_alert(alert); + ASSERT_TRUE(created.has_value()); + ASSERT_FALSE(created->uuid.empty()); + ASSERT_EQ(created->message, "Test alert message"); +} + +TEST_F(AlertManagerTest, GetAlert) { + Alert alert; + alert.severity = AlertSeverity::ERROR; + alert.category = AlertCategory::MEMORY; + alert.source = "test_source"; + alert.message = "Test alert"; + alert.status = AlertStatus::ACTIVE; + + auto created = 
alert_manager_->create_alert(alert); + ASSERT_TRUE(created.has_value()); + + auto retrieved = alert_manager_->get_alert(created->uuid); + ASSERT_TRUE(retrieved.has_value()); + ASSERT_EQ(retrieved->uuid, created->uuid); + ASSERT_EQ(retrieved->message, "Test alert"); + ASSERT_EQ(retrieved->severity, AlertSeverity::ERROR); +} + +TEST_F(AlertManagerTest, GetAlertsFilterBySeverity) { + // Create alerts with different severities + Alert alert1; + alert1.severity = AlertSeverity::WARNING; + alert1.category = AlertCategory::CPU; + alert1.source = "test"; + alert1.message = "Warning alert"; + alert1.status = AlertStatus::ACTIVE; + alert_manager_->create_alert(alert1); + + Alert alert2; + alert2.severity = AlertSeverity::ERROR; + alert2.category = AlertCategory::MEMORY; + alert2.source = "test"; + alert2.message = "Error alert"; + alert2.status = AlertStatus::ACTIVE; + alert_manager_->create_alert(alert2); + + AlertFilter filter; + filter.severity = AlertSeverity::WARNING; + auto alerts = alert_manager_->get_alerts(filter); + + ASSERT_EQ(alerts.size(), 1); + ASSERT_EQ(alerts[0].severity, AlertSeverity::WARNING); +} + +TEST_F(AlertManagerTest, GetAlertsFilterByCategory) { + Alert alert1; + alert1.severity = AlertSeverity::INFO; + alert1.category = AlertCategory::CPU; + alert1.source = "test"; + alert1.message = "CPU alert"; + alert1.status = AlertStatus::ACTIVE; + alert_manager_->create_alert(alert1); + + Alert alert2; + alert2.severity = AlertSeverity::INFO; + alert2.category = AlertCategory::DISK; + alert2.source = "test"; + alert2.message = "Disk alert"; + alert2.status = AlertStatus::ACTIVE; + alert_manager_->create_alert(alert2); + + AlertFilter filter; + filter.category = AlertCategory::CPU; + auto alerts = alert_manager_->get_alerts(filter); + + ASSERT_EQ(alerts.size(), 1); + ASSERT_EQ(alerts[0].category, AlertCategory::CPU); +} + +TEST_F(AlertManagerTest, AcknowledgeAlert) { + Alert alert; + alert.severity = AlertSeverity::WARNING; + alert.category = AlertCategory::CPU; + alert.source = "test"; + alert.message = "Test alert"; + alert.status = AlertStatus::ACTIVE; + + auto created = alert_manager_->create_alert(alert); + ASSERT_TRUE(created.has_value()); + + bool acknowledged = alert_manager_->acknowledge_alert(created->uuid); + ASSERT_TRUE(acknowledged); + + auto retrieved = alert_manager_->get_alert(created->uuid); + ASSERT_TRUE(retrieved.has_value()); + ASSERT_EQ(retrieved->status, AlertStatus::ACKNOWLEDGED); + ASSERT_TRUE(retrieved->acknowledged_at.has_value()); +} + +TEST_F(AlertManagerTest, AcknowledgeAll) { + // Create multiple active alerts + for (int i = 0; i < 3; ++i) { + Alert alert; + alert.severity = AlertSeverity::WARNING; + alert.category = AlertCategory::CPU; + alert.source = "test"; + alert.message = "Alert " + std::to_string(i); + alert.status = AlertStatus::ACTIVE; + alert_manager_->create_alert(alert); + } + + size_t count = alert_manager_->acknowledge_all(); + ASSERT_EQ(count, 3); + + AlertFilter filter; + filter.status = AlertStatus::ACKNOWLEDGED; + auto alerts = alert_manager_->get_alerts(filter); + ASSERT_EQ(alerts.size(), 3); +} + +TEST_F(AlertManagerTest, DismissAlert) { + Alert alert; + alert.severity = AlertSeverity::WARNING; + alert.category = AlertCategory::CPU; + alert.source = "test"; + alert.message = "Test alert"; + alert.status = AlertStatus::ACTIVE; + + auto created = alert_manager_->create_alert(alert); + ASSERT_TRUE(created.has_value()); + + bool dismissed = alert_manager_->dismiss_alert(created->uuid); + ASSERT_TRUE(dismissed); + + auto retrieved = 
alert_manager_->get_alert(created->uuid); + ASSERT_TRUE(retrieved.has_value()); + ASSERT_EQ(retrieved->status, AlertStatus::DISMISSED); + ASSERT_TRUE(retrieved->dismissed_at.has_value()); +} + +TEST_F(AlertManagerTest, GetAlertCounts) { + // Create alerts with different severities + Alert alert1; + alert1.severity = AlertSeverity::INFO; + alert1.category = AlertCategory::CPU; + alert1.source = "test"; + alert1.message = "Info alert"; + alert1.status = AlertStatus::ACTIVE; + alert_manager_->create_alert(alert1); + + Alert alert2; + alert2.severity = AlertSeverity::WARNING; + alert2.category = AlertCategory::MEMORY; + alert2.source = "test"; + alert2.message = "Warning alert"; + alert2.status = AlertStatus::ACTIVE; + alert_manager_->create_alert(alert2); + + Alert alert3; + alert3.severity = AlertSeverity::ERROR; + alert3.category = AlertCategory::DISK; + alert3.source = "test"; + alert3.message = "Error alert"; + alert3.status = AlertStatus::ACTIVE; + alert_manager_->create_alert(alert3); + + auto counts = alert_manager_->get_alert_counts(); + ASSERT_EQ(counts["info"], 1); + ASSERT_EQ(counts["warning"], 1); + ASSERT_EQ(counts["error"], 1); + ASSERT_EQ(counts["total"], 3); +} + +TEST_F(AlertManagerTest, AlertJsonConversion) { + Alert alert; + alert.uuid = AlertManager::generate_uuid(); + alert.severity = AlertSeverity::CRITICAL; + alert.category = AlertCategory::CPU; + alert.source = "test_source"; + alert.message = "Critical alert"; + alert.description = "Test description"; + alert.status = AlertStatus::ACTIVE; + alert.timestamp = std::chrono::system_clock::now(); + + json j = alert.to_json(); + ASSERT_EQ(j["uuid"], alert.uuid); + ASSERT_EQ(j["severity"], static_cast(AlertSeverity::CRITICAL)); + ASSERT_EQ(j["severity_name"], "critical"); + ASSERT_EQ(j["message"], "Critical alert"); + + Alert restored = Alert::from_json(j); + ASSERT_EQ(restored.uuid, alert.uuid); + ASSERT_EQ(restored.severity, AlertSeverity::CRITICAL); + ASSERT_EQ(restored.message, "Critical alert"); +} + +TEST_F(AlertManagerTest, ExcludeDismissedAlerts) { + Alert alert1; + alert1.severity = AlertSeverity::WARNING; + alert1.category = AlertCategory::CPU; + alert1.source = "test"; + alert1.message = "Active alert"; + alert1.status = AlertStatus::ACTIVE; + auto created1 = alert_manager_->create_alert(alert1); + + Alert alert2; + alert2.severity = AlertSeverity::WARNING; + alert2.category = AlertCategory::CPU; + alert2.source = "test"; + alert2.message = "Dismissed alert"; + alert2.status = AlertStatus::ACTIVE; + auto created2 = alert_manager_->create_alert(alert2); + + alert_manager_->dismiss_alert(created2->uuid); + + // Default filter should exclude dismissed + AlertFilter filter; + auto alerts = alert_manager_->get_alerts(filter); + ASSERT_EQ(alerts.size(), 1); + ASSERT_EQ(alerts[0].uuid, created1->uuid); +} diff --git a/daemon/tests/unit/test_common.cpp b/daemon/tests/unit/test_common.cpp index 661110e3..f3816fc7 100644 --- a/daemon/tests/unit/test_common.cpp +++ b/daemon/tests/unit/test_common.cpp @@ -1,8 +1,6 @@ /** * @file test_common.cpp - * @brief Unit tests for common.h constants and types (PR1 scope only) - * - * PR1 includes: Core daemon, IPC server, config management + * @brief Unit tests for common.h constants and types */ #include @@ -15,7 +13,7 @@ class CommonTest : public ::testing::Test { }; // ============================================================================ -// Version and Name constants (PR1) +// Version and Name constants // 
============================================================================ TEST_F(CommonTest, VersionIsDefined) { @@ -29,7 +27,7 @@ TEST_F(CommonTest, NameIsDefined) { } // ============================================================================ -// Socket constants (PR1 - used by IPC server) +// Socket constants // ============================================================================ TEST_F(CommonTest, DefaultSocketPathIsDefined) { @@ -52,7 +50,7 @@ TEST_F(CommonTest, MaxMessageSizeIsPositive) { } // ============================================================================ -// Startup time target (PR1 - daemon startup performance) +// Startup time target // ============================================================================ TEST_F(CommonTest, StartupTimeTargetIsDefined) { @@ -62,7 +60,7 @@ TEST_F(CommonTest, StartupTimeTargetIsDefined) { } // ============================================================================ -// Clock type alias (PR1 - used in IPC protocol) +// Clock type alias // ============================================================================ TEST_F(CommonTest, ClockTypeAliasIsDefined) { diff --git a/daemon/tests/unit/test_protocol.cpp b/daemon/tests/unit/test_protocol.cpp index 76802f77..fbc7dfcb 100644 --- a/daemon/tests/unit/test_protocol.cpp +++ b/daemon/tests/unit/test_protocol.cpp @@ -301,22 +301,15 @@ TEST_F(ProtocolTest, ResponseToJsonIncludesTimestamp) { } // ============================================================================ -// Methods namespace tests (PR1 methods only) +// Methods namespace tests // ============================================================================ -TEST_F(ProtocolTest, PR1MethodConstantsAreDefined) { - // PR1 available methods: ping, version, config.get, config.reload, shutdown +TEST_F(ProtocolTest, MethodConstantsAreDefined) { EXPECT_STREQ(cortexd::Methods::PING, "ping"); EXPECT_STREQ(cortexd::Methods::VERSION, "version"); EXPECT_STREQ(cortexd::Methods::CONFIG_GET, "config.get"); EXPECT_STREQ(cortexd::Methods::CONFIG_RELOAD, "config.reload"); EXPECT_STREQ(cortexd::Methods::SHUTDOWN, "shutdown"); -} - -TEST_F(ProtocolTest, PR2MethodConstantsAreDefined) { - // PR2 methods are defined in protocol.h but handlers not registered in PR1 - // These constants exist for forward compatibility - EXPECT_STREQ(cortexd::Methods::STATUS, "status"); EXPECT_STREQ(cortexd::Methods::HEALTH, "health"); EXPECT_STREQ(cortexd::Methods::ALERTS, "alerts"); } From 09aeab14a01c07087f1119d6775d93130a9cb916 Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 16:45:03 +0530 Subject: [PATCH 02/12] style(cli): Update UUID column style in alert table and fix UUID display - Modified the UUID display to show the full UUID instead of truncating it, enhancing clarity in alert information. 
--- cortex/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cortex/cli.py b/cortex/cli.py index 2680fb77..b4295c64 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -2739,7 +2739,7 @@ def _daemon_alerts(self, args: argparse.Namespace) -> int: from rich.table import Table table = Table(show_header=True, header_style="bold cyan") - table.add_column("UUID", style="dim") + table.add_column("UUID", style="bold cyan") table.add_column("Severity", style="bold") table.add_column("Category") table.add_column("Source") @@ -2757,7 +2757,7 @@ def _daemon_alerts(self, args: argparse.Namespace) -> int: }.get(severity_name, "white") table.add_row( - alert.get("uuid", "")[:8] + "...", + alert.get("uuid", ""), f"[{severity_color}]{severity_name.upper()}[/{severity_color}]", alert.get("category_name", "unknown"), alert.get("source", "unknown"), From 486b52b4cff7d49cbfc8b7229c9e11d250865352 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:23:04 +0000 Subject: [PATCH 03/12] [autofix.ci] apply automated fixes --- cortex/cli.py | 45 +++++++++++++++++++--------------- daemon/scripts/setup_daemon.py | 24 ++++++++---------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/cortex/cli.py b/cortex/cli.py index b4295c64..a6bc6020 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -2584,8 +2584,7 @@ def _daemon_health(self) -> int: usage = cpu.get("usage_percent", 0) color = "red" if usage >= 95 else "yellow" if usage >= 80 else "green" health_table.add_row( - "CPU Usage", - f"[{color}]{usage:.1f}%[/{color}] ({cpu.get('cores', 0)} cores)" + "CPU Usage", f"[{color}]{usage:.1f}%[/{color}] ({cpu.get('cores', 0)} cores)" ) # Memory @@ -2597,7 +2596,7 @@ def _daemon_health(self) -> int: mem_total_gb = mem.get("total_bytes", 0) / (1024**3) health_table.add_row( "Memory Usage", - f"[{color}]{usage:.1f}%[/{color}] ({mem_gb:.2f}GB / {mem_total_gb:.2f}GB)" + f"[{color}]{usage:.1f}%[/{color}] ({mem_gb:.2f}GB / {mem_total_gb:.2f}GB)", ) # Disk @@ -2610,7 +2609,7 @@ def _daemon_health(self) -> int: mount_point = disk.get("mount_point", "/") health_table.add_row( f"Disk Usage ({mount_point})", - f"[{color}]{usage:.1f}%[/{color}] ({disk_gb:.2f}GB / {disk_total_gb:.2f}GB)" + f"[{color}]{usage:.1f}%[/{color}] ({disk_gb:.2f}GB / {disk_total_gb:.2f}GB)", ) # System info @@ -2618,7 +2617,7 @@ def _daemon_health(self) -> int: sys_info = result["system"] uptime_hours = sys_info.get("uptime_seconds", 0) / 3600 health_table.add_row("System Uptime", f"{uptime_hours:.1f} hours") - + failed = sys_info.get("failed_services_count", 0) if failed > 0: health_table.add_row("Failed Services", f"[red]{failed}[/red]") @@ -2627,12 +2626,20 @@ def _daemon_health(self) -> int: # Display health panel console.print() - console.print(Panel(health_table, title="[bold cyan]System Health Metrics[/bold cyan]", border_style="cyan")) + console.print( + Panel( + health_table, + title="[bold cyan]System Health Metrics[/bold cyan]", + border_style="cyan", + ) + ) # Display thresholds in a separate panel if "thresholds" in result: thresholds = result["thresholds"] - threshold_table = Table(show_header=True, header_style="bold yellow", box=None, padding=(0, 2)) + threshold_table = Table( + show_header=True, header_style="bold yellow", box=None, padding=(0, 2) + ) threshold_table.add_column("Resource", style="bold") threshold_table.add_column("Warning", style="yellow") threshold_table.add_column("Critical", style="red") @@ -2640,27 +2647,27 @@ def 
_daemon_health(self) -> int: if "cpu" in thresholds: cpu_th = thresholds["cpu"] threshold_table.add_row( - "CPU", - f"{cpu_th.get('warning', 80)}%", - f"{cpu_th.get('critical', 95)}%" + "CPU", f"{cpu_th.get('warning', 80)}%", f"{cpu_th.get('critical', 95)}%" ) if "memory" in thresholds: mem_th = thresholds["memory"] threshold_table.add_row( - "Memory", - f"{mem_th.get('warning', 80)}%", - f"{mem_th.get('critical', 95)}%" + "Memory", f"{mem_th.get('warning', 80)}%", f"{mem_th.get('critical', 95)}%" ) if "disk" in thresholds: disk_th = thresholds["disk"] threshold_table.add_row( - "Disk", - f"{disk_th.get('warning', 80)}%", - f"{disk_th.get('critical', 95)}%" + "Disk", f"{disk_th.get('warning', 80)}%", f"{disk_th.get('critical', 95)}%" ) console.print() - console.print(Panel(threshold_table, title="[bold yellow]Monitoring Thresholds[/bold yellow]", border_style="yellow")) + console.print( + Panel( + threshold_table, + title="[bold yellow]Monitoring Thresholds[/bold yellow]", + border_style="yellow", + ) + ) return 0 @@ -4563,9 +4570,7 @@ def main(): daemon_subs.add_parser("health", help="Check system health") # daemon alerts - uses alerts IPC handlers - daemon_alerts_parser = daemon_subs.add_parser( - "alerts", help="Manage alerts" - ) + daemon_alerts_parser = daemon_subs.add_parser("alerts", help="Manage alerts") daemon_alerts_parser.add_argument( "--severity", choices=["info", "warning", "error", "critical"], diff --git a/daemon/scripts/setup_daemon.py b/daemon/scripts/setup_daemon.py index 42164133..5539b45e 100755 --- a/daemon/scripts/setup_daemon.py +++ b/daemon/scripts/setup_daemon.py @@ -402,22 +402,20 @@ def ensure_config_file() -> bool: bool: True if config file exists or was created successfully, False otherwise. """ config_path = Path(CONFIG_FILE) - + # If config already exists, we're done if config_path.exists(): return True - + # Check if template exists if not CONFIG_EXAMPLE.exists(): - console.print( - f"[yellow]Warning: Config template not found at {CONFIG_EXAMPLE}[/yellow]" - ) + console.print(f"[yellow]Warning: Config template not found at {CONFIG_EXAMPLE}[/yellow]") return False - + try: # Create /etc/cortex directory if needed config_path.parent.mkdir(parents=True, exist_ok=True) - + # Copy template to config file (requires sudo) result = subprocess.run( ["sudo", "cp", str(CONFIG_EXAMPLE), CONFIG_FILE], @@ -425,7 +423,7 @@ def ensure_config_file() -> bool: capture_output=True, text=True, ) - + if result.returncode == 0: # Set proper permissions subprocess.run( @@ -433,14 +431,12 @@ def ensure_config_file() -> bool: check=False, ) console.print(f"[green]Created config file: {CONFIG_FILE}[/green]") - log_audit_event("create_config", f"Created config from template") + log_audit_event("create_config", "Created config from template") return True else: - console.print( - f"[red]Failed to create config file: {result.stderr}[/red]" - ) + console.print(f"[red]Failed to create config file: {result.stderr}[/red]") return False - + except Exception as e: console.print(f"[red]Error creating config file: {e}[/red]") return False @@ -460,7 +456,7 @@ def install_daemon() -> bool: """ # Ensure config file exists before installation ensure_config_file() - + console.print("[cyan]Installing the daemon...[/cyan]") result = subprocess.run(["sudo", str(INSTALL_SCRIPT)], check=False) success = result.returncode == 0 From ead4a4b71f7c7e153956c05618e7225a1ff2e5fe Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 17:31:25 +0530 Subject: [PATCH 04/12] feat(cli): Enhance system health 
reporting with configurable thresholds - Added support for configurable CPU, memory, and disk usage thresholds in the Cortex CLI, allowing for dynamic alerting based on user-defined limits. - Updated health check logic to utilize these thresholds for determining alert colors, improving clarity in system health reporting. - Refactored the SystemMonitor class to validate check intervals and ensure proper alert management based on system metrics. - Enhanced error handling in the setup process for configuration files, ensuring robust directory creation and permission handling. - Improved integration tests to verify health data readiness and system monitoring functionality. --- cortex/cli.py | 18 ++- .../include/cortexd/monitor/system_monitor.h | 4 +- daemon/scripts/setup_daemon.py | 37 ++++- daemon/src/alerts/alert_manager.cpp | 132 ++++++++++++------ daemon/src/core/daemon.cpp | 27 ++-- daemon/src/ipc/handlers.cpp | 5 + daemon/src/main.cpp | 2 +- daemon/src/monitor/system_monitor.cpp | 72 ++++++++++ daemon/tests/integration/test_handlers.cpp | 19 ++- 9 files changed, 248 insertions(+), 68 deletions(-) diff --git a/cortex/cli.py b/cortex/cli.py index a6bc6020..d2b69d30 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -2578,11 +2578,23 @@ def _daemon_health(self) -> int: health_table.add_column("Metric", style="bold") health_table.add_column("Value", style="") + # Get thresholds from result, with defaults + thresholds = result.get("thresholds", {}) + cpu_thresholds = thresholds.get("cpu", {}) + cpu_warning = cpu_thresholds.get("warning", 80) + cpu_critical = cpu_thresholds.get("critical", 95) + mem_thresholds = thresholds.get("memory", {}) + mem_warning = mem_thresholds.get("warning", 80) + mem_critical = mem_thresholds.get("critical", 95) + disk_thresholds = thresholds.get("disk", {}) + disk_warning = disk_thresholds.get("warning", 80) + disk_critical = disk_thresholds.get("critical", 95) + # CPU if "cpu" in result: cpu = result["cpu"] usage = cpu.get("usage_percent", 0) - color = "red" if usage >= 95 else "yellow" if usage >= 80 else "green" + color = "red" if usage >= cpu_critical else "yellow" if usage >= cpu_warning else "green" health_table.add_row( "CPU Usage", f"[{color}]{usage:.1f}%[/{color}] ({cpu.get('cores', 0)} cores)" ) @@ -2591,7 +2603,7 @@ def _daemon_health(self) -> int: if "memory" in result: mem = result["memory"] usage = mem.get("usage_percent", 0) - color = "red" if usage >= 95 else "yellow" if usage >= 80 else "green" + color = "red" if usage >= mem_critical else "yellow" if usage >= mem_warning else "green" mem_gb = mem.get("used_bytes", 0) / (1024**3) mem_total_gb = mem.get("total_bytes", 0) / (1024**3) health_table.add_row( @@ -2603,7 +2615,7 @@ def _daemon_health(self) -> int: if "disk" in result: disk = result["disk"] usage = disk.get("usage_percent", 0) - color = "red" if usage >= 95 else "yellow" if usage >= 80 else "green" + color = "red" if usage >= disk_critical else "yellow" if usage >= disk_warning else "green" disk_gb = disk.get("used_bytes", 0) / (1024**3) disk_total_gb = disk.get("total_bytes", 0) / (1024**3) mount_point = disk.get("mount_point", "/") diff --git a/daemon/include/cortexd/monitor/system_monitor.h b/daemon/include/cortexd/monitor/system_monitor.h index ea7ed1f5..7fcd6a9b 100644 --- a/daemon/include/cortexd/monitor/system_monitor.h +++ b/daemon/include/cortexd/monitor/system_monitor.h @@ -71,12 +71,12 @@ class SystemMonitor : public Service { /** * @brief Construct system monitor * @param alert_manager Shared pointer to alert manager - * @param 
check_interval_seconds Interval between health checks (default: 60) + * @param check_interval_seconds Interval between health checks * @param thresholds Monitoring thresholds (required) */ explicit SystemMonitor( std::shared_ptr alert_manager, - int check_interval_seconds = 60, + int check_interval_seconds, const MonitoringThresholds& thresholds ); diff --git a/daemon/scripts/setup_daemon.py b/daemon/scripts/setup_daemon.py index 5539b45e..3504ce6f 100755 --- a/daemon/scripts/setup_daemon.py +++ b/daemon/scripts/setup_daemon.py @@ -401,6 +401,7 @@ def ensure_config_file() -> bool: Returns: bool: True if config file exists or was created successfully, False otherwise. """ + import os config_path = Path(CONFIG_FILE) # If config already exists, we're done @@ -414,8 +415,24 @@ def ensure_config_file() -> bool: try: # Create /etc/cortex directory if needed - config_path.parent.mkdir(parents=True, exist_ok=True) - + # Check if parent directory exists + if not config_path.parent.exists(): + # If running as root, create directly; otherwise use sudo + if os.geteuid() == 0: + config_path.parent.mkdir(parents=True, exist_ok=True) + else: + result = subprocess.run( + ["sudo", "mkdir", "-p", str(config_path.parent)], + check=False, + capture_output=True, + text=True, + ) + if result.returncode != 0: + console.print( + f"[red]Failed to create config directory: {result.stderr}[/red]" + ) + return False + # Copy template to config file (requires sudo) result = subprocess.run( ["sudo", "cp", str(CONFIG_EXAMPLE), CONFIG_FILE], @@ -426,10 +443,16 @@ def ensure_config_file() -> bool: if result.returncode == 0: # Set proper permissions - subprocess.run( + perm_result = subprocess.run( ["sudo", "chmod", "0644", CONFIG_FILE], check=False, + capture_output=True, + text=True, ) + if perm_result.returncode != 0: + console.print( + f"[yellow]Warning: Failed to set config file permissions: {perm_result.stderr}[/yellow]" + ) console.print(f"[green]Created config file: {CONFIG_FILE}[/green]") log_audit_event("create_config", "Created config from template") return True @@ -455,8 +478,12 @@ def install_daemon() -> bool: False otherwise. 
""" # Ensure config file exists before installation - ensure_config_file() - + result = ensure_config_file() + if not result: + console.print("[yellow]Warning: Config template missing or failed to create[/yellow]") + console.print("[red]Error: Cannot proceed with installation without config file[/red]") + sys.exit(1) + console.print("[cyan]Installing the daemon...[/cyan]") result = subprocess.run(["sudo", str(INSTALL_SCRIPT)], check=False) success = result.returncode == 0 diff --git a/daemon/src/alerts/alert_manager.cpp b/daemon/src/alerts/alert_manager.cpp index 80762f1a..4a86de38 100644 --- a/daemon/src/alerts/alert_manager.cpp +++ b/daemon/src/alerts/alert_manager.cpp @@ -13,9 +13,39 @@ #include #include #include +#include namespace cortexd { +// Cross-platform UTC time conversion helper +// Converts struct tm (assumed to be in UTC) to time_t +static time_t utc_timegm(struct tm* tm) { +#ifdef _WIN32 + return _mkgmtime(tm); +#else + return timegm(tm); +#endif +} + +// Thread-safe UTC time formatting helper +// Formats time_t as ISO 8601 UTC string using gmtime_r +static std::string format_utc_time(time_t time_val) { + struct tm tm_buf; +#ifdef _WIN32 + // Windows uses _gmtime_s instead of gmtime_r + if (_gmtime_s(&tm_buf, &time_val) != 0) { + return ""; + } +#else + if (gmtime_r(&time_val, &tm_buf) == nullptr) { + return ""; + } +#endif + std::stringstream ss; + ss << std::put_time(&tm_buf, "%Y-%m-%dT%H:%M:%SZ"); + return ss.str(); +} + // Alert JSON conversion json Alert::to_json() const { json j; @@ -28,27 +58,21 @@ json Alert::to_json() const { j["message"] = message; j["description"] = description; - // Convert timestamps to ISO 8601 strings + // Convert timestamps to ISO 8601 strings (thread-safe) auto time_t = std::chrono::system_clock::to_time_t(timestamp); - std::stringstream ss; - ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); - j["timestamp"] = ss.str(); + j["timestamp"] = format_utc_time(time_t); j["status"] = static_cast(status); j["status_name"] = AlertManager::status_to_string(status); if (acknowledged_at.has_value()) { auto ack_time_t = std::chrono::system_clock::to_time_t(acknowledged_at.value()); - std::stringstream ack_ss; - ack_ss << std::put_time(std::gmtime(&ack_time_t), "%Y-%m-%dT%H:%M:%SZ"); - j["acknowledged_at"] = ack_ss.str(); + j["acknowledged_at"] = format_utc_time(ack_time_t); } if (dismissed_at.has_value()) { auto dis_time_t = std::chrono::system_clock::to_time_t(dismissed_at.value()); - std::stringstream dis_ss; - dis_ss << std::put_time(std::gmtime(&dis_time_t), "%Y-%m-%dT%H:%M:%SZ"); - j["dismissed_at"] = dis_ss.str(); + j["dismissed_at"] = format_utc_time(dis_time_t); } return j; @@ -70,7 +94,7 @@ Alert Alert::from_json(const json& j) { std::istringstream ss(timestamp_str); ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); if (!ss.fail()) { - alert.timestamp = std::chrono::system_clock::from_time_t(std::mktime(&tm)); + alert.timestamp = std::chrono::system_clock::from_time_t(utc_timegm(&tm)); } else { alert.timestamp = std::chrono::system_clock::now(); } @@ -87,7 +111,7 @@ Alert Alert::from_json(const json& j) { std::istringstream ack_ss(ack_str); ack_ss >> std::get_time(&ack_tm, "%Y-%m-%dT%H:%M:%SZ"); if (!ack_ss.fail()) { - alert.acknowledged_at = std::chrono::system_clock::from_time_t(std::mktime(&ack_tm)); + alert.acknowledged_at = std::chrono::system_clock::from_time_t(utc_timegm(&ack_tm)); } } @@ -97,7 +121,7 @@ Alert Alert::from_json(const json& j) { std::istringstream dis_ss(dis_str); dis_ss >> std::get_time(&dis_tm, 
"%Y-%m-%dT%H:%M:%SZ"); if (!dis_ss.fail()) { - alert.dismissed_at = std::chrono::system_clock::from_time_t(std::mktime(&dis_tm)); + alert.dismissed_at = std::chrono::system_clock::from_time_t(utc_timegm(&dis_tm)); } } @@ -144,6 +168,32 @@ bool AlertManager::ensure_db_directory() { } return true; + } catch (const std::filesystem::filesystem_error& e) { + // Check if this is a permission-related error + if (e.code() == std::errc::permission_denied || + e.code() == std::errc::operation_not_permitted) { + // Fallback to user directory + const char* home = getenv("HOME"); + if (home) { + std::filesystem::path home_dir = std::filesystem::path(home); + db_dir = home_dir / ".cortex"; + try { + std::filesystem::create_directories(db_dir); + db_path_ = (db_dir / "alerts.db").string(); + LOG_WARN("AlertManager", "Permission denied for database directory, using user directory: " + db_path_); + return true; + } catch (const std::exception& fallback_e) { + LOG_ERROR("AlertManager", "Failed to create fallback database directory: " + std::string(fallback_e.what()) + " (original error: " + std::string(e.what()) + ")"); + return false; + } + } else { + LOG_ERROR("AlertManager", "Cannot determine home directory for fallback (original error: " + std::string(e.what()) + ")"); + return false; + } + } else { + LOG_ERROR("AlertManager", "Failed to create database directory: " + std::string(e.what())); + return false; + } } catch (const std::exception& e) { LOG_ERROR("AlertManager", "Failed to create database directory: " + std::string(e.what())); return false; @@ -407,11 +457,23 @@ std::optional AlertManager::create_alert(const Alert& alert) { new_alert.timestamp = std::chrono::system_clock::now(); } - // Convert timestamp to ISO 8601 string + // Convert timestamp to ISO 8601 string (thread-safe) auto time_t = std::chrono::system_clock::to_time_t(new_alert.timestamp); - std::stringstream ss; - ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); - std::string timestamp_str = ss.str(); + std::string timestamp_str = format_utc_time(time_t); + + // Store optional timestamp strings in persistent variables to avoid use-after-free + std::string ack_ts; + std::string dis_ts; + + if (new_alert.acknowledged_at.has_value()) { + auto ack_time_t = std::chrono::system_clock::to_time_t(new_alert.acknowledged_at.value()); + ack_ts = format_utc_time(ack_time_t); + } + + if (new_alert.dismissed_at.has_value()) { + auto dis_time_t = std::chrono::system_clock::to_time_t(new_alert.dismissed_at.value()); + dis_ts = format_utc_time(dis_time_t); + } int rc; { @@ -427,23 +489,17 @@ std::optional AlertManager::create_alert(const Alert& alert) { sqlite3_bind_text(stmt, 4, new_alert.source.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 5, new_alert.message.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 6, new_alert.description.c_str(), -1, SQLITE_STATIC); - sqlite3_bind_text(stmt, 7, timestamp_str.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_text(stmt, 7, timestamp_str.c_str(), -1, SQLITE_TRANSIENT); sqlite3_bind_int(stmt, 8, static_cast(new_alert.status)); if (new_alert.acknowledged_at.has_value()) { - auto ack_time_t = std::chrono::system_clock::to_time_t(new_alert.acknowledged_at.value()); - std::stringstream ack_ss; - ack_ss << std::put_time(std::gmtime(&ack_time_t), "%Y-%m-%dT%H:%M:%SZ"); - sqlite3_bind_text(stmt, 9, ack_ss.str().c_str(), -1, SQLITE_STATIC); + sqlite3_bind_text(stmt, 9, ack_ts.c_str(), -1, SQLITE_TRANSIENT); } else { sqlite3_bind_null(stmt, 9); } if (new_alert.dismissed_at.has_value()) { - auto 
dis_time_t = std::chrono::system_clock::to_time_t(new_alert.dismissed_at.value()); - std::stringstream dis_ss; - dis_ss << std::put_time(std::gmtime(&dis_time_t), "%Y-%m-%dT%H:%M:%SZ"); - sqlite3_bind_text(stmt, 10, dis_ss.str().c_str(), -1, SQLITE_STATIC); + sqlite3_bind_text(stmt, 10, dis_ts.c_str(), -1, SQLITE_TRANSIENT); } else { sqlite3_bind_null(stmt, 10); } @@ -498,7 +554,7 @@ std::optional AlertManager::get_alert(const std::string& uuid) { std::istringstream ss(timestamp_str); ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); if (!ss.fail()) { - alert.timestamp = std::chrono::system_clock::from_time_t(std::mktime(&tm)); + alert.timestamp = std::chrono::system_clock::from_time_t(utc_timegm(&tm)); } else { alert.timestamp = std::chrono::system_clock::now(); } @@ -512,7 +568,7 @@ std::optional AlertManager::get_alert(const std::string& uuid) { std::istringstream ack_ss(ack_str); ack_ss >> std::get_time(&ack_tm, "%Y-%m-%dT%H:%M:%SZ"); if (!ack_ss.fail()) { - alert.acknowledged_at = std::chrono::system_clock::from_time_t(std::mktime(&ack_tm)); + alert.acknowledged_at = std::chrono::system_clock::from_time_t(utc_timegm(&ack_tm)); } } @@ -522,7 +578,7 @@ std::optional AlertManager::get_alert(const std::string& uuid) { std::istringstream dis_ss(dis_str); dis_ss >> std::get_time(&dis_tm, "%Y-%m-%dT%H:%M:%SZ"); if (!dis_ss.fail()) { - alert.dismissed_at = std::chrono::system_clock::from_time_t(std::mktime(&dis_tm)); + alert.dismissed_at = std::chrono::system_clock::from_time_t(utc_timegm(&dis_tm)); } } } // Lock released - alert data is now copied @@ -588,7 +644,7 @@ std::vector AlertManager::get_alerts(const AlertFilter& filter) { std::istringstream ss(timestamp_str); ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); if (!ss.fail()) { - alert.timestamp = std::chrono::system_clock::from_time_t(std::mktime(&tm)); + alert.timestamp = std::chrono::system_clock::from_time_t(utc_timegm(&tm)); } else { alert.timestamp = std::chrono::system_clock::now(); } @@ -602,7 +658,7 @@ std::vector AlertManager::get_alerts(const AlertFilter& filter) { std::istringstream ack_ss(ack_str); ack_ss >> std::get_time(&ack_tm, "%Y-%m-%dT%H:%M:%SZ"); if (!ack_ss.fail()) { - alert.acknowledged_at = std::chrono::system_clock::from_time_t(std::mktime(&ack_tm)); + alert.acknowledged_at = std::chrono::system_clock::from_time_t(utc_timegm(&ack_tm)); } } @@ -612,7 +668,7 @@ std::vector AlertManager::get_alerts(const AlertFilter& filter) { std::istringstream dis_ss(dis_str); dis_ss >> std::get_time(&dis_tm, "%Y-%m-%dT%H:%M:%SZ"); if (!dis_ss.fail()) { - alert.dismissed_at = std::chrono::system_clock::from_time_t(std::mktime(&dis_tm)); + alert.dismissed_at = std::chrono::system_clock::from_time_t(utc_timegm(&dis_tm)); } } @@ -637,9 +693,7 @@ bool AlertManager::acknowledge_alert(const std::string& uuid) { sqlite3* db = static_cast(db_handle_); auto now = std::chrono::system_clock::now(); auto time_t = std::chrono::system_clock::to_time_t(now); - std::stringstream ss; - ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); - std::string timestamp_str = ss.str(); + std::string timestamp_str = format_utc_time(time_t); int rc; int changes = 0; @@ -675,9 +729,7 @@ size_t AlertManager::acknowledge_all() { sqlite3* db = static_cast(db_handle_); auto now = std::chrono::system_clock::now(); auto time_t = std::chrono::system_clock::to_time_t(now); - std::stringstream ss; - ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); - std::string timestamp_str = ss.str(); + std::string timestamp_str = format_utc_time(time_t); 
int rc; int changes = 0; @@ -728,9 +780,7 @@ bool AlertManager::dismiss_alert(const std::string& uuid) { sqlite3* db = static_cast(db_handle_); auto now = std::chrono::system_clock::now(); auto time_t = std::chrono::system_clock::to_time_t(now); - std::stringstream ss; - ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%SZ"); - std::string timestamp_str = ss.str(); + std::string timestamp_str = format_utc_time(time_t); int rc; int changes = 0; diff --git a/daemon/src/core/daemon.cpp b/daemon/src/core/daemon.cpp index 6fdb29d9..a3db649a 100644 --- a/daemon/src/core/daemon.cpp +++ b/daemon/src/core/daemon.cpp @@ -215,29 +215,28 @@ bool Daemon::reload_config() { return a->priority() > b->priority(); }); + // Copy service pointers to local vector while holding lock + // This prevents iterator invalidation when lock is released + std::vector service_ptrs; + for (const auto& service : services_) { + service_ptrs.push_back(service.get()); + } + // Release lock before starting services (start() may take time) lock.unlock(); - for (auto& service : services_) { - // Re-acquire lock to access services_ safely - std::shared_lock read_lock(services_mutex_); - auto it = std::find_if(services_.begin(), services_.end(), - [&service](const auto& s) { return s.get() == service.get(); }); - if (it == services_.end()) { - read_lock.unlock(); - continue; // Service was removed - } - read_lock.unlock(); - LOG_INFO("Daemon", "Starting service: " + std::string(service->name())); + // Iterate over local copy - no need to check if service was removed + for (auto* service_ptr : service_ptrs) { + LOG_INFO("Daemon", "Starting service: " + std::string(service_ptr->name())); - if (!service->start()) { - LOG_ERROR("Daemon", "Failed to start service: " + std::string(service->name())); + if (!service_ptr->start()) { + LOG_ERROR("Daemon", "Failed to start service: " + std::string(service_ptr->name())); // Stop already started services stop_services(); return false; } - LOG_INFO("Daemon", "Service started: " + std::string(service->name())); + LOG_INFO("Daemon", "Service started: " + std::string(service_ptr->name())); } return true; diff --git a/daemon/src/ipc/handlers.cpp b/daemon/src/ipc/handlers.cpp index 48caab41..dd1f26d2 100644 --- a/daemon/src/ipc/handlers.cpp +++ b/daemon/src/ipc/handlers.cpp @@ -166,6 +166,11 @@ Response Handlers::handle_alerts_get(const Request& req, std::shared_ptr(); + filter.source = source_str; + } + if (req.params.contains("include_dismissed")) { filter.include_dismissed = req.params["include_dismissed"].get(); } diff --git a/daemon/src/main.cpp b/daemon/src/main.cpp index 206ba652..5ee1880e 100644 --- a/daemon/src/main.cpp +++ b/daemon/src/main.cpp @@ -103,7 +103,7 @@ } // Get configuration - const auto& config = ConfigManager::instance().get(); + const auto config = ConfigManager::instance().get(); // Create alert manager (shared pointer for use by multiple components) auto alert_manager = std::make_shared(); diff --git a/daemon/src/monitor/system_monitor.cpp b/daemon/src/monitor/system_monitor.cpp index e7d0534e..3d3e6d71 100644 --- a/daemon/src/monitor/system_monitor.cpp +++ b/daemon/src/monitor/system_monitor.cpp @@ -56,6 +56,11 @@ SystemMonitor::SystemMonitor( current_health_{}, last_cpu_time_(std::chrono::steady_clock::now()), systemd_bus_(nullptr) { + // Validate check_interval_seconds to prevent busy-spin + if (check_interval_seconds_ <= 0) { + LOG_WARN("SystemMonitor", "Invalid check_interval_seconds (" + std::to_string(check_interval_seconds_) + "), clamping to minimum of 1 
second"); + check_interval_seconds_ = 1; + } } SystemMonitor::~SystemMonitor() { @@ -421,6 +426,13 @@ int SystemMonitor::get_failed_services_count() const { void SystemMonitor::check_thresholds(const SystemHealth& health) { // CPU checks + std::string cpu_critical_key = std::to_string(static_cast(AlertCategory::CPU)) + ":" + + std::to_string(static_cast(AlertSeverity::CRITICAL)) + ":" + + "system_monitor:CPU usage critical"; + std::string cpu_warning_key = std::to_string(static_cast(AlertCategory::CPU)) + ":" + + std::to_string(static_cast(AlertSeverity::WARNING)) + ":" + + "system_monitor:CPU usage high"; + if (health.cpu_usage_percent >= thresholds_.cpu_critical) { create_basic_alert( AlertSeverity::CRITICAL, @@ -439,9 +451,28 @@ void SystemMonitor::check_thresholds(const SystemHealth& health) { "CPU usage is at " + std::to_string(static_cast(health.cpu_usage_percent)) + "% (threshold: " + std::to_string(static_cast(thresholds_.cpu_warning)) + "%)" ); + // Remove critical key if it exists (downgraded from critical to warning) + { + std::lock_guard lock(alert_keys_mutex_); + active_alert_keys_.erase(cpu_critical_key); + } + } else { + // CPU usage recovered - remove both keys + { + std::lock_guard lock(alert_keys_mutex_); + active_alert_keys_.erase(cpu_critical_key); + active_alert_keys_.erase(cpu_warning_key); + } } // Memory checks + std::string mem_critical_key = std::to_string(static_cast(AlertCategory::MEMORY)) + ":" + + std::to_string(static_cast(AlertSeverity::CRITICAL)) + ":" + + "system_monitor:Memory usage critical"; + std::string mem_warning_key = std::to_string(static_cast(AlertCategory::MEMORY)) + ":" + + std::to_string(static_cast(AlertSeverity::WARNING)) + ":" + + "system_monitor:Memory usage high"; + if (health.memory_usage_percent >= thresholds_.memory_critical) { create_basic_alert( AlertSeverity::CRITICAL, @@ -460,9 +491,28 @@ void SystemMonitor::check_thresholds(const SystemHealth& health) { "Memory usage is at " + std::to_string(static_cast(health.memory_usage_percent)) + "% (threshold: " + std::to_string(static_cast(thresholds_.memory_warning)) + "%)" ); + // Remove critical key if it exists (downgraded from critical to warning) + { + std::lock_guard lock(alert_keys_mutex_); + active_alert_keys_.erase(mem_critical_key); + } + } else { + // Memory usage recovered - remove both keys + { + std::lock_guard lock(alert_keys_mutex_); + active_alert_keys_.erase(mem_critical_key); + active_alert_keys_.erase(mem_warning_key); + } } // Disk checks + std::string disk_critical_key = std::to_string(static_cast(AlertCategory::DISK)) + ":" + + std::to_string(static_cast(AlertSeverity::CRITICAL)) + ":" + + "system_monitor:Disk usage critical"; + std::string disk_warning_key = std::to_string(static_cast(AlertCategory::DISK)) + ":" + + std::to_string(static_cast(AlertSeverity::WARNING)) + ":" + + "system_monitor:Disk usage high"; + if (health.disk_usage_percent >= thresholds_.disk_critical) { create_basic_alert( AlertSeverity::CRITICAL, @@ -483,9 +533,25 @@ void SystemMonitor::check_thresholds(const SystemHealth& health) { std::to_string(static_cast(health.disk_usage_percent)) + "% (threshold: " + std::to_string(static_cast(thresholds_.disk_warning)) + "%)" ); + // Remove critical key if it exists (downgraded from critical to warning) + { + std::lock_guard lock(alert_keys_mutex_); + active_alert_keys_.erase(disk_critical_key); + } + } else { + // Disk usage recovered - remove both keys + { + std::lock_guard lock(alert_keys_mutex_); + active_alert_keys_.erase(disk_critical_key); + 
active_alert_keys_.erase(disk_warning_key); + } } // Failed services check + std::string service_key = std::to_string(static_cast(AlertCategory::SERVICE)) + ":" + + std::to_string(static_cast(AlertSeverity::ERROR)) + ":" + + "system_monitor:Failed systemd services detected"; + if (health.failed_services_count > 0) { create_basic_alert( AlertSeverity::ERROR, @@ -494,6 +560,12 @@ void SystemMonitor::check_thresholds(const SystemHealth& health) { "Failed systemd services detected", std::to_string(health.failed_services_count) + " systemd service(s) are in failed state" ); + } else { + // No failed services - remove key if it exists + { + std::lock_guard lock(alert_keys_mutex_); + active_alert_keys_.erase(service_key); + } } } diff --git a/daemon/tests/integration/test_handlers.cpp b/daemon/tests/integration/test_handlers.cpp index ddfff48a..80ad8ff1 100644 --- a/daemon/tests/integration/test_handlers.cpp +++ b/daemon/tests/integration/test_handlers.cpp @@ -108,8 +108,23 @@ log_level: 1 ASSERT_TRUE(server_->start()); // Wait for monitor thread to start and run at least once to populate health data - // The monitor loop calls check_health() immediately when it starts - std::this_thread::sleep_for(std::chrono::milliseconds(200)); + // Poll for health data readiness instead of fixed sleep + const auto timeout = std::chrono::seconds(5); + const auto poll_interval = std::chrono::milliseconds(100); + auto start_time = std::chrono::steady_clock::now(); + bool health_ready = false; + + while (std::chrono::steady_clock::now() - start_time < timeout) { + auto health = system_monitor_->get_health(); + // Check if health data is populated (monitor has run at least once) + if (health.cpu_cores > 0 || health.uptime_seconds > 0) { + health_ready = true; + break; + } + std::this_thread::sleep_for(poll_interval); + } + + ASSERT_TRUE(health_ready) << "SystemMonitor did not populate health data within timeout"; } std::string send_request(const std::string& request) { From 32409cda7a5d3ac2e45a67b1571b1d3b53c0892b Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Wed, 21 Jan 2026 12:03:33 +0000 Subject: [PATCH 05/12] [autofix.ci] apply automated fixes --- cortex/cli.py | 12 +++++++++--- daemon/scripts/setup_daemon.py | 9 ++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cortex/cli.py b/cortex/cli.py index d2b69d30..4ab878de 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -2594,7 +2594,9 @@ def _daemon_health(self) -> int: if "cpu" in result: cpu = result["cpu"] usage = cpu.get("usage_percent", 0) - color = "red" if usage >= cpu_critical else "yellow" if usage >= cpu_warning else "green" + color = ( + "red" if usage >= cpu_critical else "yellow" if usage >= cpu_warning else "green" + ) health_table.add_row( "CPU Usage", f"[{color}]{usage:.1f}%[/{color}] ({cpu.get('cores', 0)} cores)" ) @@ -2603,7 +2605,9 @@ def _daemon_health(self) -> int: if "memory" in result: mem = result["memory"] usage = mem.get("usage_percent", 0) - color = "red" if usage >= mem_critical else "yellow" if usage >= mem_warning else "green" + color = ( + "red" if usage >= mem_critical else "yellow" if usage >= mem_warning else "green" + ) mem_gb = mem.get("used_bytes", 0) / (1024**3) mem_total_gb = mem.get("total_bytes", 0) / (1024**3) health_table.add_row( @@ -2615,7 +2619,9 @@ def _daemon_health(self) -> int: if "disk" in result: disk = result["disk"] usage = disk.get("usage_percent", 0) - color = "red" if usage >= disk_critical else "yellow" if usage >= 
disk_warning else "green" + color = ( + "red" if usage >= disk_critical else "yellow" if usage >= disk_warning else "green" + ) disk_gb = disk.get("used_bytes", 0) / (1024**3) disk_total_gb = disk.get("total_bytes", 0) / (1024**3) mount_point = disk.get("mount_point", "/") diff --git a/daemon/scripts/setup_daemon.py b/daemon/scripts/setup_daemon.py index 3504ce6f..aeae898f 100755 --- a/daemon/scripts/setup_daemon.py +++ b/daemon/scripts/setup_daemon.py @@ -402,6 +402,7 @@ def ensure_config_file() -> bool: bool: True if config file exists or was created successfully, False otherwise. """ import os + config_path = Path(CONFIG_FILE) # If config already exists, we're done @@ -428,11 +429,9 @@ def ensure_config_file() -> bool: text=True, ) if result.returncode != 0: - console.print( - f"[red]Failed to create config directory: {result.stderr}[/red]" - ) + console.print(f"[red]Failed to create config directory: {result.stderr}[/red]") return False - + # Copy template to config file (requires sudo) result = subprocess.run( ["sudo", "cp", str(CONFIG_EXAMPLE), CONFIG_FILE], @@ -483,7 +482,7 @@ def install_daemon() -> bool: console.print("[yellow]Warning: Config template missing or failed to create[/yellow]") console.print("[red]Error: Cannot proceed with installation without config file[/red]") sys.exit(1) - + console.print("[cyan]Installing the daemon...[/cyan]") result = subprocess.run(["sudo", str(INSTALL_SCRIPT)], check=False) success = result.returncode == 0 From bd8340bce82c48d94bdb3ec4a5acc0e6abbc184e Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 17:52:17 +0530 Subject: [PATCH 06/12] feat(monitor): Implement thread-safe access for monitoring thresholds - Added a shared mutex to the SystemMonitor class to ensure thread-safe access to monitoring thresholds. - Refactored get_thresholds and set_thresholds methods to utilize shared and unique locks, respectively, enhancing concurrency control. - Updated the AlertManager to hold a lock while updating alert counters during acknowledgment, preventing race conditions. 
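In short, readers copy the thresholds under a std::shared_lock and the single writer replaces them under a std::unique_lock. A minimal, self-contained sketch of that pattern follows; Thresholds and ThresholdStore are placeholder names for illustration only, not the daemon's actual types (the real accessors are SystemMonitor::get_thresholds()/set_thresholds(), shown in the diff below):

    #include <shared_mutex>

    struct Thresholds {
        double cpu_warning = 80.0;
        double cpu_critical = 95.0;
    };

    class ThresholdStore {
    public:
        // Readers take a shared (read) lock and return a copy, so the lock
        // is never held while the caller evaluates the values.
        Thresholds get() const {
            std::shared_lock lock(mutex_);
            return thresholds_;
        }

        // The writer (e.g. a config reload) takes an exclusive lock.
        void set(const Thresholds& t) {
            std::unique_lock lock(mutex_);
            thresholds_ = t;
        }

    private:
        mutable std::shared_mutex mutex_;   // mutable so get() can stay const
        Thresholds thresholds_;
    };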
---
 .../include/cortexd/monitor/system_monitor.h |  7 ++++--
 daemon/src/alerts/alert_manager.cpp           | 23 ++++++++++---------
 daemon/src/monitor/system_monitor.cpp         | 13 +++++++++++
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/daemon/include/cortexd/monitor/system_monitor.h b/daemon/include/cortexd/monitor/system_monitor.h
index 7fcd6a9b..0d45942f 100644
--- a/daemon/include/cortexd/monitor/system_monitor.h
+++ b/daemon/include/cortexd/monitor/system_monitor.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include <shared_mutex>
 #include 
 #include 

@@ -98,18 +99,20 @@ class SystemMonitor : public Service {
     /**
      * @brief Get monitoring thresholds
      */
-    MonitoringThresholds get_thresholds() const { return thresholds_; }
+    MonitoringThresholds get_thresholds() const;

     /**
      * @brief Set monitoring thresholds
      */
-    void set_thresholds(const MonitoringThresholds& thresholds) { thresholds_ = thresholds; }
+    void set_thresholds(const MonitoringThresholds& thresholds);

 private:
     std::shared_ptr<AlertManager> alert_manager_;
     std::atomic<bool> running_{false};
     std::unique_ptr<std::thread> monitor_thread_;
     int check_interval_seconds_;
+
+    mutable std::shared_mutex thresholds_mutex_;
     MonitoringThresholds thresholds_;

     mutable std::mutex health_mutex_;
diff --git a/daemon/src/alerts/alert_manager.cpp b/daemon/src/alerts/alert_manager.cpp
index 4a86de38..95667e9d 100644
--- a/daemon/src/alerts/alert_manager.cpp
+++ b/daemon/src/alerts/alert_manager.cpp
@@ -746,19 +746,20 @@ size_t AlertManager::acknowledge_all() {

         rc = sqlite3_step(stmt);
         changes = (rc == SQLITE_DONE) ? sqlite3_changes(db) : 0;
+
+        // Update counters while holding lock to prevent race with concurrent inserts
+        // Reset all to 0 since all active alerts are now acknowledged
+        // Note: This is approximate - for exact counts we'd need to query by severity
+        // But for acknowledge_all, we typically want to clear all counters anyway
+        if (changes > 0) {
+            count_info_.store(0, std::memory_order_relaxed);
+            count_warning_.store(0, std::memory_order_relaxed);
+            count_error_.store(0, std::memory_order_relaxed);
+            count_critical_.store(0, std::memory_order_relaxed);
+            count_total_.store(0, std::memory_order_relaxed);
+        }
     } // Lock released

-    // Update counters - reset all to 0 since all active alerts are now acknowledged
-    // Note: This is approximate - for exact counts we'd need to query by severity
-    // But for acknowledge_all, we typically want to clear all counters anyway
-    if (changes > 0) {
-        count_info_.store(0, std::memory_order_relaxed);
-        count_warning_.store(0, std::memory_order_relaxed);
-        count_error_.store(0, std::memory_order_relaxed);
-        count_critical_.store(0, std::memory_order_relaxed);
-        count_total_.store(0, std::memory_order_relaxed);
-    }
-
     return changes;
 }

diff --git a/daemon/src/monitor/system_monitor.cpp b/daemon/src/monitor/system_monitor.cpp
index 3d3e6d71..0c529581 100644
--- a/daemon/src/monitor/system_monitor.cpp
+++ b/daemon/src/monitor/system_monitor.cpp
@@ -120,6 +120,16 @@ SystemHealth SystemMonitor::get_health() const {
     return current_health_;
 }

+MonitoringThresholds SystemMonitor::get_thresholds() const {
+    std::shared_lock lock(thresholds_mutex_);
+    return thresholds_;
+}
+
+void SystemMonitor::set_thresholds(const MonitoringThresholds& thresholds) {
+    std::unique_lock lock(thresholds_mutex_);
+    thresholds_ = thresholds;
+}
+
 void SystemMonitor::monitor_loop() {
     while (running_.load()) {
         try {
@@ -425,6 +435,9 @@ int SystemMonitor::get_failed_services_count() const {
 }

 void SystemMonitor::check_thresholds(const SystemHealth& health) {
+    // Acquire shared 
lock for reading thresholds + std::shared_lock lock(thresholds_mutex_); + // CPU checks std::string cpu_critical_key = std::to_string(static_cast(AlertCategory::CPU)) + ":" + std::to_string(static_cast(AlertSeverity::CRITICAL)) + ":" + From fa4702abd866509d4d45d169733d67f9ba585229 Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 18:09:45 +0530 Subject: [PATCH 07/12] fix(alerts): Correct total alert count calculation in load_initial_counters - Updated the load_initial_counters method to initialize and store the total alert count correctly. - Replaced the previous fetch_add operation with a more accurate total accumulation approach, ensuring thread safety and consistency in alert status updates. --- daemon/src/alerts/alert_manager.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/daemon/src/alerts/alert_manager.cpp b/daemon/src/alerts/alert_manager.cpp index 95667e9d..63de7a5a 100644 --- a/daemon/src/alerts/alert_manager.cpp +++ b/daemon/src/alerts/alert_manager.cpp @@ -406,6 +406,9 @@ void AlertManager::load_initial_counters() { sqlite3_reset(stmt); sqlite3_bind_int(stmt, 1, static_cast(AlertStatus::DISMISSED)); + count_total_.store(0, std::memory_order_relaxed); + int total = 0; + int rc; while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) { int severity = sqlite3_column_int(stmt, 0); @@ -425,8 +428,9 @@ void AlertManager::load_initial_counters() { count_critical_.store(count, std::memory_order_relaxed); break; } - count_total_.fetch_add(count, std::memory_order_relaxed); + total += count; } // End of while loop + count_total_.store(total, std::memory_order_relaxed); } // Lock released } From 9c7891b56852ba91a76abca607133e05c2300a48 Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 18:22:10 +0530 Subject: [PATCH 08/12] fix(alerts): Enhance alert acknowledgment and dismissal SQL statements - Updated SQL queries in the AlertManager to include status checks during acknowledgment and dismissal of alerts, ensuring that only alerts with the expected status are modified. - Improved code readability by restructuring the load_initial_counters method for clarity in total alert count calculations. --- daemon/src/alerts/alert_manager.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/daemon/src/alerts/alert_manager.cpp b/daemon/src/alerts/alert_manager.cpp index 63de7a5a..962f6857 100644 --- a/daemon/src/alerts/alert_manager.cpp +++ b/daemon/src/alerts/alert_manager.cpp @@ -289,11 +289,11 @@ bool AlertManager::prepare_statements() { const char* select_all_sql = "SELECT uuid, severity, category, source, message, description, timestamp, status, acknowledged_at, dismissed_at FROM alerts WHERE 1=1"; - const char* update_ack_sql = "UPDATE alerts SET status = ?, acknowledged_at = ? WHERE uuid = ?"; + const char* update_ack_sql = "UPDATE alerts SET status = ?, acknowledged_at = ? WHERE uuid = ? AND status = ?"; const char* update_ack_all_sql = "UPDATE alerts SET status = ?, acknowledged_at = ? WHERE status = ?"; - const char* update_dismiss_sql = "UPDATE alerts SET status = ?, dismissed_at = ? WHERE uuid = ?"; + const char* update_dismiss_sql = "UPDATE alerts SET status = ?, dismissed_at = ? WHERE uuid = ? AND status = ?"; const char* count_sql = "SELECT severity, COUNT(*) FROM alerts WHERE status != ? 
GROUP BY severity"; @@ -411,10 +411,10 @@ void AlertManager::load_initial_counters() { int rc; while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) { - int severity = sqlite3_column_int(stmt, 0); - int count = sqlite3_column_int(stmt, 1); - - switch (static_cast(severity)) { + int severity = sqlite3_column_int(stmt, 0); + int count = sqlite3_column_int(stmt, 1); + + switch (static_cast(severity)) { case AlertSeverity::INFO: count_info_.store(count, std::memory_order_relaxed); break; @@ -427,8 +427,8 @@ void AlertManager::load_initial_counters() { case AlertSeverity::CRITICAL: count_critical_.store(count, std::memory_order_relaxed); break; - } - total += count; + } + total += count; } // End of while loop count_total_.store(total, std::memory_order_relaxed); } // Lock released @@ -711,6 +711,7 @@ bool AlertManager::acknowledge_alert(const std::string& uuid) { sqlite3_bind_int(stmt, 1, static_cast(AlertStatus::ACKNOWLEDGED)); sqlite3_bind_text(stmt, 2, timestamp_str.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 3, uuid.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_int(stmt, 4, static_cast(AlertStatus::ACTIVE)); rc = sqlite3_step(stmt); changes = (rc == SQLITE_DONE) ? sqlite3_changes(db) : 0; @@ -789,6 +790,7 @@ bool AlertManager::dismiss_alert(const std::string& uuid) { int rc; int changes = 0; + AlertStatus expected_status = alert_opt->status; { // SQLite prepared statements are NOT thread-safe - protect with mutex std::lock_guard lock(stmt_mutex_); @@ -799,6 +801,7 @@ bool AlertManager::dismiss_alert(const std::string& uuid) { sqlite3_bind_int(stmt, 1, static_cast(AlertStatus::DISMISSED)); sqlite3_bind_text(stmt, 2, timestamp_str.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 3, uuid.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_int(stmt, 4, static_cast(expected_status)); rc = sqlite3_step(stmt); changes = (rc == SQLITE_DONE) ? sqlite3_changes(db) : 0; @@ -806,7 +809,7 @@ bool AlertManager::dismiss_alert(const std::string& uuid) { if (rc == SQLITE_DONE && changes > 0) { // Update counters if alert was active - atomics are thread-safe - if (should_update_counters && alert_opt->status == AlertStatus::ACTIVE) { + if (should_update_counters && expected_status == AlertStatus::ACTIVE) { update_counters(alert_opt->severity, -1); } return true; From 9fdd4da59fb54bd0fbf6cfb063c329a02331f0e4 Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 20:28:59 +0530 Subject: [PATCH 09/12] fix(alerts): Improve handling of alert data retrieval in AlertManager - Updated the get_alert and get_alerts methods to safely handle potential null values from SQLite queries by checking for null pointers before dereferencing, ensuring robust string assignment for alert attributes. - Enhanced code readability by using descriptive variable names for SQLite column text retrieval. --- daemon/src/alerts/alert_manager.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/daemon/src/alerts/alert_manager.cpp b/daemon/src/alerts/alert_manager.cpp index 962f6857..d67f6082 100644 --- a/daemon/src/alerts/alert_manager.cpp +++ b/daemon/src/alerts/alert_manager.cpp @@ -545,12 +545,16 @@ std::optional AlertManager::get_alert(const std::string& uuid) { } // Read all columns while lock is held (stmt is only valid during lock) - alert.uuid = reinterpret_cast(sqlite3_column_text(stmt, 0)); + const unsigned char* uuid_txt = sqlite3_column_text(stmt, 0); + alert.uuid = uuid_txt ? 
reinterpret_cast(uuid_txt) : std::string(); alert.severity = static_cast(sqlite3_column_int(stmt, 1)); alert.category = static_cast(sqlite3_column_int(stmt, 2)); - alert.source = reinterpret_cast(sqlite3_column_text(stmt, 3)); - alert.message = reinterpret_cast(sqlite3_column_text(stmt, 4)); - alert.description = reinterpret_cast(sqlite3_column_text(stmt, 5)); + const unsigned char* source_txt = sqlite3_column_text(stmt, 3); + alert.source = source_txt ? reinterpret_cast(source_txt) : std::string(); + const unsigned char* message_txt = sqlite3_column_text(stmt, 4); + alert.message = message_txt ? reinterpret_cast(message_txt) : std::string(); + const unsigned char* desc_txt = sqlite3_column_text(stmt, 5); + alert.description = desc_txt ? reinterpret_cast(desc_txt) : std::string(); // Parse timestamp std::string timestamp_str = reinterpret_cast(sqlite3_column_text(stmt, 6)); @@ -635,12 +639,16 @@ std::vector AlertManager::get_alerts(const AlertFilter& filter) { while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) { Alert alert; - alert.uuid = reinterpret_cast(sqlite3_column_text(stmt, 0)); + const unsigned char* uuid_txt = sqlite3_column_text(stmt, 0); + alert.uuid = uuid_txt ? reinterpret_cast(uuid_txt) : std::string(); alert.severity = static_cast(sqlite3_column_int(stmt, 1)); alert.category = static_cast(sqlite3_column_int(stmt, 2)); - alert.source = reinterpret_cast(sqlite3_column_text(stmt, 3)); - alert.message = reinterpret_cast(sqlite3_column_text(stmt, 4)); - alert.description = reinterpret_cast(sqlite3_column_text(stmt, 5)); + const unsigned char* source_txt = sqlite3_column_text(stmt, 3); + alert.source = source_txt ? reinterpret_cast(source_txt) : std::string(); + const unsigned char* message_txt = sqlite3_column_text(stmt, 4); + alert.message = message_txt ? reinterpret_cast(message_txt) : std::string(); + const unsigned char* desc_txt = sqlite3_column_text(stmt, 5); + alert.description = desc_txt ? reinterpret_cast(desc_txt) : std::string(); // Parse timestamp std::string timestamp_str = reinterpret_cast(sqlite3_column_text(stmt, 6)); From fab906f239640deac52e1ea87d7fc029f4f8b4a5 Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 22:13:36 +0530 Subject: [PATCH 10/12] fix(alerts): Enhance timestamp parsing in AlertManager - Improved the get_alert and get_alerts methods to handle potential null values from SQLite more robustly, ensuring that timestamps are correctly parsed or defaulted to the current time when necessary. --- daemon/src/alerts/alert_manager.cpp | 42 ++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/daemon/src/alerts/alert_manager.cpp b/daemon/src/alerts/alert_manager.cpp index d67f6082..c61c52b8 100644 --- a/daemon/src/alerts/alert_manager.cpp +++ b/daemon/src/alerts/alert_manager.cpp @@ -557,12 +557,21 @@ std::optional AlertManager::get_alert(const std::string& uuid) { alert.description = desc_txt ? 
reinterpret_cast(desc_txt) : std::string(); // Parse timestamp - std::string timestamp_str = reinterpret_cast(sqlite3_column_text(stmt, 6)); - std::tm tm = {}; - std::istringstream ss(timestamp_str); - ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); - if (!ss.fail()) { - alert.timestamp = std::chrono::system_clock::from_time_t(utc_timegm(&tm)); + const unsigned char* raw = sqlite3_column_text(stmt, 6); + if (raw != nullptr) { + std::string timestamp_str = reinterpret_cast(raw); + if (!timestamp_str.empty()) { + std::tm tm = {}; + std::istringstream ss(timestamp_str); + ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!ss.fail()) { + alert.timestamp = std::chrono::system_clock::from_time_t(utc_timegm(&tm)); + } else { + alert.timestamp = std::chrono::system_clock::now(); + } + } else { + alert.timestamp = std::chrono::system_clock::now(); + } } else { alert.timestamp = std::chrono::system_clock::now(); } @@ -651,12 +660,21 @@ std::vector AlertManager::get_alerts(const AlertFilter& filter) { alert.description = desc_txt ? reinterpret_cast(desc_txt) : std::string(); // Parse timestamp - std::string timestamp_str = reinterpret_cast(sqlite3_column_text(stmt, 6)); - std::tm tm = {}; - std::istringstream ss(timestamp_str); - ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); - if (!ss.fail()) { - alert.timestamp = std::chrono::system_clock::from_time_t(utc_timegm(&tm)); + const unsigned char* raw = sqlite3_column_text(stmt, 6); + if (raw != nullptr) { + std::string timestamp_str = reinterpret_cast(raw); + if (!timestamp_str.empty()) { + std::tm tm = {}; + std::istringstream ss(timestamp_str); + ss >> std::get_time(&tm, "%Y-%m-%dT%H:%M:%SZ"); + if (!ss.fail()) { + alert.timestamp = std::chrono::system_clock::from_time_t(utc_timegm(&tm)); + } else { + alert.timestamp = std::chrono::system_clock::now(); + } + } else { + alert.timestamp = std::chrono::system_clock::now(); + } } else { alert.timestamp = std::chrono::system_clock::now(); } From 537aaeef07a067936ee0cff492c97bac8fab9b3d Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 22:33:11 +0530 Subject: [PATCH 11/12] feat(alerts): Add dismiss-all functionality for alerts - Introduced a new command `cortex daemon alerts --dismiss-all` to dismiss all active and acknowledged alerts. - Implemented backend support in the AlertManager and IPC handlers to process the dismiss-all request. - Updated documentation to include the new command and its usage. - Added integration and unit tests to ensure the dismiss-all feature works as expected. 
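
For orientation, a minimal sketch of the new dismiss-all path from the Python side. This is illustrative only: `DaemonClient` is a placeholder class name (the real client lives in cortex/daemon_client.py and gains `alerts_dismiss_all()` in this patch); the request it sends and the response fields match the diffs below.

```python
# Illustrative sketch -- "DaemonClient" is a placeholder name; the actual class
# in cortex/daemon_client.py exposes alerts_dismiss_all() as added in this patch.
# The call sends {"method": "alerts.dismiss", "params": {"all": true}} over the
# daemon socket, mirroring the socat example added to daemon/README.md.
from cortex.daemon_client import DaemonClient  # hypothetical import name

client = DaemonClient()
response = client.alerts_dismiss_all()

if response.success:
    # On success the handler replies with {"dismissed": <count>, "message": "..."}
    count = (response.result or {}).get("dismissed", 0)
    print(f"Dismissed {count} alert(s)")
else:
    print(f"Failed: {response.error}")
```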
--- README.md | 1 + cortex/cli.py | 20 +++++++ cortex/daemon_client.py | 9 +++ daemon/README.md | 6 +- daemon/include/cortexd/alerts/alert_manager.h | 7 +++ daemon/src/alerts/alert_manager.cpp | 57 ++++++++++++++++++- daemon/src/ipc/handlers.cpp | 27 +++++---- daemon/tests/integration/test_handlers.cpp | 20 +++++++ daemon/tests/unit/test_alert_manager.cpp | 29 ++++++++++ 9 files changed, 163 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 550158cf..64c28e7a 100644 --- a/README.md +++ b/README.md @@ -209,6 +209,7 @@ cortex role set | `cortex daemon alerts --severity ` | Filter alerts by severity (info/warning/error/critical) | | `cortex daemon alerts --category ` | Filter alerts by category (cpu/memory/disk/apt/cve/service/system) | | `cortex daemon alerts --acknowledge-all` | Acknowledge all active alerts | +| `cortex daemon alerts --dismiss-all` | Dismiss all active and acknowledged alerts | | `cortex daemon alerts --dismiss ` | Dismiss a specific alert by UUID | | `cortex daemon shutdown` | Request daemon shutdown | diff --git a/cortex/cli.py b/cortex/cli.py index 4ab878de..b982203c 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -2706,6 +2706,21 @@ def _daemon_alerts(self, args: argparse.Namespace) -> int: cx_print(f"Failed: {response.error}", "error") return 1 + # Handle dismiss-all + if getattr(args, "dismiss_all", False): + cx_header("Dismissing All Alerts") + success, response = self._daemon_ipc_call( + "alerts_dismiss_all", lambda c: c.alerts_dismiss_all() + ) + if not success: + return 1 + if response.success: + count = response.result.get("dismissed", 0) if response.result else 0 + cx_print(f"Dismissed {count} alert(s)", "success") + return 0 + cx_print(f"Failed: {response.error}", "error") + return 1 + # Handle dismiss dismiss_uuid = getattr(args, "dismiss", None) if dismiss_uuid: @@ -4604,6 +4619,11 @@ def main(): action="store_true", help="Acknowledge all active alerts", ) + daemon_alerts_parser.add_argument( + "--dismiss-all", + action="store_true", + help="Dismiss all active and acknowledged alerts", + ) daemon_alerts_parser.add_argument( "--dismiss", metavar="UUID", diff --git a/cortex/daemon_client.py b/cortex/daemon_client.py index 77a9a4ba..fa430f65 100644 --- a/cortex/daemon_client.py +++ b/cortex/daemon_client.py @@ -308,6 +308,15 @@ def alerts_dismiss(self, uuid: str) -> DaemonResponse: """ return self._send_request("alerts.dismiss", {"uuid": uuid}) + def alerts_dismiss_all(self) -> DaemonResponse: + """ + Dismiss all active and acknowledged alerts. + + Returns: + DaemonResponse with count of dismissed alerts. 
+ """ + return self._send_request("alerts.dismiss", {"all": True}) + class DaemonNotInstalledError(Exception): """Raised when the daemon is not installed.""" diff --git a/daemon/README.md b/daemon/README.md index 75bc0719..4877cfce 100644 --- a/daemon/README.md +++ b/daemon/README.md @@ -166,6 +166,7 @@ cortex daemon alerts # List all active alerts cortex daemon alerts --severity warning # Filter by severity cortex daemon alerts --category cpu # Filter by category cortex daemon alerts --acknowledge-all # Acknowledge all alerts +cortex daemon alerts --dismiss-all # Dismiss all active and acknowledged alerts cortex daemon alerts --dismiss # Dismiss specific alert # Install/uninstall daemon @@ -188,7 +189,7 @@ cortex daemon uninstall | `health` | Get system health metrics (CPU, memory, disk, services) | | `alerts` / `alerts.get` | Get alerts with optional filtering | | `alerts.acknowledge` | Acknowledge alerts (all or by UUID) | -| `alerts.dismiss` | Dismiss a specific alert by UUID | +| `alerts.dismiss` | Dismiss alerts (all or by UUID) | ### Example @@ -238,6 +239,9 @@ echo '{"method":"alerts","params":{"severity":"warning"}}' | socat - UNIX-CONNEC # Acknowledge all alerts echo '{"method":"alerts.acknowledge","params":{"all":true}}' | socat - UNIX-CONNECT:/run/cortex/cortex.sock + +# Dismiss all alerts +echo '{"method":"alerts.dismiss","params":{"all":true}}' | socat - UNIX-CONNECT:/run/cortex/cortex.sock ``` ## Configuration diff --git a/daemon/include/cortexd/alerts/alert_manager.h b/daemon/include/cortexd/alerts/alert_manager.h index 82d538c2..45cb2f0f 100644 --- a/daemon/include/cortexd/alerts/alert_manager.h +++ b/daemon/include/cortexd/alerts/alert_manager.h @@ -136,6 +136,12 @@ class AlertManager { */ bool dismiss_alert(const std::string& uuid); + /** + * @brief Dismiss all active and acknowledged alerts + * @return Number of alerts dismissed + */ + size_t dismiss_all(); + /** * @brief Get alert counts by severity */ @@ -188,6 +194,7 @@ class AlertManager { void* stmt_update_ack_; // sqlite3_stmt* void* stmt_update_ack_all_; // sqlite3_stmt* void* stmt_update_dismiss_; // sqlite3_stmt* + void* stmt_update_dismiss_all_; // sqlite3_stmt* void* stmt_count_; // sqlite3_stmt* // Mutex to protect prepared statement usage (SQLite statements are NOT thread-safe) diff --git a/daemon/src/alerts/alert_manager.cpp b/daemon/src/alerts/alert_manager.cpp index c61c52b8..a5212365 100644 --- a/daemon/src/alerts/alert_manager.cpp +++ b/daemon/src/alerts/alert_manager.cpp @@ -133,7 +133,7 @@ AlertManager::AlertManager(const std::string& db_path) : db_path_(db_path), db_handle_(nullptr), stmt_insert_(nullptr), stmt_select_(nullptr), stmt_select_all_(nullptr), stmt_update_ack_(nullptr), stmt_update_ack_all_(nullptr), - stmt_update_dismiss_(nullptr), stmt_count_(nullptr) { + stmt_update_dismiss_(nullptr), stmt_update_dismiss_all_(nullptr), stmt_count_(nullptr) { } AlertManager::~AlertManager() { @@ -265,6 +265,7 @@ bool AlertManager::initialize() { // Prepare and cache all statements if (!prepare_statements()) { + finalize_statements(); sqlite3_close(db); db_handle_ = nullptr; return false; @@ -295,6 +296,8 @@ bool AlertManager::prepare_statements() { const char* update_dismiss_sql = "UPDATE alerts SET status = ?, dismissed_at = ? WHERE uuid = ? AND status = ?"; + const char* update_dismiss_all_sql = "UPDATE alerts SET status = ?, dismissed_at = ? WHERE status != ?"; + const char* count_sql = "SELECT severity, COUNT(*) FROM alerts WHERE status != ? 
GROUP BY severity"; int rc; @@ -335,6 +338,12 @@ bool AlertManager::prepare_statements() { return false; } + rc = sqlite3_prepare_v2(db, update_dismiss_all_sql, -1, reinterpret_cast(&stmt_update_dismiss_all_), nullptr); + if (rc != SQLITE_OK) { + LOG_ERROR("AlertManager", "Failed to prepare update_dismiss_all statement: " + std::string(sqlite3_errmsg(db))); + return false; + } + rc = sqlite3_prepare_v2(db, count_sql, -1, reinterpret_cast(&stmt_count_), nullptr); if (rc != SQLITE_OK) { LOG_ERROR("AlertManager", "Failed to prepare count statement: " + std::string(sqlite3_errmsg(db))); @@ -369,6 +378,10 @@ void AlertManager::finalize_statements() { sqlite3_finalize(static_cast(stmt_update_dismiss_)); stmt_update_dismiss_ = nullptr; } + if (stmt_update_dismiss_all_) { + sqlite3_finalize(static_cast(stmt_update_dismiss_all_)); + stmt_update_dismiss_all_ = nullptr; + } if (stmt_count_) { sqlite3_finalize(static_cast(stmt_count_)); stmt_count_ = nullptr; @@ -844,6 +857,48 @@ bool AlertManager::dismiss_alert(const std::string& uuid) { return false; } +size_t AlertManager::dismiss_all() { + if (!db_handle_ || !stmt_update_dismiss_all_) { + return 0; + } + + sqlite3* db = static_cast(db_handle_); + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + std::string timestamp_str = format_utc_time(time_t); + + int rc; + int changes = 0; + { + // SQLite prepared statements are NOT thread-safe - protect with mutex + std::lock_guard lock(stmt_mutex_); + + sqlite3_stmt* stmt = static_cast(stmt_update_dismiss_all_); + sqlite3_reset(stmt); + + sqlite3_bind_int(stmt, 1, static_cast(AlertStatus::DISMISSED)); + sqlite3_bind_text(stmt, 2, timestamp_str.c_str(), -1, SQLITE_STATIC); + sqlite3_bind_int(stmt, 3, static_cast(AlertStatus::DISMISSED)); + + rc = sqlite3_step(stmt); + changes = (rc == SQLITE_DONE) ? 
sqlite3_changes(db) : 0; + + // Update counters while holding lock to prevent race with concurrent inserts + // Reset all to 0 since all active/acknowledged alerts are now dismissed + // Note: This is approximate - for exact counts we'd need to query by severity + // But for dismiss_all, we typically want to clear all counters anyway + if (changes > 0) { + count_info_.store(0, std::memory_order_relaxed); + count_warning_.store(0, std::memory_order_relaxed); + count_error_.store(0, std::memory_order_relaxed); + count_critical_.store(0, std::memory_order_relaxed); + count_total_.store(0, std::memory_order_relaxed); + } + } // Lock released + + return changes; +} + json AlertManager::get_alert_counts() { // Use in-memory counters for O(1) performance json counts; diff --git a/daemon/src/ipc/handlers.cpp b/daemon/src/ipc/handlers.cpp index dd1f26d2..4cc2222c 100644 --- a/daemon/src/ipc/handlers.cpp +++ b/daemon/src/ipc/handlers.cpp @@ -228,20 +228,25 @@ Response Handlers::handle_alerts_dismiss(const Request& req, std::shared_ptr(); - } else { - return Response::err("UUID required for dismiss", ErrorCodes::INVALID_PARAMS); - } - - if (alerts->dismiss_alert(uuid)) { + // Check if dismissing all or specific UUID + if (req.params.is_object() && req.params.contains("all") && req.params["all"].get()) { + size_t count = alerts->dismiss_all(); return Response::ok({ - {"dismissed", true}, - {"uuid", uuid} + {"dismissed", count}, + {"message", "Dismissed " + std::to_string(count) + " alert(s)"} }); + } else if (req.params.is_object() && req.params.contains("uuid")) { + std::string uuid = req.params["uuid"].get(); + if (alerts->dismiss_alert(uuid)) { + return Response::ok({ + {"dismissed", true}, + {"uuid", uuid} + }); + } else { + return Response::err("Alert not found", ErrorCodes::ALERT_NOT_FOUND); + } } else { - return Response::err("Alert not found", ErrorCodes::ALERT_NOT_FOUND); + return Response::err("UUID or 'all' parameter required for dismiss", ErrorCodes::INVALID_PARAMS); } } diff --git a/daemon/tests/integration/test_handlers.cpp b/daemon/tests/integration/test_handlers.cpp index 80ad8ff1..f7b61941 100644 --- a/daemon/tests/integration/test_handlers.cpp +++ b/daemon/tests/integration/test_handlers.cpp @@ -474,6 +474,26 @@ TEST_F(HandlersTest, AlertsDismiss) { EXPECT_FALSE(found); } +TEST_F(HandlersTest, AlertsDismissAll) { + start_server_with_monitoring(); + + // Create multiple alerts + for (int i = 0; i < 3; ++i) { + cortexd::Alert alert; + alert.severity = cortexd::AlertSeverity::INFO; + alert.category = cortexd::AlertCategory::SYSTEM; + alert.source = "test"; + alert.message = "Test alert " + std::to_string(i); + alert.status = cortexd::AlertStatus::ACTIVE; + alert_manager_->create_alert(alert); + } + + auto response = send_json_request("alerts.dismiss", {{"all", true}}); + + EXPECT_TRUE(response["success"]); + EXPECT_GE(response["result"]["dismissed"], 3); +} + // ============================================================================ // Response format tests // ============================================================================ diff --git a/daemon/tests/unit/test_alert_manager.cpp b/daemon/tests/unit/test_alert_manager.cpp index 2fc6a5aa..28093f8f 100644 --- a/daemon/tests/unit/test_alert_manager.cpp +++ b/daemon/tests/unit/test_alert_manager.cpp @@ -185,6 +185,35 @@ TEST_F(AlertManagerTest, DismissAlert) { ASSERT_TRUE(retrieved->dismissed_at.has_value()); } +TEST_F(AlertManagerTest, DismissAll) { + // Create multiple active and acknowledged alerts + for (int i = 0; i < 3; 
++i) { + Alert alert; + alert.severity = AlertSeverity::WARNING; + alert.category = AlertCategory::CPU; + alert.source = "test"; + alert.message = "Alert " + std::to_string(i); + alert.status = AlertStatus::ACTIVE; + alert_manager_->create_alert(alert); + } + + // Acknowledge one alert + AlertFilter filter; + filter.status = AlertStatus::ACTIVE; + auto active_alerts = alert_manager_->get_alerts(filter); + if (!active_alerts.empty()) { + alert_manager_->acknowledge_alert(active_alerts[0].uuid); + } + + size_t count = alert_manager_->dismiss_all(); + ASSERT_GE(count, 3); // Should dismiss all active and acknowledged alerts + + AlertFilter dismissed_filter; + dismissed_filter.status = AlertStatus::DISMISSED; + auto dismissed_alerts = alert_manager_->get_alerts(dismissed_filter); + ASSERT_GE(dismissed_alerts.size(), 3); +} + TEST_F(AlertManagerTest, GetAlertCounts) { // Create alerts with different severities Alert alert1; From b874e6ef7faea0b225429e438e73ba0764fa1ea6 Mon Sep 17 00:00:00 2001 From: sujay-d07 Date: Wed, 21 Jan 2026 22:48:13 +0530 Subject: [PATCH 12/12] refactor(cli): Organize alert action flags into a mutually exclusive group - Refactored the argument parsing for alert actions in the CLI to use a mutually exclusive group, improving command clarity and preventing conflicting options. - Updated the handling of alert acknowledgment to ensure proper parameter validation, returning an error for missing or invalid parameters. --- cortex/cli.py | 8 +++++--- daemon/src/ipc/handlers.cpp | 7 +------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cortex/cli.py b/cortex/cli.py index b982203c..58cf1257 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -4614,17 +4614,19 @@ def main(): choices=["cpu", "memory", "disk", "apt", "cve", "service", "system"], help="Filter alerts by category", ) - daemon_alerts_parser.add_argument( + # Mutually exclusive group for action flags + actions_group = daemon_alerts_parser.add_mutually_exclusive_group() + actions_group.add_argument( "--acknowledge-all", action="store_true", help="Acknowledge all active alerts", ) - daemon_alerts_parser.add_argument( + actions_group.add_argument( "--dismiss-all", action="store_true", help="Dismiss all active and acknowledged alerts", ) - daemon_alerts_parser.add_argument( + actions_group.add_argument( "--dismiss", metavar="UUID", help="Dismiss a specific alert by UUID", diff --git a/daemon/src/ipc/handlers.cpp b/daemon/src/ipc/handlers.cpp index 4cc2222c..6cf1e9f9 100644 --- a/daemon/src/ipc/handlers.cpp +++ b/daemon/src/ipc/handlers.cpp @@ -214,12 +214,7 @@ Response Handlers::handle_alerts_acknowledge(const Request& req, std::shared_ptr return Response::err("Alert not found or already acknowledged", ErrorCodes::ALERT_NOT_FOUND); } } else { - // Default: acknowledge all - size_t count = alerts->acknowledge_all(); - return Response::ok({ - {"acknowledged", count}, - {"message", "Acknowledged " + std::to_string(count) + " alert(s)"} - }); + return Response::err("Missing or invalid parameters: require 'uuid' or 'all'", ErrorCodes::INVALID_PARAMS); } }
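
With this refactor the three action flags can no longer be combined on one invocation, and on the daemon side `alerts.acknowledge` without a `uuid` or `all` parameter now returns INVALID_PARAMS instead of silently acknowledging everything. A standalone sketch of the argparse behaviour (not the cortex parser itself; only the flag names mirror the ones registered above, and the prog string is just a label):

```python
# Standalone sketch of the mutually exclusive group introduced in this patch.
# Not the cortex CLI; it only reproduces the three action flags to show how
# argparse rejects conflicting combinations before any handler code runs.
import argparse

parser = argparse.ArgumentParser(prog="cortex daemon alerts")
actions = parser.add_mutually_exclusive_group()
actions.add_argument("--acknowledge-all", action="store_true")
actions.add_argument("--dismiss-all", action="store_true")
actions.add_argument("--dismiss", metavar="UUID")

args = parser.parse_args(["--dismiss-all"])  # OK: a single action flag
print(args.dismiss_all)                      # True

try:
    parser.parse_args(["--dismiss-all", "--dismiss", "abc"])
except SystemExit:
    # argparse prints a usage error such as
    # "argument --dismiss: not allowed with argument --dismiss-all" and exits.
    pass
```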