diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json new file mode 100644 index 0000000..e088c6d --- /dev/null +++ b/.cursor-plugin/plugin.json @@ -0,0 +1,25 @@ +{ + "name": "causely", + "displayName": "Causely", + "version": "1.0.0", + "description": "Use Causely directly in Cursor through a preconfigured MCP server. Query service health, root causes, SLOs, metrics, and topology through natural conversation — grounded in system ontology and live causal intelligence.", + "author": { + "name": "Causely", + "email": "support@causely.ai" + }, + "license": "Apache-2.0", + "logo": "logo.svg", + "homepage": "https://causely.ai", + "repository": "https://github.com/causely-oss/cursor-plugin", + "keywords": [ + "causely", + "observability", + "kubernetes", + "root-cause-analysis", + "incidents", + "slo", + "causal-intelligence", + "ontology", + "reliability" + ] +} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..67167ab --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,18 @@ +# Changelog + +## 1.0.0 — 2026-04-22 + +Initial release of the Causely Cursor Marketplace plugin. 
+ +### What's included + +- MCP server connection to `https://api.causely.app/mcp` (Streamable HTTP) +- Six packaged skills covering the most common Causely workflows: + - `causely-alert-triage` — map incoming alerts to root causes + - `causely-change-impact` — post-deploy regression and blast radius analysis + - `causely-correlated-incidents` — multi-service failure correlation + - `causely-health-reporting` — scheduled and on-demand health summaries + - `causely-k8s-investigation` — Kubernetes infrastructure deep-dives + - `causely-postmortem` — structured post-mortems and ticket drafts +- OAuth 2.0 authorization code flow (browser sign-in via Causely) +- API credentials fallback (`X-Causely-Client-Basic` header for non-interactive environments) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d8660c9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,17 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + Copyright 2026 Causely, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 96eacb8..9359155 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,99 @@ -# cursor-plugin -Cursor Marketplace plugin for Causely — system ontology and causal intelligence for application reliability +# Causely for Cursor + +Use Causely directly in Cursor through a preconfigured MCP server. 
Query service health, root causes, SLOs, metrics, and topology through natural conversation — grounded in system ontology and live causal intelligence. + +## Prerequisites + +- A [Causely](https://causely.ai) account. Contact support@causely.ai if you need access. +- Cursor with MCP support. + +## Installation + +Install via **Cursor Settings → Plugins**, search for **Causely**, and click **Install**. Cursor will prompt you to sign in to Causely and grant access. + +## Authentication + +### OAuth (recommended) + +Cursor initiates the OAuth flow automatically on install. Sign in to Causely once — Cursor manages token refresh from that point on. + +### API credentials + +For non-interactive or CI environments, generate API credentials at [auth.causely.app/oauth/portal/api-tokens](https://auth.causely.app/oauth/portal/api-tokens) and configure the server manually: + +```json +{ + "mcpServers": { + "causely": { + "url": "https://api.causely.app/mcp", + "transport": "Streamable HTTP", + "headers": { + "X-Causely-Client-Basic": "Basic <base64-encoded client_id:client_secret>" + } + } + } +} +``` + +## What you can ask + +- "What's the root cause of the checkout service degradation?" +- "Which services are burning their error budget?" +- "What changed before this incident started?" +- "Show me the blast radius of the database slowdown." +- "Are there correlated failures across namespaces?" +- "Write a post-mortem for the incident that resolved an hour ago." +- "What are the top slow queries on the orders database?" +- "Give me a morning health report for the production namespace." 
+ +## Packaged skills + +This plugin includes six skills that activate automatically for the right type of question: + +| Skill | Activates for | +|---|---| +| `causely-alert-triage` | Incoming alerts — PagerDuty, Datadog, Prometheus, OpsGenie | +| `causely-change-impact` | Post-deploy regression checks and rollout validation | +| `causely-correlated-incidents` | Multi-service failures and blast radius analysis | +| `causely-health-reporting` | Health summaries, SLO status, morning briefings | +| `causely-k8s-investigation` | Kubernetes infrastructure — pods, nodes, namespaces | +| `causely-postmortem` | Post-mortems, incident reports, and ticket drafts | + +## Available tools + +All tools are read-only. Causely does not execute changes through this connection. + +| Tool | Description | +|---|---| +| `get_environment_health` | Global health overview: active root causes, symptoms, status | +| `get_root_causes` | Root causes with remediation, blast radius, and supporting symptoms | +| `get_symptoms` | Raw observable signals feeding into root cause detection | +| `triage` | Deep investigation of a single service | +| `get_service_summary` | Comprehensive health summary for a service | +| `get_entity_health` | Health for databases, pods, queues, and other entity types | +| `team_health` | Health summary for all services owned by a team | +| `get_metrics` | Time-series or snapshot metrics for any entity | +| `get_logs` | Logs for an entity or evidence logs for a root cause | +| `get_alerts` | Alert history with mapping state | +| `get_events` | Lifecycle events: restarts, deployments, scaling, config changes | +| `get_topology` | Dependency graph: upstream, downstream, or end-to-end dataflow | +| `get_slo` | Error budget, burn rate, at-risk and violated SLOs | +| `get_slow_queries` | Slow SQL queries ranked by total execution time | +| `get_entities` | Resolve service and infrastructure entity names to IDs | +| `get_scopes` | Discover available clusters, namespaces, 
customers, and products | +| `get_label_values` | Enumerate distinct values for a label key (e.g. team, product) | +| `list_namespaces` | List all Kubernetes namespace names | +| `list_clusters` | List all cluster names | +| `get_integration_status` | Scraper and integration status per cluster | +| `get_config` | Raw configuration files for an entity | +| `reliability_delta` | Post-deploy regression check for a single service | +| `fleet_reliability_delta` | Post-deploy regression check across a team or namespace | +| `postmortem` | Structured post-mortem draft from a resolved incident | +| `generate_ticket` | Ticket draft for Jira, GitHub, or Linear from an incident | +| `ask_causely` | Natural-language question answering (markdown response) | + +## Support + +Email: support@causely.ai +Website: https://causely.ai +Docs: https://docs.causely.ai/agent-integration/mcp-server diff --git a/logo.svg b/logo.svg new file mode 100644 index 0000000..cd432df --- /dev/null +++ b/logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/mcp.json b/mcp.json new file mode 100644 index 0000000..12aaa44 --- /dev/null +++ b/mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "causely": { + "url": "https://api.causely.app/mcp", + "transport": "Streamable HTTP" + } + } +} diff --git a/skills/causely-alert-triage/SKILL.md b/skills/causely-alert-triage/SKILL.md new file mode 100644 index 0000000..fd6935a --- /dev/null +++ b/skills/causely-alert-triage/SKILL.md @@ -0,0 +1,127 @@ +--- +name: causely-alert-triage +description: > + Use this skill when the user is starting from an alert — they received a PagerDuty page, Datadog alert, Prometheus/Alertmanager notification, Slack alert, or OpsGenie notification and want to understand what it means. 
Trigger for questions like "I got paged for KubeContainerWaiting", "what does this alert mean?", "PagerDuty woke me up for high error rate on checkout", "Datadog says memory is high on X", "what alerts are firing on X?", "how many unmapped alerts do we have?", "is this alert noise or real?", "which alerts map to Causely symptoms?", or "audit alert noise". Also trigger when the user pastes an alert name, alert payload, or references an external alerting system. Use this skill over generic causely-mcp when the investigation starts from an alert rather than from a service name or health question. +--- + +# Causely Alert Triage Skill + +Read `references/complete-investigation.md` for the full 25-tool inventory and evidence strategy. + +--- + +## Core tools for alert-driven triage + +| Tool | Use when | What it returns | +|---|---|---| +| `get_entities(query=, entity_types=)` | Resolve the service/entity from the alert | Entity IDs for the affected service | +| `get_alerts(entity_ids=)` | See all alerts firing + mapping state | Alert name, symptom mapping, severity, count, timestamps | +| `get_root_causes(symptom_ids=)` | Find diagnosed cause behind a mapped alert | Root causes with evidence, blast radius, remediation | +| `triage(entity_name=)` | Quick full-picture health check | Root causes, symptoms, impact — all in one call | +| `get_symptoms(entity_ids=)` | Check which alerts promoted to symptoms | Named signals in the causal graph | +| `ask_causely(question=)` | Free-form query when alert name doesn't resolve | NL fallback for complex alert-to-cause questions | + +--- + +## Core rule: alerts → entities → causes + +External alerting systems (PagerDuty, Datadog, Alertmanager) fire raw alert names. Causely maps some alerts to named symptoms in its causal model. The workflow bridges from alert → entity → mapped symptom → root cause. + +**`ask_causely` cannot resolve raw alert names.** Don't use it for "what is causing KubeContainerWaiting?" 
— use the structured workflow below. + +--- + +## Decision tree + +**Alert received — service name known:** +``` +triage(entity_name="") ← 1 call + → if root causes found: that's likely what triggered the alert + → description = evidence, remediation = what to do + → done in most cases +``` + +If you need to see the specific alert and its mapping status: +``` +get_entities(query="", entity_types=["Service"]) ← 1 call +get_alerts(entity_ids=[id], active_only=true) ← 1 call + → find the alert by name + → mapping_state = "mapped" → Causely has incorporated it + → mapping_state = "unmapped" → Causely hasn't promoted it to a symptom + → if mapped: symptom_name → get_root_causes(symptom_ids=[...]) for cause +``` + +**Alert received — service name unknown:** +``` +ask_causely("What active root causes are there right now?") ← 1 call + → scan results for the alert pattern or affected service + → then triage the identified service +``` + +**Alert name known, want to check if Causely knows about it:** +``` +get_entities(query="") ← 1 call +get_alerts(entity_ids=[id], alert_name_filters=[""]) ← 1 call + → mapping_state tells you if Causely has incorporated this alert + → if mapped: follow symptom_name → root cause chain + → if unmapped: alert is noise or not yet incorporated +``` + +**Alert noise audit ("how noisy are our alerts?"):** +``` +get_entities(query="") ← 1 call +get_alerts(entity_ids=[id], mapping_state_filters=["unmapped"]) ← 1 call + → high-count unmapped alerts = noise candidates for tuning + → compare with get_alerts(mapping_state_filters=["mapped"]) for signal-to-noise +``` + +**Multiple alerts firing at once:** +``` +get_root_causes(active_only=true) ← 1 call + → check if multiple alerts map to the same root cause + → impact_service_graph shows propagation → many alerts, one origin +``` + +--- + +## Mapping state guide + +| mapping_state | Meaning | Action | +|---|---|---| +| `mapped` | Causely has promoted this alert to a named symptom | Follow `symptom_name` 
→ `get_root_causes(symptom_ids=)` for diagnosis | +| `unmapped` | Causely hasn't incorporated this alert | May be noise, or a new signal type not yet configured | + +--- + +## Output format + +### 🔔 Alert triage: [alert name] + +**Alert:** [alert_name from get_alerts or user's description] +**Service:** [entity name] +**Status:** [firing / resolved] · **Severity:** [from alert] +**Causely mapping:** ✅ Mapped to symptom "[symptom_name]" / ❌ Unmapped + +**Root cause:** [from triage or get_root_causes — name + entity + portal link] + +**Evidence:** [from description field] + +**Blast radius:** [from impacted_services] + +**Customer impact:** [from impacted_customers] + +**Owner:** [from causely.ai/team label] + +**Recommended actions:** [from remediation field] + +**Links:** [portal links] + +--- + +## Important behaviours + +- **Start with `triage` when you have a service name.** It's faster and gives the full picture without needing to resolve alert → symptom → root cause manually. +- **Use `get_alerts` when the user specifically wants to see alert-level detail** — mapping status, alert counts, firing times. +- **Don't use `ask_causely` for alert name resolution** — it can't resolve raw Alertmanager or Datadog alert names to Causely entities. +- **Unmapped ≠ irrelevant**: an unmapped alert might be a real signal that Causely hasn't been configured to ingest yet. Don't dismiss it. +- **Multiple alerts, one cause**: when the user reports several alerts, check `get_root_causes` first — they often share a single origin visible in the impact graph. 
diff --git a/skills/causely-alert-triage/references/complete-investigation.md b/skills/causely-alert-triage/references/complete-investigation.md new file mode 100644 index 0000000..d358e2b --- /dev/null +++ b/skills/causely-alert-triage/references/complete-investigation.md @@ -0,0 +1,178 @@ +# Complete Investigation Reference + +## Efficiency-first principle + +**`triage` is a complete answer in one call.** It returns root cause, symptoms, impacted services, impacted customers, remediation, and `has_stored_logs` — everything needed for a full six-dimension response. Do not follow it with separate `get_symptoms` or blast-radius calls; that data is already inside the triage response. + +**`description` is Causely's pre-synthesised evidence.** When `get_root_causes` or `triage` returns a `description` field with specific log patterns, error messages, or metrics, that is the evidence. Do not call `get_logs` to regenerate it — Causely already did that work at detection time. Read `description` first. + +--- + +## Complete tool inventory (25 tools) + +### Discovery & inventory +| Tool | Use when | Key params | +|---|---|---| +| `get_entities` | Resolve names → entity IDs for any downstream tool | `query`, `entity_types`, `namespace_names`, `cluster_names` | +| `list_clusters` | Discover valid cluster names | `query` (optional filter) | +| `list_namespaces` | Discover valid namespace names | `query` (optional filter) | +| `get_label_values` | Enumerate teams, products, environments | `label_key` (e.g. 
`causely.ai/team`) | +| `get_integration_status` | Check scraper/integration coverage per cluster | `cluster_names` (optional filter) | + +### Health & triage +| Tool | Use when | Key params | +|---|---|---| +| `triage` | Named entity health check — fastest, deterministic | `entity_name`, `root_cause_id`, `root_cause_name`, `start_time/end_time` | +| `get_environment_health` | Global or namespace-scoped health overview | `namespaces`, `services`, `lookback_hours` | +| `get_service_summary` | Comprehensive single-service health (all dimensions) | `service` (substring), `lookback_hours` | +| `get_entity_health` | Health for non-Service entities (pods, DBs, queues) | `entity_id`, `lookback_hours` | +| `team_health` | All services owned by a team | `team` (partial match) | +| `ask_causely` | Free-form NL query, cross-entity synthesis | `question` | + +### Diagnosis +| Tool | Use when | Key params | +|---|---|---| +| `get_root_causes` | All active root causes (structured JSON with impact graphs) | `active_only`, `impacted_service_ids`, `symptom_ids`, `root_cause_name` | +| `get_symptoms` | Raw signals / historical timelines | `entity_ids`, `active_only`, `lookback_hours` | +| `get_alerts` | Raw alert history, mapped/unmapped status | `entity_ids`, `alert_name_filters`, `mapping_state_filters` | +| `get_logs` | Live entity logs OR stored evidence logs | `entity_id` XOR `root_cause_id`, `severity_filter` | +| `get_events` | Lifecycle events (deploys, restarts, scaling) | `entity_id`, `severity_filter`, `message_contains` | +| `get_slow_queries` | DB slow query analysis | `entity_ids` | + +### Observability data +| Tool | Use when | Key params | +|---|---|---| +| `get_metrics` | Numeric snapshots or time-series | `entity_ids`, `metrics`, `window_minutes` | +| `get_slo` | SLO state, error budget, burn rate | `entity_ids`, `only_at_risk`, `only_violated` | +| `get_config` | Raw config files for an entity | `entity_id`, `name_contains` | +| `get_topology` | 
Dependency/dependent/dataflow graph | `entity_id`, `mode`, `levels` | + +### Post-deploy & reliability +| Tool | Use when | Key params | +|---|---|---| +| `reliability_delta` | Single-service pre/post deploy comparison | `service`, `lookback_hours`, `window_minutes` | +| `fleet_reliability_delta` | Batch regression check across multiple services | `team`, `namespace`, `services`, `window_minutes` | + +### Reporting & actions +| Tool | Use when | Key params | +|---|---|---| +| `generate_ticket` | Create Jira/GitHub/Linear ticket draft | `task` | +| `postmortem` | Generate postmortem for a resolved incident | `root_cause_id` (preferred), or `root_cause_name` + `entity_name`, or `service` + `incident_start` | + +--- + +## Primary decision tree + +``` +Have a service name? +│ +├─ triage(entity_name="") ← 1 call, full picture +│ ├─ Root cause, symptoms, blast radius, customer impact, remediation: all here +│ ├─ description populated with specifics? → use it as evidence, skip get_logs +│ ├─ description generic ("Inspect logs...") AND has_stored_logs=true? +│ │ └─ get_logs(root_cause_id=, limit=10, severity_filter=ERROR) ← conditional 2nd call +│ └─ causely.ai/team label in entity.labels? → use it, skip team_health +│ └─ label absent? → team_health(team="") ← conditional 2nd call +│ +├─ Need metrics/SLOs/topology? (not in triage) +│ └─ get_entities(query="") → get_metrics / get_slo / get_topology +│ +└─ No name / system sweep? + │ + ├─ get_environment_health() ← 1 call, overall status + └─ get_root_causes(active_only=true) ← 1 call, all active issues with evidence +``` + +--- + +## Entity resolution pattern + +Many tools require entity IDs. Use `get_entities` to resolve names first: + +``` +get_entities(query="checkout", entity_types=["Service"]) + → returns [{id: "abc-123", name: "otel-demo/checkoutservice", ...}] + → pass id to get_metrics, get_slo, get_topology, get_alerts, etc. 
+``` + +**Entity name format:** +| Type | Format | Example | +|---|---|---| +| K8s service | `namespace/service-name` | `default/animal-service` | +| ECS task / VM | `cluster/task-name-hash` | `chaos/quarkus-workshop-hero-service-2b62b3ef` | +| Node | AWS/GCP hostname | `ip-192-168-12-32.us-east-2.compute.internal` | + +--- + +## Evidence: description vs get_logs + +The `description` field on a root cause contains Causely's synthesised evidence — extracted log patterns, error messages, counts, and context assembled at detection time. Examples: + +- "Disk provider not supported... WARN mediation/scraper_manager.go:267 full resync failed {error: disk provider not supported, provider: kubernetes.io/aws-ebs} — Count: 22" +- "org.postgresql.util.PSQLException: The connection attempt failed — Count: 2" + +When description contains this level of specificity, **do not call `get_logs`**. The evidence is already there. + +Only call `get_logs` when description is generic (e.g. "Inspect the application logs for error messages or stack traces") AND `has_stored_logs=true`. Use `limit=10` and `severity_filter=ERROR`. + +--- + +## The six dimensions — where each comes from + +| Dimension | Source | Extra call needed? | +|---|---|---| +| Root cause | `triage.root_cause` or `get_root_causes[].name` | No | +| Evidence | `description` field on root cause | No — only call `get_logs` if description is generic AND `has_stored_logs=true` | +| Blast radius | `impacted_services` and `impact_service_graph` on root cause | No | +| Customer impact | `impacted_customers` on root cause | No | +| Owner / team | `entity.labels["causely.ai/team"]` | Only if label absent: `team_health(team=)` | +| Remediation | `remediation` field on root cause | No | + +--- + +## Owner resolution + +Check `entity.labels` in the triage or get_root_causes response first: +- `causely.ai/team` present → that is the owner. No extra call needed. 
- `causely.ai/owner-scraper` is NOT a team name — it identifies the discovery mechanism. Never present this as the owner. +- `causely.ai/team` absent → `team_health(team="<team>")`. Try the namespace name or service name prefix. +- `team_health` returns no match → "Owner not registered in Causely — check your service catalog (e.g. Backstage)" + +--- + +## Tool reliability fallbacks + +**`triage` returns "No Incident Data Found":** Service is likely healthy. Confirm with `get_root_causes(active_only=true)` to check system-wide, or accept the health verdict. + +**`get_root_causes` returns empty list:** No active root causes. Check `get_symptoms(active_only=true)` for undiagnosed signals — but only if the user specifically needs to know about raw alerts. + +**`get_logs` returns empty lines:** `has_stored_logs` may have been `false` or logs have expired. Note "No evidence logs available" — do not retry. + +**`team_health` returns no match:** Try a shorter partial name. If still no match, report "Not registered — check service catalog". + +**`get_entities` returns empty:** Try a broader query or check `list_namespaces` / `list_clusters` to discover valid scope values. + +**All tools error:** Tell the engineer which calls you would have made. Direct them to https://portal.causely.app. 
+ +--- + +## Output template + +### 🔴 / 🟡 / 🟢 [Service] — [Status] + +**Root cause:** [name + entity + portal link from triage/get_root_causes] + +**Evidence:** +- [from `description` field — quote specific log patterns or error messages] +- [if get_logs called: add 1–2 key ERROR lines as supplement] +- [if description generic and no logs: "No stored evidence (has_stored_logs=false)"] + +**Blast radius:** [from `impacted_services`, or "None identified"] + +**Customer impact:** [from `impacted_customers`, or "None identified"] + +**Owner / team:** [from `causely.ai/team` label, or `team_health` result, or "Not registered — check service catalog"] + +**Recommended actions:** [from `remediation` field] + +**Links:** [Causely portal links from response] diff --git a/skills/causely-change-impact/SKILL.md b/skills/causely-change-impact/SKILL.md new file mode 100644 index 0000000..af69f9f --- /dev/null +++ b/skills/causely-change-impact/SKILL.md @@ -0,0 +1,120 @@ +--- +name: causely-change-impact +description: > + Use this skill when the user asks about the impact of a recent deployment, configuration change, rollout, or infrastructure update. Trigger for questions like "did our deployment break anything?", "what changed before this incident started?", "validate that the rollout didn't introduce regressions", "is this incident caused by our recent release?", "what's the impact of this config change?", "we just deployed — is everything OK?", "post-deploy health check", "pre/post comparison for our rollout", "check for regressions after deploy", "fleet-wide deploy validation", or "compare metrics before and after release". Also trigger when someone is doing a canary analysis, blue/green switch, or feature flag rollout and wants to know if health metrics changed. Use this skill over generic causely-mcp when the question is specifically change-driven. 
+--- + +# Causely Change Impact Skill + +Read `references/complete-investigation.md` for the full 25-tool inventory and evidence strategy. + +--- + +## Core tools for change impact + +| Tool | Use when | What it returns | +|---|---|---| +| `triage(entity_name=)` | Quick post-deploy check for one service | Root causes with `started_at` timestamps to compare against deploy time | +| `reliability_delta(service=)` | Metric regression check for one service | Before/after avg+max for CPU, memory, latency, error rate + verdict (PASS/WARNING/REGRESSION/WAIT) | +| `fleet_reliability_delta(team= or namespace= or services=)` | Batch regression check across multiple services | Summary table with per-service verdicts | +| `get_events(entity_id=)` | Find the deploy event / correlate changes | Lifecycle events (deploys, restarts, scaling, config changes) with timestamps | +| `get_config(entity_id=)` | Inspect config drift | Raw config files (manifests, specs) to compare | +| `get_metrics(entity_ids=, metrics=, window_minutes=)` | Custom metric comparison over time window | Time-series data for specific metrics | +| `get_root_causes(active_only=true)` | System-wide post-deploy sweep | All active RCs with `started_at` to filter by deploy time | + +--- + +## Decision tree + +**Single-service post-deploy check (recommended path):** +``` +reliability_delta(service="") ← 1 call + → verdict: PASS / WARNING / REGRESSION / WAIT + → per-metric delta: CPU, memory, latency, error rate before vs after + → if REGRESSION → recommend rollback + → if WAIT → deploy too recent, re-run later + → if PASS → deploy is clean +``` + +If `reliability_delta` returns REGRESSION or WARNING, add context: +``` +triage(entity_name="") ← 2nd call + → root cause started_at vs deploy time = causal correlation + → description = evidence of what broke + → remediation = what to do next +``` + +**Fleet-wide post-deploy validation:** +``` +fleet_reliability_delta(team="" or namespace="") ← 1 call + → summary table: 
service | verdict | release time | per-metric delta + → verdict counts: REGRESSION / WARNING / PASS / WAIT + → triage only REGRESSION services for detail +``` + +**Triage-only path (when reliability_delta not needed):** +``` +triage(entity_name="") ← 1 call + → root cause started_at before deploy? → change not the cause + → root cause started_at after deploy? → change is suspect + → description = evidence of what broke + → impacted_services = downstream blast radius + → impacted_customers = customer impact + → done +``` + +Only add extra calls if: +- Need to see the actual deploy event → `get_entities` → `get_events(entity_id=, message_contains="version")` +- Need config comparison → `get_entities` → `get_config(entity_id=)` +- Need custom metric time-series → `get_entities` → `get_metrics(entity_ids=, metrics=[...], window_minutes=60)` +- `has_stored_logs=true` AND description generic → `get_logs(root_cause_id=, limit=10, severity_filter=ERROR)` + +**Canary / blue-green:** +``` +reliability_delta(service="") ← 1 call +reliability_delta(service="") ← 1 call + → compare verdicts: regression on v2 only = canary failure +``` + +--- + +## Verdict logic + +| Signal | Verdict | Action | +|---|---|---| +| `reliability_delta` → PASS, no new root causes | ✅ Safe | Deploy is clean | +| `reliability_delta` → WARNING | ⚠️ Monitor | Watch for escalation; re-check in 30 min | +| `reliability_delta` → REGRESSION | 🔴 Rollback recommended | New root cause correlates with deploy | +| `reliability_delta` → WAIT | ⏳ Too early | Re-run after more post-deploy data accumulates | +| Root cause `started_at` before deploy | ✅ Pre-existing | Change not the cause | +| Root cause `started_at` after deploy | 🔴 Suspect | Check description for confirmation | +| No root causes at all | ✅ Safe | Service is healthy | + +--- + +## Output format + +### 🚀 Deployment validation report + +**Service:** [service-name] · **Deploy time:** [from reliability_delta or get_events] · **Report:** [now] + 
+**Verdict:** ✅ Safe / ⚠️ Monitor / 🔴 Rollback recommended / ⏳ Too early + +**Metric deltas:** +| Metric | Before (avg) | After (avg) | Delta | Status | +|---|---|---|---|---| +| [from reliability_delta response] | + +**New root causes since deploy:** [name + started_at, or "None detected"] + +**Evidence:** [from description field; supplement with get_logs only if generic AND has_stored_logs=true] + +**Blast radius:** [from impacted_services] + +**Customer impact:** [from impacted_customers] + +**Owner:** [from causely.ai/team label or team_health] + +**Recommended actions:** [from remediation field; rollback recommendation if 🔴] + +**Links:** [portal links] diff --git a/skills/causely-change-impact/references/complete-investigation.md b/skills/causely-change-impact/references/complete-investigation.md new file mode 100644 index 0000000..d358e2b --- /dev/null +++ b/skills/causely-change-impact/references/complete-investigation.md @@ -0,0 +1,178 @@ +# Complete Investigation Reference + +## Efficiency-first principle + +**`triage` is a complete answer in one call.** It returns root cause, symptoms, impacted services, impacted customers, remediation, and `has_stored_logs` — everything needed for a full six-dimension response. Do not follow it with separate `get_symptoms` or blast-radius calls; that data is already inside the triage response. + +**`description` is Causely's pre-synthesised evidence.** When `get_root_causes` or `triage` returns a `description` field with specific log patterns, error messages, or metrics, that is the evidence. Do not call `get_logs` to regenerate it — Causely already did that work at detection time. Read `description` first. 
+ +--- + +## Complete tool inventory (25 tools) + +### Discovery & inventory +| Tool | Use when | Key params | +|---|---|---| +| `get_entities` | Resolve names → entity IDs for any downstream tool | `query`, `entity_types`, `namespace_names`, `cluster_names` | +| `list_clusters` | Discover valid cluster names | `query` (optional filter) | +| `list_namespaces` | Discover valid namespace names | `query` (optional filter) | +| `get_label_values` | Enumerate teams, products, environments | `label_key` (e.g. `causely.ai/team`) | +| `get_integration_status` | Check scraper/integration coverage per cluster | `cluster_names` (optional filter) | + +### Health & triage +| Tool | Use when | Key params | +|---|---|---| +| `triage` | Named entity health check — fastest, deterministic | `entity_name`, `root_cause_id`, `root_cause_name`, `start_time/end_time` | +| `get_environment_health` | Global or namespace-scoped health overview | `namespaces`, `services`, `lookback_hours` | +| `get_service_summary` | Comprehensive single-service health (all dimensions) | `service` (substring), `lookback_hours` | +| `get_entity_health` | Health for non-Service entities (pods, DBs, queues) | `entity_id`, `lookback_hours` | +| `team_health` | All services owned by a team | `team` (partial match) | +| `ask_causely` | Free-form NL query, cross-entity synthesis | `question` | + +### Diagnosis +| Tool | Use when | Key params | +|---|---|---| +| `get_root_causes` | All active root causes (structured JSON with impact graphs) | `active_only`, `impacted_service_ids`, `symptom_ids`, `root_cause_name` | +| `get_symptoms` | Raw signals / historical timelines | `entity_ids`, `active_only`, `lookback_hours` | +| `get_alerts` | Raw alert history, mapped/unmapped status | `entity_ids`, `alert_name_filters`, `mapping_state_filters` | +| `get_logs` | Live entity logs OR stored evidence logs | `entity_id` XOR `root_cause_id`, `severity_filter` | +| `get_events` | Lifecycle events (deploys, restarts, scaling) | 
`entity_id`, `severity_filter`, `message_contains` | +| `get_slow_queries` | DB slow query analysis | `entity_ids` | + +### Observability data +| Tool | Use when | Key params | +|---|---|---| +| `get_metrics` | Numeric snapshots or time-series | `entity_ids`, `metrics`, `window_minutes` | +| `get_slo` | SLO state, error budget, burn rate | `entity_ids`, `only_at_risk`, `only_violated` | +| `get_config` | Raw config files for an entity | `entity_id`, `name_contains` | +| `get_topology` | Dependency/dependent/dataflow graph | `entity_id`, `mode`, `levels` | + +### Post-deploy & reliability +| Tool | Use when | Key params | +|---|---|---| +| `reliability_delta` | Single-service pre/post deploy comparison | `service`, `lookback_hours`, `window_minutes` | +| `fleet_reliability_delta` | Batch regression check across multiple services | `team`, `namespace`, `services`, `window_minutes` | + +### Reporting & actions +| Tool | Use when | Key params | +|---|---|---| +| `generate_ticket` | Create Jira/GitHub/Linear ticket draft | `task` | +| `postmortem` | Generate postmortem for a resolved incident | `root_cause_id` (preferred), or `root_cause_name` + `entity_name`, or `service` + `incident_start` | + +--- + +## Primary decision tree + +``` +Have a service name? +│ +├─ triage(entity_name="") ← 1 call, full picture +│ ├─ Root cause, symptoms, blast radius, customer impact, remediation: all here +│ ├─ description populated with specifics? → use it as evidence, skip get_logs +│ ├─ description generic ("Inspect logs...") AND has_stored_logs=true? +│ │ └─ get_logs(root_cause_id=, limit=10, severity_filter=ERROR) ← conditional 2nd call +│ └─ causely.ai/team label in entity.labels? → use it, skip team_health +│ └─ label absent? → team_health(team="") ← conditional 2nd call +│ +├─ Need metrics/SLOs/topology? (not in triage) +│ └─ get_entities(query="") → get_metrics / get_slo / get_topology +│ +└─ No name / system sweep? 
+ │ + ├─ get_environment_health() ← 1 call, overall status + └─ get_root_causes(active_only=true) ← 1 call, all active issues with evidence +``` + +--- + +## Entity resolution pattern + +Many tools require entity IDs. Use `get_entities` to resolve names first: + +``` +get_entities(query="checkout", entity_types=["Service"]) + → returns [{id: "abc-123", name: "otel-demo/checkoutservice", ...}] + → pass id to get_metrics, get_slo, get_topology, get_alerts, etc. +``` + +**Entity name format:** +| Type | Format | Example | +|---|---|---| +| K8s service | `namespace/service-name` | `default/animal-service` | +| ECS task / VM | `cluster/task-name-hash` | `chaos/quarkus-workshop-hero-service-2b62b3ef` | +| Node | AWS/GCP hostname | `ip-192-168-12-32.us-east-2.compute.internal` | + +--- + +## Evidence: description vs get_logs + +The `description` field on a root cause contains Causely's synthesised evidence — extracted log patterns, error messages, counts, and context assembled at detection time. Examples: + +- "Disk provider not supported... WARN mediation/scraper_manager.go:267 full resync failed {error: disk provider not supported, provider: kubernetes.io/aws-ebs} — Count: 22" +- "org.postgresql.util.PSQLException: The connection attempt failed — Count: 2" + +When description contains this level of specificity, **do not call `get_logs`**. The evidence is already there. + +Only call `get_logs` when description is generic (e.g. "Inspect the application logs for error messages or stack traces") AND `has_stored_logs=true`. Use `limit=10` and `severity_filter=ERROR`. + +--- + +## The six dimensions — where each comes from + +| Dimension | Source | Extra call needed? 
| +|---|---|---| +| Root cause | `triage.root_cause` or `get_root_causes[].name` | No | +| Evidence | `description` field on root cause | No — only call `get_logs` if description is generic AND `has_stored_logs=true` | +| Blast radius | `impacted_services` and `impact_service_graph` on root cause | No | +| Customer impact | `impacted_customers` on root cause | No | +| Owner / team | `entity.labels["causely.ai/team"]` | Only if label absent: `team_health(team=)` | +| Remediation | `remediation` field on root cause | No | + +--- + +## Owner resolution + +Check `entity.labels` in the triage or get_root_causes response first: +- `causely.ai/team` present → that is the owner. No extra call needed. +- `causely.ai/owner-scraper` is NOT a team name — it identifies the discovery mechanism. Never present this as the owner. +- `causely.ai/team` absent → `team_health(team="")`. Try the namespace name or service name prefix. +- `team_health` returns no match → "Owner not registered in Causely — check your service catalog (e.g. Backstage)" + +--- + +## Tool reliability fallbacks + +**`triage` returns "No Incident Data Found":** Service is likely healthy. Confirm with `get_root_causes(active_only=true)` to check system-wide, or accept the health verdict. + +**`get_root_causes` returns empty list:** No active root causes. Check `get_symptoms(active_only=true)` for undiagnosed signals — but only if the user specifically needs to know about raw alerts. + +**`get_logs` returns empty lines:** `has_stored_logs` may have been `false` or logs have expired. Note "No evidence logs available" — do not retry. + +**`team_health` returns no match:** Try a shorter partial name. If still no match, report "Not registered — check service catalog". + +**`get_entities` returns empty:** Try a broader query or check `list_namespaces` / `list_clusters` to discover valid scope values. + +**All tools error:** Tell the engineer which calls you would have made. Direct them to https://portal.causely.app. 
+ +--- + +## Output template + +### 🔴 / 🟡 / 🟢 [Service] — [Status] + +**Root cause:** [name + entity + portal link from triage/get_root_causes] + +**Evidence:** +- [from `description` field — quote specific log patterns or error messages] +- [if get_logs called: add 1–2 key ERROR lines as supplement] +- [if description generic and no logs: "No stored evidence (has_stored_logs=false)"] + +**Blast radius:** [from `impacted_services`, or "None identified"] + +**Customer impact:** [from `impacted_customers`, or "None identified"] + +**Owner / team:** [from `causely.ai/team` label, or `team_health` result, or "Not registered — check service catalog"] + +**Recommended actions:** [from `remediation` field] + +**Links:** [Causely portal links from response] diff --git a/skills/causely-correlated-incidents/SKILL.md b/skills/causely-correlated-incidents/SKILL.md new file mode 100644 index 0000000..bc828c4 --- /dev/null +++ b/skills/causely-correlated-incidents/SKILL.md @@ -0,0 +1,120 @@ +--- +name: causely-correlated-incidents +description: > + Use this skill when the user reports that multiple services are broken at the same time, suspects a shared root cause, or asks about cascading failures, blast radius, dependency chains, or "what else is this affecting". Trigger for questions like "multiple things are broken", "is this a widespread outage?", "what's the common cause across these services?", "which services are affected by the same root cause?", "is this a network issue hitting everything?", "are these incidents related?", "show me the blast radius", "what depends on X?", "what's the dependency chain?", or "trace the impact path". Also trigger when the initial incident triage reveals that more than one service has active root causes — treat that as a signal to pivot to correlation analysis. Do NOT skip this skill just because the user mentions only one service; if investigation reveals a broader pattern, switch to this skill. 
+--- + +# Causely Correlated Incidents Skill + +Read `references/complete-investigation.md` for the full 25-tool inventory and evidence strategy. + +--- + +## Core tools for correlation analysis + +| Tool | Use when | What it returns | +|---|---|---| +| `get_root_causes(active_only=true)` | All active issues — primary correlation tool | All RCs with `impact_service_graph` edges showing propagation paths | +| `triage(entity_name=)` | Named service cascade investigation | Per-entity root causes with impact graph | +| `get_topology(entity_id=, mode=)` | Full dependency/dependent graph (beyond active incidents) | Node + edge graph: dependencies, dependents, or dataflow | +| `get_alerts(entity_ids=)` | Alert correlation across entities | Firing alerts with mapping state — find unmapped shared alerts | +| `get_environment_health(namespaces=)` | Scoped health check for affected namespace | Overall status + active root causes in scope | +| `ask_causely(question=)` | Cross-entity synthesis when names aren't clear | Free-form NL query for broad pattern detection | + +--- + +## Core rule: one sweep, read the graphs + +**`get_root_causes(active_only=true)` returns everything needed for correlation in one call:** +- Each root cause includes `impact_service_graph.edges` — a node appearing as source in multiple graphs is the shared origin +- `impacted_services` shows blast radius per root cause +- `impacted_customers` shows customer-facing impact +- `description` is the synthesised evidence — read it, don't re-fetch it + +Do not follow up with `get_symptoms` — symptoms are already included in the root cause response. 
+ +--- + +## Decision tree + +**Widespread outage:** +``` +get_root_causes(active_only=true) ← 1 call + → look for shared node IDs across impact_service_graphs + → shared node = correlation origin + → description on that root cause = evidence + → impacted_customers across all RCs = customer impact + → done, unless description generic AND has_stored_logs=true: + → get_logs(root_cause_id=, limit=10, severity_filter=ERROR) ← optional 2nd call +``` + +**"Are these two incidents related?":** +``` +get_root_causes(active_only=true) ← 1 call (covers both services) + → compare impact_service_graph.nodes for shared IDs + → compare started_at — simultaneous = correlated + → done +``` + +**Named service, cascade suspected:** +``` +triage(entity_name="") ← 1 call + → read impact_service_graph: trace edges from root to leaves + → impacted_services = confirmed downstream blast radius + → done +``` + +**Full dependency graph (beyond active incidents):** +``` +get_entities(query="", entity_types=["Service"]) ← 1 call +get_topology(entity_id=, mode=dependents, levels=3) ← 1 call + → all services that call this entity (upstream blast radius victims) + → or mode=dependencies for what this entity calls (downstream risk) + → or mode=dataflow for full end-to-end data movement +``` + +**Alert-level correlation (shared alert patterns across services):** +``` +get_entities(query="") ← 1 call +get_entities(query="") ← 1 call +get_alerts(entity_ids=[id_a, id_b], active_only=true) ← 1 call + → shared alert_names across entities = correlated signals + → mapped alerts → get_root_causes(symptom_ids=) for cause +``` + +--- + +## Correlation methods + +1. **Impact graph overlap**: shared node IDs in `impact_service_graph` across multiple root causes → same origin +2. **Temporal correlation**: root causes with `started_at` within minutes of each other → likely same trigger +3. 
**Topology correlation**: `get_topology(mode=dependents)` shows all upstream callers — if the degraded entity is a shared dependency, all dependents are at risk +4. **Alert pattern correlation**: same `alert_name` firing across multiple entities simultaneously → shared infrastructure cause + +--- + +## Output format + +### 🔴 Multi-service incident summary + +**Affected services:** [from impacted_services across root causes] + +**Correlation:** ✅ Correlated / ⚠️ Partial / ❓ Unconfirmed — [origin entity if known] + +**Root cause:** [name + entity + portal link from get_root_causes] + +**Propagation path:** [from impact_service_graph edges, or get_topology if called] + +**Evidence:** [from description field; supplement with get_logs if generic AND has_stored_logs=true] + +**Blast radius:** [from impact_service_graph — total affected services count + names] + +**Customer impact:** [from impacted_customers] + +**Owner:** [from causely.ai/team label or team_health] + +**Timeline:** [started_at per root cause, in order] + +**Recommended action:** [from remediation field — single fix that resolves the origin] + +**Links:** [all portal links] diff --git a/skills/causely-correlated-incidents/references/complete-investigation.md b/skills/causely-correlated-incidents/references/complete-investigation.md new file mode 100644 index 0000000..d358e2b --- /dev/null +++ b/skills/causely-correlated-incidents/references/complete-investigation.md @@ -0,0 +1,178 @@ +# Complete Investigation Reference + +## Efficiency-first principle + +**`triage` is a complete answer in one call.** It returns root cause, symptoms, impacted services, impacted customers, remediation, and `has_stored_logs` — everything needed for a full six-dimension response. Do not follow it with separate `get_symptoms` or blast-radius calls; that data is already inside the triage response. 
+ +**`description` is Causely's pre-synthesised evidence.** When `get_root_causes` or `triage` returns a `description` field with specific log patterns, error messages, or metrics, that is the evidence. Do not call `get_logs` to regenerate it — Causely already did that work at detection time. Read `description` first. + +--- + +## Complete tool inventory (25 tools) + +### Discovery & inventory +| Tool | Use when | Key params | +|---|---|---| +| `get_entities` | Resolve names → entity IDs for any downstream tool | `query`, `entity_types`, `namespace_names`, `cluster_names` | +| `list_clusters` | Discover valid cluster names | `query` (optional filter) | +| `list_namespaces` | Discover valid namespace names | `query` (optional filter) | +| `get_label_values` | Enumerate teams, products, environments | `label_key` (e.g. `causely.ai/team`) | +| `get_integration_status` | Check scraper/integration coverage per cluster | `cluster_names` (optional filter) | + +### Health & triage +| Tool | Use when | Key params | +|---|---|---| +| `triage` | Named entity health check — fastest, deterministic | `entity_name`, `root_cause_id`, `root_cause_name`, `start_time/end_time` | +| `get_environment_health` | Global or namespace-scoped health overview | `namespaces`, `services`, `lookback_hours` | +| `get_service_summary` | Comprehensive single-service health (all dimensions) | `service` (substring), `lookback_hours` | +| `get_entity_health` | Health for non-Service entities (pods, DBs, queues) | `entity_id`, `lookback_hours` | +| `team_health` | All services owned by a team | `team` (partial match) | +| `ask_causely` | Free-form NL query, cross-entity synthesis | `question` | + +### Diagnosis +| Tool | Use when | Key params | +|---|---|---| +| `get_root_causes` | All active root causes (structured JSON with impact graphs) | `active_only`, `impacted_service_ids`, `symptom_ids`, `root_cause_name` | +| `get_symptoms` | Raw signals / historical timelines | `entity_ids`, `active_only`, 
`lookback_hours` | +| `get_alerts` | Raw alert history, mapped/unmapped status | `entity_ids`, `alert_name_filters`, `mapping_state_filters` | +| `get_logs` | Live entity logs OR stored evidence logs | `entity_id` XOR `root_cause_id`, `severity_filter` | +| `get_events` | Lifecycle events (deploys, restarts, scaling) | `entity_id`, `severity_filter`, `message_contains` | +| `get_slow_queries` | DB slow query analysis | `entity_ids` | + +### Observability data +| Tool | Use when | Key params | +|---|---|---| +| `get_metrics` | Numeric snapshots or time-series | `entity_ids`, `metrics`, `window_minutes` | +| `get_slo` | SLO state, error budget, burn rate | `entity_ids`, `only_at_risk`, `only_violated` | +| `get_config` | Raw config files for an entity | `entity_id`, `name_contains` | +| `get_topology` | Dependency/dependent/dataflow graph | `entity_id`, `mode`, `levels` | + +### Post-deploy & reliability +| Tool | Use when | Key params | +|---|---|---| +| `reliability_delta` | Single-service pre/post deploy comparison | `service`, `lookback_hours`, `window_minutes` | +| `fleet_reliability_delta` | Batch regression check across multiple services | `team`, `namespace`, `services`, `window_minutes` | + +### Reporting & actions +| Tool | Use when | Key params | +|---|---|---| +| `generate_ticket` | Create Jira/GitHub/Linear ticket draft | `task` | +| `postmortem` | Generate postmortem for a resolved incident | `root_cause_id` (preferred), or `root_cause_name` + `entity_name`, or `service` + `incident_start` | + +--- + +## Primary decision tree + +``` +Have a service name? +│ +├─ triage(entity_name="") ← 1 call, full picture +│ ├─ Root cause, symptoms, blast radius, customer impact, remediation: all here +│ ├─ description populated with specifics? → use it as evidence, skip get_logs +│ ├─ description generic ("Inspect logs...") AND has_stored_logs=true? 
+│ │ └─ get_logs(root_cause_id=, limit=10, severity_filter=ERROR) ← conditional 2nd call +│ └─ causely.ai/team label in entity.labels? → use it, skip team_health +│ └─ label absent? → team_health(team="") ← conditional 2nd call +│ +├─ Need metrics/SLOs/topology? (not in triage) +│ └─ get_entities(query="") → get_metrics / get_slo / get_topology +│ +└─ No name / system sweep? + │ + ├─ get_environment_health() ← 1 call, overall status + └─ get_root_causes(active_only=true) ← 1 call, all active issues with evidence +``` + +--- + +## Entity resolution pattern + +Many tools require entity IDs. Use `get_entities` to resolve names first: + +``` +get_entities(query="checkout", entity_types=["Service"]) + → returns [{id: "abc-123", name: "otel-demo/checkoutservice", ...}] + → pass id to get_metrics, get_slo, get_topology, get_alerts, etc. +``` + +**Entity name format:** +| Type | Format | Example | +|---|---|---| +| K8s service | `namespace/service-name` | `default/animal-service` | +| ECS task / VM | `cluster/task-name-hash` | `chaos/quarkus-workshop-hero-service-2b62b3ef` | +| Node | AWS/GCP hostname | `ip-192-168-12-32.us-east-2.compute.internal` | + +--- + +## Evidence: description vs get_logs + +The `description` field on a root cause contains Causely's synthesised evidence — extracted log patterns, error messages, counts, and context assembled at detection time. Examples: + +- "Disk provider not supported... WARN mediation/scraper_manager.go:267 full resync failed {error: disk provider not supported, provider: kubernetes.io/aws-ebs} — Count: 22" +- "org.postgresql.util.PSQLException: The connection attempt failed — Count: 2" + +When description contains this level of specificity, **do not call `get_logs`**. The evidence is already there. + +Only call `get_logs` when description is generic (e.g. "Inspect the application logs for error messages or stack traces") AND `has_stored_logs=true`. Use `limit=10` and `severity_filter=ERROR`. 
+ +--- + +## The six dimensions — where each comes from + +| Dimension | Source | Extra call needed? | +|---|---|---| +| Root cause | `triage.root_cause` or `get_root_causes[].name` | No | +| Evidence | `description` field on root cause | No — only call `get_logs` if description is generic AND `has_stored_logs=true` | +| Blast radius | `impacted_services` and `impact_service_graph` on root cause | No | +| Customer impact | `impacted_customers` on root cause | No | +| Owner / team | `entity.labels["causely.ai/team"]` | Only if label absent: `team_health(team=)` | +| Remediation | `remediation` field on root cause | No | + +--- + +## Owner resolution + +Check `entity.labels` in the triage or get_root_causes response first: +- `causely.ai/team` present → that is the owner. No extra call needed. +- `causely.ai/owner-scraper` is NOT a team name — it identifies the discovery mechanism. Never present this as the owner. +- `causely.ai/team` absent → `team_health(team="")`. Try the namespace name or service name prefix. +- `team_health` returns no match → "Owner not registered in Causely — check your service catalog (e.g. Backstage)" + +--- + +## Tool reliability fallbacks + +**`triage` returns "No Incident Data Found":** Service is likely healthy. Confirm with `get_root_causes(active_only=true)` to check system-wide, or accept the health verdict. + +**`get_root_causes` returns empty list:** No active root causes. Check `get_symptoms(active_only=true)` for undiagnosed signals — but only if the user specifically needs to know about raw alerts. + +**`get_logs` returns empty lines:** `has_stored_logs` may have been `false` or logs have expired. Note "No evidence logs available" — do not retry. + +**`team_health` returns no match:** Try a shorter partial name. If still no match, report "Not registered — check service catalog". + +**`get_entities` returns empty:** Try a broader query or check `list_namespaces` / `list_clusters` to discover valid scope values. 
+ +**All tools error:** Tell the engineer which calls you would have made. Direct them to https://portal.causely.app. + +--- + +## Output template + +### 🔴 / 🟡 / 🟢 [Service] — [Status] + +**Root cause:** [name + entity + portal link from triage/get_root_causes] + +**Evidence:** +- [from `description` field — quote specific log patterns or error messages] +- [if get_logs called: add 1–2 key ERROR lines as supplement] +- [if description generic and no logs: "No stored evidence (has_stored_logs=false)"] + +**Blast radius:** [from `impacted_services`, or "None identified"] + +**Customer impact:** [from `impacted_customers`, or "None identified"] + +**Owner / team:** [from `causely.ai/team` label, or `team_health` result, or "Not registered — check service catalog"] + +**Recommended actions:** [from `remediation` field] + +**Links:** [Causely portal links from response] diff --git a/skills/causely-health-reporting/SKILL.md b/skills/causely-health-reporting/SKILL.md new file mode 100644 index 0000000..59d0dfa --- /dev/null +++ b/skills/causely-health-reporting/SKILL.md @@ -0,0 +1,126 @@ +--- +name: causely-health-reporting +description: > + Use this skill when the user wants a scheduled, proactive, or summary view of system health — not an active incident. Trigger for requests like "give me the morning health report", "what's the state of the system?", "weekly reliability summary", "anything I should know before standup?", "system health overview", "how are our services doing overall?", "what's been flapping this week?", "generate a status update", "what should the on-call team watch out for?", "SLO status report", "environment health check", "namespace health", "full service report", or "are any SLOs at risk?". Also trigger when someone asks for a digest, briefing, or dashboard-style summary rather than asking about a specific incident. This skill focuses on trend-awareness and proactive risk identification, not reactive triage. 
+--- + +# Causely Health Reporting Skill + +Read `references/complete-investigation.md` for the full 25-tool inventory and evidence strategy. + +--- + +## Core tools for health reporting + +| Tool | Use when | What it returns | +|---|---|---| +| `get_environment_health()` | Global or namespace-scoped health overview | Overall status (HEALTHY/DEGRADED/CRITICAL) + active root causes + remediation | +| `get_service_summary(service=)` | Comprehensive single-service report | Symptoms, root causes, SLOs, metrics, deps, slow queries, events, error logs — all in one call | +| `get_root_causes(active_only=true)` | All active issues with evidence | Structured JSON: description, impacted_services, impacted_customers per RC | +| `team_health(team=)` | Team-scoped standup | Degraded/critical services first, healthy grouped at end | +| `get_entity_health(entity_id=)` | Non-service entity health (DBs, pods, queues) | Symptoms, root causes, events, logs, metrics for one entity | +| `get_slo(entity_ids=)` | SLO error budget and burn rate | Per-SLO: budget remaining %, burn rate, at-risk/violated flags | +| `ask_causely(question=)` | System-wide SLO overview (no entity IDs needed) | "Which services have SLOs at risk or violated?" 
| +| `get_symptoms(active_only=false, lookback_hours=N)` | Historical flapping/recurring signals | Timeline of symptom start/end for trend analysis | + +--- + +## Decision tree + +**Morning standup / system sweep (recommended path):** +``` +get_environment_health() ← 1 call + → overall status: HEALTHY / DEGRADED / CRITICAL + → active root causes with severity, remediation + → done for quick overview +``` + +For more detail on each root cause: +``` +get_root_causes(active_only=true) ← 1 call + → group by severity: Critical → High → Medium → Low + → description = evidence per issue + → impacted_customers = customer impact per issue + → entity.labels["causely.ai/team"] = owner (if set) + → done +``` + +**Namespace-scoped health:** +``` +get_environment_health(namespaces=["otel-demo"]) ← 1 call + → scoped status + root causes for that namespace only +``` + +**Full service report (all dimensions):** +``` +get_service_summary(service="") ← 1 call + → status, symptoms, root causes, SLOs, metrics, deps, + slow queries, events, error logs — everything in one call + → done — do NOT chain 5 separate tools +``` + +**SLO-focused report:** +``` +ask_causely("Which services have SLOs at risk or violated?") ← 1 call (no entity IDs needed) + → or if you have entity IDs: +get_entities(query="") → get_slo(entity_ids=[...], only_at_risk=true) +``` + +**Team standup:** +``` +team_health(team="") ← 1 call + → degraded/critical services listed first + → for each degraded: get_service_summary(service=) if full detail needed +``` + +**Weekly report / trend analysis:** +``` +get_root_causes(active_only=false, lookback_hours=168) ← 1 call + → count per service to find recurring offenders + → compare started_at / ended_at for flapping patterns +``` + +**Non-service entity health (DBs, queues, pods):** +``` +get_entities(query="", entity_types=["Database"]) ← 1 call +get_entity_health(entity_id=) ← 1 call + → symptoms, root causes, events, logs, metrics +``` + +--- + +## Output formats + +### 
Morning / standup briefing + +**🟢 / 🟡 / 🔴 System health: [from get_environment_health status]** +*[N] active root causes as of [time]* + +| Service | Root cause | Severity | Since | Evidence | Customer impact | Owner | +|---|---|---|---|---|---|---| +| [from response] | [name] | [sev] | [started_at] | [from description] | [impacted_customers or "none"] | [team label or "unknown"] | + +**SLOs at risk:** [from get_slo or ask_causely — list services with burn rate > 1.0 or violated] + +**Watch:** [anything Critical or active >6h] + +--- + +### Full service report + +**[Service] — [status from get_service_summary]** + +**Active issues:** [root causes with severity + remediation] +**SLOs:** [budget remaining + burn rate] +**Key metrics:** [CPU, memory, error rate, p99 latency from resource metrics section] +**Dependencies:** [health of upstream/downstream services] +**Recent events:** [deploys, restarts, config changes] + +--- + +### On-call handoff + +🔴 **Active now:** [severity · service · root cause · started_at] +🟡 **SLOs burning:** [services with burn rate > 1.0] +⚠️ **Owner gaps:** [services missing causely.ai/team label] +📋 **Watch list:** [services with recurring root causes in the past 24h] diff --git a/skills/causely-health-reporting/references/complete-investigation.md b/skills/causely-health-reporting/references/complete-investigation.md new file mode 100644 index 0000000..d358e2b --- /dev/null +++ b/skills/causely-health-reporting/references/complete-investigation.md @@ -0,0 +1,178 @@ +# Complete Investigation Reference + +## Efficiency-first principle + +**`triage` is a complete answer in one call.** It returns root cause, symptoms, impacted services, impacted customers, remediation, and `has_stored_logs` — everything needed for a full six-dimension response. Do not follow it with separate `get_symptoms` or blast-radius calls; that data is already inside the triage response. 
+ +**`description` is Causely's pre-synthesised evidence.** When `get_root_causes` or `triage` returns a `description` field with specific log patterns, error messages, or metrics, that is the evidence. Do not call `get_logs` to regenerate it — Causely already did that work at detection time. Read `description` first. + +--- + +## Complete tool inventory (25 tools) + +### Discovery & inventory +| Tool | Use when | Key params | +|---|---|---| +| `get_entities` | Resolve names → entity IDs for any downstream tool | `query`, `entity_types`, `namespace_names`, `cluster_names` | +| `list_clusters` | Discover valid cluster names | `query` (optional filter) | +| `list_namespaces` | Discover valid namespace names | `query` (optional filter) | +| `get_label_values` | Enumerate teams, products, environments | `label_key` (e.g. `causely.ai/team`) | +| `get_integration_status` | Check scraper/integration coverage per cluster | `cluster_names` (optional filter) | + +### Health & triage +| Tool | Use when | Key params | +|---|---|---| +| `triage` | Named entity health check — fastest, deterministic | `entity_name`, `root_cause_id`, `root_cause_name`, `start_time/end_time` | +| `get_environment_health` | Global or namespace-scoped health overview | `namespaces`, `services`, `lookback_hours` | +| `get_service_summary` | Comprehensive single-service health (all dimensions) | `service` (substring), `lookback_hours` | +| `get_entity_health` | Health for non-Service entities (pods, DBs, queues) | `entity_id`, `lookback_hours` | +| `team_health` | All services owned by a team | `team` (partial match) | +| `ask_causely` | Free-form NL query, cross-entity synthesis | `question` | + +### Diagnosis +| Tool | Use when | Key params | +|---|---|---| +| `get_root_causes` | All active root causes (structured JSON with impact graphs) | `active_only`, `impacted_service_ids`, `symptom_ids`, `root_cause_name` | +| `get_symptoms` | Raw signals / historical timelines | `entity_ids`, `active_only`, 
`lookback_hours` | +| `get_alerts` | Raw alert history, mapped/unmapped status | `entity_ids`, `alert_name_filters`, `mapping_state_filters` | +| `get_logs` | Live entity logs OR stored evidence logs | `entity_id` XOR `root_cause_id`, `severity_filter` | +| `get_events` | Lifecycle events (deploys, restarts, scaling) | `entity_id`, `severity_filter`, `message_contains` | +| `get_slow_queries` | DB slow query analysis | `entity_ids` | + +### Observability data +| Tool | Use when | Key params | +|---|---|---| +| `get_metrics` | Numeric snapshots or time-series | `entity_ids`, `metrics`, `window_minutes` | +| `get_slo` | SLO state, error budget, burn rate | `entity_ids`, `only_at_risk`, `only_violated` | +| `get_config` | Raw config files for an entity | `entity_id`, `name_contains` | +| `get_topology` | Dependency/dependent/dataflow graph | `entity_id`, `mode`, `levels` | + +### Post-deploy & reliability +| Tool | Use when | Key params | +|---|---|---| +| `reliability_delta` | Single-service pre/post deploy comparison | `service`, `lookback_hours`, `window_minutes` | +| `fleet_reliability_delta` | Batch regression check across multiple services | `team`, `namespace`, `services`, `window_minutes` | + +### Reporting & actions +| Tool | Use when | Key params | +|---|---|---| +| `generate_ticket` | Create Jira/GitHub/Linear ticket draft | `task` | +| `postmortem` | Generate postmortem for a resolved incident | `root_cause_id` (preferred), or `root_cause_name` + `entity_name`, or `service` + `incident_start` | + +--- + +## Primary decision tree + +``` +Have a service name? +│ +├─ triage(entity_name="") ← 1 call, full picture +│ ├─ Root cause, symptoms, blast radius, customer impact, remediation: all here +│ ├─ description populated with specifics? → use it as evidence, skip get_logs +│ ├─ description generic ("Inspect logs...") AND has_stored_logs=true? 
+│ │ └─ get_logs(root_cause_id=, limit=10, severity_filter=ERROR) ← conditional 2nd call +│ └─ causely.ai/team label in entity.labels? → use it, skip team_health +│ └─ label absent? → team_health(team="") ← conditional 2nd call +│ +├─ Need metrics/SLOs/topology? (not in triage) +│ └─ get_entities(query="") → get_metrics / get_slo / get_topology +│ +└─ No name / system sweep? + │ + ├─ get_environment_health() ← 1 call, overall status + └─ get_root_causes(active_only=true) ← 1 call, all active issues with evidence +``` + +--- + +## Entity resolution pattern + +Many tools require entity IDs. Use `get_entities` to resolve names first: + +``` +get_entities(query="checkout", entity_types=["Service"]) + → returns [{id: "abc-123", name: "otel-demo/checkoutservice", ...}] + → pass id to get_metrics, get_slo, get_topology, get_alerts, etc. +``` + +**Entity name format:** +| Type | Format | Example | +|---|---|---| +| K8s service | `namespace/service-name` | `default/animal-service` | +| ECS task / VM | `cluster/task-name-hash` | `chaos/quarkus-workshop-hero-service-2b62b3ef` | +| Node | AWS/GCP hostname | `ip-192-168-12-32.us-east-2.compute.internal` | + +--- + +## Evidence: description vs get_logs + +The `description` field on a root cause contains Causely's synthesised evidence — extracted log patterns, error messages, counts, and context assembled at detection time. Examples: + +- "Disk provider not supported... WARN mediation/scraper_manager.go:267 full resync failed {error: disk provider not supported, provider: kubernetes.io/aws-ebs} — Count: 22" +- "org.postgresql.util.PSQLException: The connection attempt failed — Count: 2" + +When description contains this level of specificity, **do not call `get_logs`**. The evidence is already there. + +Only call `get_logs` when description is generic (e.g. "Inspect the application logs for error messages or stack traces") AND `has_stored_logs=true`. Use `limit=10` and `severity_filter=ERROR`. 
+ +--- + +## The six dimensions — where each comes from + +| Dimension | Source | Extra call needed? | +|---|---|---| +| Root cause | `triage.root_cause` or `get_root_causes[].name` | No | +| Evidence | `description` field on root cause | No — only call `get_logs` if description is generic AND `has_stored_logs=true` | +| Blast radius | `impacted_services` and `impact_service_graph` on root cause | No | +| Customer impact | `impacted_customers` on root cause | No | +| Owner / team | `entity.labels["causely.ai/team"]` | Only if label absent: `team_health(team=)` | +| Remediation | `remediation` field on root cause | No | + +--- + +## Owner resolution + +Check `entity.labels` in the triage or get_root_causes response first: +- `causely.ai/team` present → that is the owner. No extra call needed. +- `causely.ai/owner-scraper` is NOT a team name — it identifies the discovery mechanism. Never present this as the owner. +- `causely.ai/team` absent → `team_health(team="")`. Try the namespace name or service name prefix. +- `team_health` returns no match → "Owner not registered in Causely — check your service catalog (e.g. Backstage)" + +--- + +## Tool reliability fallbacks + +**`triage` returns "No Incident Data Found":** Service is likely healthy. Confirm with `get_root_causes(active_only=true)` to check system-wide, or accept the health verdict. + +**`get_root_causes` returns empty list:** No active root causes. Check `get_symptoms(active_only=true)` for undiagnosed signals — but only if the user specifically needs to know about raw alerts. + +**`get_logs` returns empty lines:** `has_stored_logs` may have been `false` or logs have expired. Note "No evidence logs available" — do not retry. + +**`team_health` returns no match:** Try a shorter partial name. If still no match, report "Not registered — check service catalog". + +**`get_entities` returns empty:** Try a broader query or check `list_namespaces` / `list_clusters` to discover valid scope values. 
+ +**All tools error:** Tell the engineer which calls you would have made. Direct them to https://portal.causely.app. + +--- + +## Output template + +### 🔴 / 🟡 / 🟢 [Service] — [Status] + +**Root cause:** [name + entity + portal link from triage/get_root_causes] + +**Evidence:** +- [from `description` field — quote specific log patterns or error messages] +- [if get_logs called: add 1–2 key ERROR lines as supplement] +- [if description generic and no logs: "No stored evidence (has_stored_logs=false)"] + +**Blast radius:** [from `impacted_services`, or "None identified"] + +**Customer impact:** [from `impacted_customers`, or "None identified"] + +**Owner / team:** [from `causely.ai/team` label, or `team_health` result, or "Not registered — check service catalog"] + +**Recommended actions:** [from `remediation` field] + +**Links:** [Causely portal links from response] diff --git a/skills/causely-k8s-investigation/SKILL.md b/skills/causely-k8s-investigation/SKILL.md new file mode 100644 index 0000000..a9d7a18 --- /dev/null +++ b/skills/causely-k8s-investigation/SKILL.md @@ -0,0 +1,122 @@ +--- +name: causely-k8s-investigation +description: > + Use this skill when the user asks about Kubernetes infrastructure health: nodes, pods, namespaces, deployments, DaemonSets, containers, or infra-level issues like OOMKills, node pressure, pod restarts, scheduling failures, resource exhaustion, CrashLoopBackOff, or evictions. Trigger for questions like "why did my pod restart?", "is node X under pressure?", "what's wrong with the chaos namespace?", "are any nodes unhealthy?", "why is my deployment not scaling?", "show me container resource usage", "what events happened on this pod?", "show me the config for this deployment", or any question involving k8s primitives rather than application-level services. 
Always use this skill — not just the generic causely-mcp skill — when the question is clearly about infrastructure or workload scheduling rather than application SLOs or business metrics. +--- + +# Causely K8s Investigation Skill + +Read `references/complete-investigation.md` for the full 25-tool inventory and evidence strategy. + +--- + +## Core tools for K8s investigation + +| Tool | Use when | What it returns | +|---|---|---| +| `triage(entity_name=)` | Service-level health check — always start here | Root causes with infra-layer evidence (OOMKill, pod failure, memory pressure) | +| `get_entities(query=, entity_types=)` | Resolve K8s entities to IDs | Entity IDs for pods, containers, nodes, databases | +| `get_entity_health(entity_id=)` | Non-service entity health (pods, nodes, DBs, containers) | Symptoms, root causes, events, logs, metrics for one entity | +| `get_events(entity_id=)` | Lifecycle events (restarts, scaling, scheduling) | Timestamped events: OOMKill, CrashLoopBackOff, eviction, deploy, config change | +| `get_config(entity_id=)` | Inspect K8s manifests and resource specs | Raw config files: deployment spec, resource limits, HPA config | +| `get_metrics(entity_ids=, metrics=)` | Container/pod resource utilisation | CPU, memory, network I/O snapshots or time-series | +| `get_logs(entity_id=)` | Live container/pod logs | Real-time log stream for a running entity | +| `get_root_causes(active_only=true)` | System-wide infra sweep | All active RCs — filter for K8s-related root causes | +| `list_namespaces()` | Discover valid namespaces | Namespace names for scoping investigations | +| `list_clusters()` | Discover valid clusters | Cluster names for multi-cluster queries | + +--- + +## Entity name format + +| Type | Format | Example | +|---|---|---| +| K8s service | `namespace/service-name` | `default/animal-service` | +| ECS task / VM | `cluster/task-name-hash` | `chaos/quarkus-workshop-hero-service-2b62b3ef` | +| Node | AWS/GCP hostname | 
`ip-192-168-12-32.us-east-2.compute.internal` | + +--- + +## Decision tree + +**Service name known — start at service level:** +``` +triage(entity_name="") ← 1 call + → infra root causes: "Memory congestion", "Pod Failure", "OOMKill", "Node pressure" + → description = evidence (memory %, restart counts, disk errors) + → impacted_services = blast radius + → done +``` + +**Need pod/container-level detail:** +``` +get_entities(query="", entity_types=["Container","Pod"]) ← 1 call +get_entity_health(entity_id=) ← 1 call + → symptoms, root causes, events, logs, metrics for that specific entity +``` + +**Why did my pod restart?** +``` +get_entities(query="") ← 1 call +get_events(entity_id=, severity_filter=WARNING) ← 1 call + → look for OOMKill, CrashLoopBackOff, Evicted events with timestamps + → if OOMKill: get_config(entity_id=) to check resource limits + → if CrashLoopBackOff: get_logs(entity_id=, limit=20, severity_filter=ERROR) +``` + +**Resource utilisation check:** +``` +get_entities(query="", entity_types=["Service"]) ← 1 call +get_metrics(entity_ids=[id], metrics=["cpu_usage", "memory_usage", "memory_limit"]) ← 1 call + → compare usage vs limits + → if near limit: check get_config for resource requests/limits +``` + +**Inspect K8s config / resource limits:** +``` +get_entities(query="") ← 1 call +get_config(entity_id=) ← 1 call + → deployment spec, resource limits, HPA config, environment variables +``` + +**Service name unknown / namespace sweep:** +``` +get_environment_health(namespaces=[""]) ← 1 call + → overall namespace status + active root causes + → or: +get_root_causes(active_only=true) ← 1 call + → filter for namespace/entity names matching the namespace + → description = evidence for each RC + → only triage the single highest-severity hit for detail +``` + +**Triage returns "No Incident Data Found":** +- Service is healthy at the service level — the infra issue may be at pod/container level +- Try `get_entities(query="")` → 
`get_entity_health(entity_id=)` for pod-level health +- Or `get_root_causes(active_only=true)` and filter for the entity name pattern + +--- + +## Output format + +### 🔴 / 🟡 / 🟢 [Service/Entity] — [Status] + +**Root cause (infra layer):** [name + entity + portal link] + +**Evidence:** [from description field — specific metrics, counts, log patterns; supplement with get_logs only if description is generic AND has_stored_logs=true] + +**Resource state:** [from get_metrics if called — CPU/memory usage vs limits] + +**Configuration:** [from get_config if called — relevant resource limits, HPA settings] + +**Recent events:** [from get_events if called — OOMKill, restarts, scaling events with timestamps] + +**Blast radius:** [from impacted_services, or "None identified"] + +**Customer impact:** [from impacted_customers, or "None identified"] + +**Owner / team:** [from causely.ai/team label or team_health, or "Not registered"] + +**Recommended actions:** [from remediation field + k8s-specific steps: adjust resource limits, cordon/drain node, review HPA, check liveness probes] + +**Links:** [portal links from response] diff --git a/skills/causely-k8s-investigation/references/complete-investigation.md b/skills/causely-k8s-investigation/references/complete-investigation.md new file mode 100644 index 0000000..d358e2b --- /dev/null +++ b/skills/causely-k8s-investigation/references/complete-investigation.md @@ -0,0 +1,178 @@ +# Complete Investigation Reference + +## Efficiency-first principle + +**`triage` is a complete answer in one call.** It returns root cause, symptoms, impacted services, impacted customers, remediation, and `has_stored_logs` — everything needed for a full six-dimension response. Do not follow it with separate `get_symptoms` or blast-radius calls; that data is already inside the triage response. 
+ +**`description` is Causely's pre-synthesised evidence.** When `get_root_causes` or `triage` returns a `description` field with specific log patterns, error messages, or metrics, that is the evidence. Do not call `get_logs` to regenerate it — Causely already did that work at detection time. Read `description` first. + +--- + +## Complete tool inventory (25 tools) + +### Discovery & inventory +| Tool | Use when | Key params | +|---|---|---| +| `get_entities` | Resolve names → entity IDs for any downstream tool | `query`, `entity_types`, `namespace_names`, `cluster_names` | +| `list_clusters` | Discover valid cluster names | `query` (optional filter) | +| `list_namespaces` | Discover valid namespace names | `query` (optional filter) | +| `get_label_values` | Enumerate teams, products, environments | `label_key` (e.g. `causely.ai/team`) | +| `get_integration_status` | Check scraper/integration coverage per cluster | `cluster_names` (optional filter) | + +### Health & triage +| Tool | Use when | Key params | +|---|---|---| +| `triage` | Named entity health check — fastest, deterministic | `entity_name`, `root_cause_id`, `root_cause_name`, `start_time/end_time` | +| `get_environment_health` | Global or namespace-scoped health overview | `namespaces`, `services`, `lookback_hours` | +| `get_service_summary` | Comprehensive single-service health (all dimensions) | `service` (substring), `lookback_hours` | +| `get_entity_health` | Health for non-Service entities (pods, DBs, queues) | `entity_id`, `lookback_hours` | +| `team_health` | All services owned by a team | `team` (partial match) | +| `ask_causely` | Free-form NL query, cross-entity synthesis | `question` | + +### Diagnosis +| Tool | Use when | Key params | +|---|---|---| +| `get_root_causes` | All active root causes (structured JSON with impact graphs) | `active_only`, `impacted_service_ids`, `symptom_ids`, `root_cause_name` | +| `get_symptoms` | Raw signals / historical timelines | `entity_ids`, `active_only`, 
`lookback_hours` | +| `get_alerts` | Raw alert history, mapped/unmapped status | `entity_ids`, `alert_name_filters`, `mapping_state_filters` | +| `get_logs` | Live entity logs OR stored evidence logs | `entity_id` XOR `root_cause_id`, `severity_filter` | +| `get_events` | Lifecycle events (deploys, restarts, scaling) | `entity_id`, `severity_filter`, `message_contains` | +| `get_slow_queries` | DB slow query analysis | `entity_ids` | + +### Observability data +| Tool | Use when | Key params | +|---|---|---| +| `get_metrics` | Numeric snapshots or time-series | `entity_ids`, `metrics`, `window_minutes` | +| `get_slo` | SLO state, error budget, burn rate | `entity_ids`, `only_at_risk`, `only_violated` | +| `get_config` | Raw config files for an entity | `entity_id`, `name_contains` | +| `get_topology` | Dependency/dependent/dataflow graph | `entity_id`, `mode`, `levels` | + +### Post-deploy & reliability +| Tool | Use when | Key params | +|---|---|---| +| `reliability_delta` | Single-service pre/post deploy comparison | `service`, `lookback_hours`, `window_minutes` | +| `fleet_reliability_delta` | Batch regression check across multiple services | `team`, `namespace`, `services`, `window_minutes` | + +### Reporting & actions +| Tool | Use when | Key params | +|---|---|---| +| `generate_ticket` | Create Jira/GitHub/Linear ticket draft | `task` | +| `postmortem` | Generate postmortem for a resolved incident | `root_cause_id` (preferred), or `root_cause_name` + `entity_name`, or `service` + `incident_start` | + +--- + +## Primary decision tree + +``` +Have a service name? +│ +├─ triage(entity_name="") ← 1 call, full picture +│ ├─ Root cause, symptoms, blast radius, customer impact, remediation: all here +│ ├─ description populated with specifics? → use it as evidence, skip get_logs +│ ├─ description generic ("Inspect logs...") AND has_stored_logs=true? 
+│ │ └─ get_logs(root_cause_id=, limit=10, severity_filter=ERROR) ← conditional 2nd call +│ └─ causely.ai/team label in entity.labels? → use it, skip team_health +│ └─ label absent? → team_health(team="") ← conditional 2nd call +│ +├─ Need metrics/SLOs/topology? (not in triage) +│ └─ get_entities(query="") → get_metrics / get_slo / get_topology +│ +└─ No name / system sweep? + │ + ├─ get_environment_health() ← 1 call, overall status + └─ get_root_causes(active_only=true) ← 1 call, all active issues with evidence +``` + +--- + +## Entity resolution pattern + +Many tools require entity IDs. Use `get_entities` to resolve names first: + +``` +get_entities(query="checkout", entity_types=["Service"]) + → returns [{id: "abc-123", name: "otel-demo/checkoutservice", ...}] + → pass id to get_metrics, get_slo, get_topology, get_alerts, etc. +``` + +**Entity name format:** +| Type | Format | Example | +|---|---|---| +| K8s service | `namespace/service-name` | `default/animal-service` | +| ECS task / VM | `cluster/task-name-hash` | `chaos/quarkus-workshop-hero-service-2b62b3ef` | +| Node | AWS/GCP hostname | `ip-192-168-12-32.us-east-2.compute.internal` | + +--- + +## Evidence: description vs get_logs + +The `description` field on a root cause contains Causely's synthesised evidence — extracted log patterns, error messages, counts, and context assembled at detection time. Examples: + +- "Disk provider not supported... WARN mediation/scraper_manager.go:267 full resync failed {error: disk provider not supported, provider: kubernetes.io/aws-ebs} — Count: 22" +- "org.postgresql.util.PSQLException: The connection attempt failed — Count: 2" + +When description contains this level of specificity, **do not call `get_logs`**. The evidence is already there. + +Only call `get_logs` when description is generic (e.g. "Inspect the application logs for error messages or stack traces") AND `has_stored_logs=true`. Use `limit=10` and `severity_filter=ERROR`. 
+ +--- + +## The six dimensions — where each comes from + +| Dimension | Source | Extra call needed? | +|---|---|---| +| Root cause | `triage.root_cause` or `get_root_causes[].name` | No | +| Evidence | `description` field on root cause | No — only call `get_logs` if description is generic AND `has_stored_logs=true` | +| Blast radius | `impacted_services` and `impact_service_graph` on root cause | No | +| Customer impact | `impacted_customers` on root cause | No | +| Owner / team | `entity.labels["causely.ai/team"]` | Only if label absent: `team_health(team=)` | +| Remediation | `remediation` field on root cause | No | + +--- + +## Owner resolution + +Check `entity.labels` in the triage or get_root_causes response first: +- `causely.ai/team` present → that is the owner. No extra call needed. +- `causely.ai/owner-scraper` is NOT a team name — it identifies the discovery mechanism. Never present this as the owner. +- `causely.ai/team` absent → `team_health(team="")`. Try the namespace name or service name prefix. +- `team_health` returns no match → "Owner not registered in Causely — check your service catalog (e.g. Backstage)" + +--- + +## Tool reliability fallbacks + +**`triage` returns "No Incident Data Found":** Service is likely healthy. Confirm with `get_root_causes(active_only=true)` to check system-wide, or accept the health verdict. + +**`get_root_causes` returns empty list:** No active root causes. Check `get_symptoms(active_only=true)` for undiagnosed signals — but only if the user specifically needs to know about raw alerts. + +**`get_logs` returns empty lines:** `has_stored_logs` may have been `false` or logs have expired. Note "No evidence logs available" — do not retry. + +**`team_health` returns no match:** Try a shorter partial name. If still no match, report "Not registered — check service catalog". + +**`get_entities` returns empty:** Try a broader query or check `list_namespaces` / `list_clusters` to discover valid scope values. 
+ +**All tools error:** Tell the engineer which calls you would have made. Direct them to https://portal.causely.app. + +--- + +## Output template + +### 🔴 / 🟡 / 🟢 [Service] — [Status] + +**Root cause:** [name + entity + portal link from triage/get_root_causes] + +**Evidence:** +- [from `description` field — quote specific log patterns or error messages] +- [if get_logs called: add 1–2 key ERROR lines as supplement] +- [if description generic and no logs: "No stored evidence (has_stored_logs=false)"] + +**Blast radius:** [from `impacted_services`, or "None identified"] + +**Customer impact:** [from `impacted_customers`, or "None identified"] + +**Owner / team:** [from `causely.ai/team` label, or `team_health` result, or "Not registered — check service catalog"] + +**Recommended actions:** [from `remediation` field] + +**Links:** [Causely portal links from response] diff --git a/skills/causely-postmortem/SKILL.md b/skills/causely-postmortem/SKILL.md new file mode 100644 index 0000000..fd86b87 --- /dev/null +++ b/skills/causely-postmortem/SKILL.md @@ -0,0 +1,117 @@ +--- +name: causely-postmortem +description: > + Use this skill when the user wants to generate a postmortem, incident retrospective, incident report, or blameless review for a resolved or active incident. Trigger for questions like "write a postmortem for the checkout incident", "generate an incident report", "create a retrospective for last night's outage", "what happened with X? write it up", "incident summary for the team", "create a ticket for the remediation", or "draft a Jira ticket for this fix". Also trigger when someone asks to document an incident for future reference, create action items from an incident, or generate a structured engineering ticket. This skill covers both the postmortem tool and the generate_ticket tool. +--- + +# Causely Postmortem & Ticket Skill + +Read `references/complete-investigation.md` for the full 25-tool inventory and evidence strategy. 
+ +--- + +## Core tools for postmortems and tickets + +| Tool | Use when | What it returns | +|---|---|---| +| `postmortem(root_cause_id=)` | Generate full postmortem from Causely data | Markdown + structured fields: title, summary, timeline, root cause, blast radius, contributing factors, action items | +| `generate_ticket(task=)` | Create an engineering ticket draft | Structured JSON: title, description, context, requirements, acceptance criteria, notes | +| `get_root_causes(active_only=false, lookback_hours=N)` | Find the root cause ID for postmortem | Historical root causes with IDs | +| `triage(entity_name=, start_time=, end_time=)` | Scoped incident summary for a time window | Markdown narrative with root causes, symptoms, impact | +| `get_events(entity_id=)` | Build incident timeline | Lifecycle events (deploys, restarts, config changes) | +| `get_symptoms(active_only=false, lookback_hours=N)` | Reconstruct signal timeline | Historical symptom start/end for timeline building | + +--- + +## Decision tree + +**Generate postmortem — root cause ID known:** +``` +postmortem(root_cause_id="") ← 1 call + → complete postmortem: title, summary, timeline, root cause, + blast radius, contributing factors, action items + → done +``` + +**Generate postmortem — root cause ID unknown:** +``` +get_root_causes(active_only=false, lookback_hours=48, root_cause_name="") ← 1 call + → find the matching root cause, get its ID + → or: triage(entity_name="", start_time=, end_time=) to find RCs in window + +postmortem(root_cause_id=) ← 2nd call + → complete postmortem +``` + +**Generate postmortem — by service + time window (legacy path):** +``` +postmortem(service="", incident_start="2025-03-14T00:00:00Z") ← 1 call + → postmortem scoped to that service and time +``` + +**Generate postmortem — by root cause name:** +``` +postmortem(root_cause_name="", entity_name="") ← 1 call + → if ambiguous: returns ambiguity_candidates → re-submit with root_cause_id +``` + +**Enrich postmortem 
with additional context:** +``` +get_entities(query="") → get_events(entity_id=) ← timeline enrichment +get_symptoms(active_only=false, lookback_hours=48, entity_ids=[id]) ← signal timeline + → add deploy events, symptom transitions to the postmortem narrative +``` + +**Generate remediation ticket from postmortem:** +``` +postmortem(root_cause_id=) ← 1 call + → extract action items from postmortem +generate_ticket(task="") ← 1 call per ticket + → structured ticket: title, description, acceptance criteria +``` + +**Generate ticket without postmortem (standalone):** +``` +generate_ticket(task="") ← 1 call + → Jira/GitHub/Linear-ready ticket draft +``` + +--- + +## Postmortem input priority + +Use the first applicable lookup path: +1. **`root_cause_id`** — preferred; directly identifies the root cause +2. **`root_cause_name` + `entity_name`** — resolves by name; returns candidates if multiple match +3. **`service` + `incident_start`** — legacy path; requires service name and RFC3339 start time + +`incident_id` alone is not resolvable — always pair it with one of the paths above. + +--- + +## Output format + +### 📋 Incident postmortem + +[Postmortem markdown from the `postmortem` tool — includes title, summary, timeline, root cause analysis, blast radius, contributing factors, and action items] + +--- + +### 🎫 Remediation tickets + +For each action item from the postmortem: + +**Title:** [from generate_ticket] +**Priority:** [inferred from severity] +**Description:** [from generate_ticket — context + requirements] +**Acceptance criteria:** [from generate_ticket] + +--- + +## Important behaviours + +- **Prefer `root_cause_id`** over other lookup paths — it's the most reliable and unambiguous. +- **Handle ambiguity gracefully**: if `postmortem(root_cause_name=)` returns `ambiguity_candidates`, present the candidates to the user and ask them to pick one, then re-call with the selected `root_cause_id`. 
+- **Don't re-investigate**: the postmortem tool synthesises from Causely's data layer. Do not separately call triage + get_root_causes + get_logs to rebuild what postmortem already returns. +- **Tickets are forward-looking**: use `generate_ticket` for remediation work, not for documenting what happened (that's the postmortem). +- **Surface portal links** so engineers can drill into the Causely data behind the postmortem. diff --git a/skills/causely-postmortem/references/complete-investigation.md b/skills/causely-postmortem/references/complete-investigation.md new file mode 100644 index 0000000..d358e2b --- /dev/null +++ b/skills/causely-postmortem/references/complete-investigation.md @@ -0,0 +1,178 @@ +# Complete Investigation Reference + +## Efficiency-first principle + +**`triage` is a complete answer in one call.** It returns root cause, symptoms, impacted services, impacted customers, remediation, and `has_stored_logs` — everything needed for a full six-dimension response. Do not follow it with separate `get_symptoms` or blast-radius calls; that data is already inside the triage response. + +**`description` is Causely's pre-synthesised evidence.** When `get_root_causes` or `triage` returns a `description` field with specific log patterns, error messages, or metrics, that is the evidence. Do not call `get_logs` to regenerate it — Causely already did that work at detection time. Read `description` first. + +--- + +## Complete tool inventory (25 tools) + +### Discovery & inventory +| Tool | Use when | Key params | +|---|---|---| +| `get_entities` | Resolve names → entity IDs for any downstream tool | `query`, `entity_types`, `namespace_names`, `cluster_names` | +| `list_clusters` | Discover valid cluster names | `query` (optional filter) | +| `list_namespaces` | Discover valid namespace names | `query` (optional filter) | +| `get_label_values` | Enumerate teams, products, environments | `label_key` (e.g. 
`causely.ai/team`) | +| `get_integration_status` | Check scraper/integration coverage per cluster | `cluster_names` (optional filter) | + +### Health & triage +| Tool | Use when | Key params | +|---|---|---| +| `triage` | Named entity health check — fastest, deterministic | `entity_name`, `root_cause_id`, `root_cause_name`, `start_time/end_time` | +| `get_environment_health` | Global or namespace-scoped health overview | `namespaces`, `services`, `lookback_hours` | +| `get_service_summary` | Comprehensive single-service health (all dimensions) | `service` (substring), `lookback_hours` | +| `get_entity_health` | Health for non-Service entities (pods, DBs, queues) | `entity_id`, `lookback_hours` | +| `team_health` | All services owned by a team | `team` (partial match) | +| `ask_causely` | Free-form NL query, cross-entity synthesis | `question` | + +### Diagnosis +| Tool | Use when | Key params | +|---|---|---| +| `get_root_causes` | All active root causes (structured JSON with impact graphs) | `active_only`, `impacted_service_ids`, `symptom_ids`, `root_cause_name` | +| `get_symptoms` | Raw signals / historical timelines | `entity_ids`, `active_only`, `lookback_hours` | +| `get_alerts` | Raw alert history, mapped/unmapped status | `entity_ids`, `alert_name_filters`, `mapping_state_filters` | +| `get_logs` | Live entity logs OR stored evidence logs | `entity_id` XOR `root_cause_id`, `severity_filter` | +| `get_events` | Lifecycle events (deploys, restarts, scaling) | `entity_id`, `severity_filter`, `message_contains` | +| `get_slow_queries` | DB slow query analysis | `entity_ids` | + +### Observability data +| Tool | Use when | Key params | +|---|---|---| +| `get_metrics` | Numeric snapshots or time-series | `entity_ids`, `metrics`, `window_minutes` | +| `get_slo` | SLO state, error budget, burn rate | `entity_ids`, `only_at_risk`, `only_violated` | +| `get_config` | Raw config files for an entity | `entity_id`, `name_contains` | +| `get_topology` | 
Dependency/dependent/dataflow graph | `entity_id`, `mode`, `levels` | + +### Post-deploy & reliability +| Tool | Use when | Key params | +|---|---|---| +| `reliability_delta` | Single-service pre/post deploy comparison | `service`, `lookback_hours`, `window_minutes` | +| `fleet_reliability_delta` | Batch regression check across multiple services | `team`, `namespace`, `services`, `window_minutes` | + +### Reporting & actions +| Tool | Use when | Key params | +|---|---|---| +| `generate_ticket` | Create Jira/GitHub/Linear ticket draft | `task` | +| `postmortem` | Generate postmortem for a resolved incident | `root_cause_id` (preferred), or `root_cause_name` + `entity_name`, or `service` + `incident_start` | + +--- + +## Primary decision tree + +``` +Have a service name? +│ +├─ triage(entity_name="") ← 1 call, full picture +│ ├─ Root cause, symptoms, blast radius, customer impact, remediation: all here +│ ├─ description populated with specifics? → use it as evidence, skip get_logs +│ ├─ description generic ("Inspect logs...") AND has_stored_logs=true? +│ │ └─ get_logs(root_cause_id=, limit=10, severity_filter=ERROR) ← conditional 2nd call +│ └─ causely.ai/team label in entity.labels? → use it, skip team_health +│ └─ label absent? → team_health(team="") ← conditional 2nd call +│ +├─ Need metrics/SLOs/topology? (not in triage) +│ └─ get_entities(query="") → get_metrics / get_slo / get_topology +│ +└─ No name / system sweep? + │ + ├─ get_environment_health() ← 1 call, overall status + └─ get_root_causes(active_only=true) ← 1 call, all active issues with evidence +``` + +--- + +## Entity resolution pattern + +Many tools require entity IDs. Use `get_entities` to resolve names first: + +``` +get_entities(query="checkout", entity_types=["Service"]) + → returns [{id: "abc-123", name: "otel-demo/checkoutservice", ...}] + → pass id to get_metrics, get_slo, get_topology, get_alerts, etc. 
+``` + +**Entity name format:** +| Type | Format | Example | +|---|---|---| +| K8s service | `namespace/service-name` | `default/animal-service` | +| ECS task / VM | `cluster/task-name-hash` | `chaos/quarkus-workshop-hero-service-2b62b3ef` | +| Node | AWS/GCP hostname | `ip-192-168-12-32.us-east-2.compute.internal` | + +--- + +## Evidence: description vs get_logs + +The `description` field on a root cause contains Causely's synthesised evidence — extracted log patterns, error messages, counts, and context assembled at detection time. Examples: + +- "Disk provider not supported... WARN mediation/scraper_manager.go:267 full resync failed {error: disk provider not supported, provider: kubernetes.io/aws-ebs} — Count: 22" +- "org.postgresql.util.PSQLException: The connection attempt failed — Count: 2" + +When description contains this level of specificity, **do not call `get_logs`**. The evidence is already there. + +Only call `get_logs` when description is generic (e.g. "Inspect the application logs for error messages or stack traces") AND `has_stored_logs=true`. Use `limit=10` and `severity_filter=ERROR`. + +--- + +## The six dimensions — where each comes from + +| Dimension | Source | Extra call needed? | +|---|---|---| +| Root cause | `triage.root_cause` or `get_root_causes[].name` | No | +| Evidence | `description` field on root cause | No — only call `get_logs` if description is generic AND `has_stored_logs=true` | +| Blast radius | `impacted_services` and `impact_service_graph` on root cause | No | +| Customer impact | `impacted_customers` on root cause | No | +| Owner / team | `entity.labels["causely.ai/team"]` | Only if label absent: `team_health(team=)` | +| Remediation | `remediation` field on root cause | No | + +--- + +## Owner resolution + +Check `entity.labels` in the triage or get_root_causes response first: +- `causely.ai/team` present → that is the owner. No extra call needed. 
+- `causely.ai/owner-scraper` is NOT a team name — it identifies the discovery mechanism. Never present this as the owner. +- `causely.ai/team` absent → `team_health(team="")`. Try the namespace name or service name prefix. +- `team_health` returns no match → "Owner not registered in Causely — check your service catalog (e.g. Backstage)" + +--- + +## Tool reliability fallbacks + +**`triage` returns "No Incident Data Found":** Service is likely healthy. Confirm with `get_root_causes(active_only=true)` to check system-wide, or accept the health verdict. + +**`get_root_causes` returns empty list:** No active root causes. Check `get_symptoms(active_only=true)` for undiagnosed signals — but only if the user specifically needs to know about raw alerts. + +**`get_logs` returns empty lines:** `has_stored_logs` may have been `false` or logs have expired. Note "No evidence logs available" — do not retry. + +**`team_health` returns no match:** Try a shorter partial name. If still no match, report "Not registered — check service catalog". + +**`get_entities` returns empty:** Try a broader query or check `list_namespaces` / `list_clusters` to discover valid scope values. + +**All tools error:** Tell the engineer which calls you would have made. Direct them to https://portal.causely.app. 
+ +--- + +## Output template + +### 🔴 / 🟡 / 🟢 [Service] — [Status] + +**Root cause:** [name + entity + portal link from triage/get_root_causes] + +**Evidence:** +- [from `description` field — quote specific log patterns or error messages] +- [if get_logs called: add 1–2 key ERROR lines as supplement] +- [if description generic and no logs: "No stored evidence (has_stored_logs=false)"] + +**Blast radius:** [from `impacted_services`, or "None identified"] + +**Customer impact:** [from `impacted_customers`, or "None identified"] + +**Owner / team:** [from `causely.ai/team` label, or `team_health` result, or "Not registered — check service catalog"] + +**Recommended actions:** [from `remediation` field] + +**Links:** [Causely portal links from response]