Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
7e0b465
add cf-o11y worker
iscekic Feb 5, 2026
e43c56e
feat(config): add observability service URL configuration
iscekic Feb 5, 2026
c8df6ad
feat(o11y): add API metrics ingestion endpoint with validation
iscekic Feb 5, 2026
c0b1dcc
refactor(o11y): extract JSON validation logic into reusable utility
iscekic Feb 5, 2026
45523ac
feat(o11y): add API metrics emission for model routing
iscekic Feb 5, 2026
f87984c
refactor(o11y): derive client name from secret instead of requiring it
iscekic Feb 5, 2026
9f1b080
feat(o11y): add tools tracking to API metrics
iscekic Feb 5, 2026
d899fb7
feat(o11y): track tools used in API requests
iscekic Feb 5, 2026
d849d5e
feat(o11y): add TTFB tracking to API metrics
iscekic Feb 5, 2026
078b334
feat(o11y): track complete request duration in API metrics
iscekic Feb 5, 2026
f92902a
perf(o11y): add timeout to response body draining for SSE streams
iscekic Feb 5, 2026
0c4ff19
feat(o11y): track HTTP status codes in API metrics
iscekic Feb 5, 2026
4b4be80
refactor(o11y): derive success state from status code instead of expl…
iscekic Feb 5, 2026
2417fa9
refactor(o11y): remove errorMessage field from API metrics schema
iscekic Feb 5, 2026
dc5f40b
feat(o11y): add token usage tracking to API metrics
iscekic Feb 5, 2026
a566249
feat(o11y): add user context and streaming metadata to API metrics
iscekic Feb 5, 2026
0190bca
feat(o11y): configure client secret from environment variable
iscekic Feb 5, 2026
140ace0
style(o11y): add blank lines for improved readability in validation f…
iscekic Feb 5, 2026
6a1b156
feat(o11y)!: integrate Cloudflare Secrets Store for authentication
iscekic Feb 5, 2026
5b5b565
feat(o11y): integrate PostHog analytics for API metrics tracking
iscekic Feb 5, 2026
a7351c9
test(o11y): simplify test setup by inlining execution context
iscekic Feb 5, 2026
4283efd
ci(o11y): add automated deployment workflow for observability service
iscekic Feb 5, 2026
0c6685d
feat(o11y): forward client IP address to PostHog for GeoIP resolution
iscekic Feb 5, 2026
dfcab1d
feat(o11y): implement SLO-based alerting with multi-window burn rate …
iscekic Feb 6, 2026
117c5ea
refactor(o11y): improve query syntax and rename base URL environment …
iscekic Feb 6, 2026
db86f70
feat(o11y): add custom domain route configuration
iscekic Feb 6, 2026
36d9dd0
chore(o11y): update compatibility date to 2026-02-01
iscekic Feb 6, 2026
4ef504e
refactor(o11y): include client name in alert deduplication keys
iscekic Feb 6, 2026
db8b3f2
refactor(o11y): improve alert timestamp format and error message clarity
iscekic Feb 6, 2026
7a1cbae
fix(o11y): add error handling for corrupted cache in recommended models
iscekic Feb 6, 2026
395d0fa
chore(o11y): update wrangler generated types
iscekic Feb 6, 2026
61b1b00
fix(o11y): add type guards for tool name extraction and move timing
iscekic Feb 6, 2026
f9ce09c
fix(o11y): normalize resolved model to lowercase for consistency
iscekic Feb 6, 2026
c20fdcb
fix(o11y): add timeout and error handling to slack webhook notifications
iscekic Feb 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions .github/workflows/deploy-o11y.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Deploys the project in the cloudflare-o11y directory to Cloudflare Workers
# using Wrangler. Can be started manually or invoked from another workflow.
name: Deploy O11y

on:
# Manual trigger: the environment is picked from a fixed list (dev | prod).
workflow_dispatch:
inputs:
environment:
description: 'Choose an environment to deploy to: dev or prod'
required: true
default: 'prod'
type: choice
options:
- dev
- prod
# Reusable-workflow trigger: the caller may pass the environment as a free-form
# string; when omitted it defaults to 'prod'.
workflow_call:
inputs:
environment:
description: 'Choose an environment to deploy to: dev or prod'
required: false
default: 'prod'
type: string

jobs:
deploy:
runs-on: ubuntu-latest
name: Deploy O11y

steps:
- name: Checkout code
uses: actions/checkout@v4

# NOTE(review): 'latest' is not pinned, so the pnpm version can change between runs.
- name: Setup pnpm
uses: pnpm/action-setup@v2
with:
version: latest

- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: 22

# --frozen-lockfile makes the install fail instead of updating the lockfile.
- name: Install dependencies
working-directory: cloudflare-o11y
run: pnpm install --frozen-lockfile

- name: Deploy to Cloudflare Workers
uses: cloudflare/wrangler-action@v3
with:
apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
workingDirectory: cloudflare-o11y
# 'dev' runs `deploy --env dev`; any other value runs a plain `deploy`.
command: ${{ inputs.environment == 'dev' && 'deploy --env dev' || 'deploy' }}
30 changes: 30 additions & 0 deletions .github/workflows/deploy-production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 2

- name: Check for cloud-agent changes
uses: dorny/paths-filter@v3
Expand All @@ -180,6 +182,8 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 2

- name: Check for session-ingest changes
uses: dorny/paths-filter@v3
Expand All @@ -196,3 +200,29 @@ jobs:
secrets: inherit
with:
environment: prod

check-o11y-changes:
runs-on: ubuntu-latest
outputs:
changed: ${{ steps.changes.outputs.o11y }}
steps:
- name: Checkout code
uses: actions/checkout@v4
Comment thread
iscekic marked this conversation as resolved.
with:
fetch-depth: 2

- name: Check for o11y changes
uses: dorny/paths-filter@v3
id: changes
with:
filters: |
o11y:
- 'cloudflare-o11y/**'

deploy-o11y:
needs: [check-o11y-changes]
if: needs.check-o11y-changes.outputs.changed == 'true'
uses: ./.github/workflows/deploy-o11y.yml
secrets: inherit
with:
environment: prod
12 changes: 12 additions & 0 deletions cloudflare-o11y/.editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# http://editorconfig.org
root = true

[*]
indent_style = tab
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.yml]
indent_style = space
167 changes: 167 additions & 0 deletions cloudflare-o11y/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# Logs

logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)

report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data

pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover

lib-cov

# Coverage directory used by tools like istanbul

coverage
*.lcov

# nyc test coverage

.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)

.grunt

# Bower dependency directory (https://bower.io/)

bower_components

# node-waf configuration

.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)

build/Release

# Dependency directories

node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)

web_modules/

# TypeScript cache

*.tsbuildinfo

# Optional npm cache directory

.npm

# Optional eslint cache

.eslintcache

# Optional stylelint cache

.stylelintcache

# Microbundle cache

.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history

.node_repl_history

# Output of 'npm pack'

*.tgz

# Yarn Integrity file

.yarn-integrity

# parcel-bundler cache (https://parceljs.org/)

.cache
.parcel-cache

# Next.js build output

.next
out

# Nuxt.js build / generate output

.nuxt
dist

# Gatsby files

.cache/

# Uncomment the "public" line below if your project uses Gatsby and not Next.js

# https://nextjs.org/blog/next-9-1#public-directory-support

# public

# vuepress build output

.vuepress/dist

# vuepress v2.x temp and cache directory

.temp
.cache

# Docusaurus cache and generated files

.docusaurus

# Serverless directories

.serverless/

# FuseBox cache

.fusebox/

# DynamoDB Local files

.dynamodb/

# TernJS port file

.tern-port

# Stores VSCode versions used for testing VSCode extensions

.vscode-test

# yarn v2

.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*

# wrangler project

.dev.vars*
!.dev.vars.example
.env*
!.env.example
.wrangler/
6 changes: 6 additions & 0 deletions cloudflare-o11y/.prettierrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"printWidth": 140,
"singleQuote": true,
"semi": true,
"useTabs": true
}
5 changes: 5 additions & 0 deletions cloudflare-o11y/.vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"files.associations": {
"wrangler.json": "jsonc"
}
}
24 changes: 24 additions & 0 deletions cloudflare-o11y/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"name": "cloudflare-o11y",
"version": "0.0.0",
"private": true,
"scripts": {
"deploy": "wrangler deploy",
"dev": "wrangler dev",
"start": "wrangler dev",
"test": "vitest",
"cf-typegen": "wrangler types"
},
"devDependencies": {
"@cloudflare/vitest-pool-workers": "^0.12.4",
"@types/node": "^25.1.0",
"typescript": "^5.5.2",
"vitest": "~3.2.0",
"wrangler": "^4.61.1"
},
"dependencies": {
"@hono/zod-validator": "^0.7.6",
"hono": "^4.11.7",
"zod": "^4.3.6"
}
}
62 changes: 62 additions & 0 deletions cloudflare-o11y/src/alerting/dedup.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/**
* KV-based alert deduplication.
*
* Prevents the same alert from firing repeatedly by storing a cooldown
* marker in KV with a TTL. Higher-severity alerts suppress lower-severity
* ones for the same dimension.
*/

import type { AlertSeverity } from './slo-config';
import { PAGE_COOLDOWN_SECONDS, TICKET_COOLDOWN_SECONDS } from './slo-config';

/**
 * Build the KV key under which the cooldown marker for one alert is stored.
 *
 * The key is the fixed `o11y:alert:` prefix followed by severity, alert type,
 * provider, model and client name, joined with colons.
 */
function alertKey(severity: AlertSeverity, alertType: string, provider: string, model: string, clientName: string): string {
	const dimension = provider + ':' + model + ':' + clientName;
	return 'o11y:alert:' + severity + ':' + alertType + ':' + dimension;
}

/**
 * Pick the cooldown length, in seconds, for an alert severity.
 *
 * `page` gets PAGE_COOLDOWN_SECONDS; every other severity gets
 * TICKET_COOLDOWN_SECONDS.
 */
function cooldownForSeverity(severity: AlertSeverity): number {
	if (severity === 'page') {
		return PAGE_COOLDOWN_SECONDS;
	}
	return TICKET_COOLDOWN_SECONDS;
}

/**
 * Decide whether an alert must be held back because of an active cooldown.
 *
 * The alert's own cooldown marker is looked up first. For `ticket` alerts the
 * `page` marker of the same dimension is looked up next, so that an active
 * page silences the matching ticket. Lookups run one after another and stop
 * at the first marker found.
 *
 * @returns true when a marker exists and the alert should not be sent.
 */
export async function shouldSuppress(
	kv: KVNamespace,
	severity: AlertSeverity,
	alertType: string,
	provider: string,
	model: string,
	clientName: string,
): Promise<boolean> {
	const markerKeys = [alertKey(severity, alertType, provider, model, clientName)];

	// A ticket is also silenced by a page-level marker for the same dimension.
	if (severity === 'ticket') {
		markerKeys.push(alertKey('page', alertType, provider, model, clientName));
	}

	for (const markerKey of markerKeys) {
		const marker = await kv.get(markerKey);
		if (marker) {
			return true;
		}
	}

	return false;
}

/**
 * Store a cooldown marker for an alert that has just been sent.
 *
 * The marker value is the current time as an ISO-8601 string, and the entry
 * expires after the cooldown that applies to the given severity.
 */
export async function recordAlertFired(
	kv: KVNamespace,
	severity: AlertSeverity,
	alertType: string,
	provider: string,
	model: string,
	clientName: string,
): Promise<void> {
	const markerKey = alertKey(severity, alertType, provider, model, clientName);
	const firedAt = new Date().toISOString();
	const ttlSeconds = cooldownForSeverity(severity);

	await kv.put(markerKey, firedAt, { expirationTtl: ttlSeconds });
}
Loading