Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .changeset/description-and-resolution.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
"@iqai/alert-logger": minor
---

feat: add `description` option and fix resolution noise

- Add `description` field to `AlertOptions` for separating short titles from detailed messages. When set, `description` is used as the embed body instead of the title.
- Allow `error()` and `critical()` to accept `(title, options)` without an intermediate `undefined` error param.
- Resolution notifications now only fire for sustained incidents (count > rampThreshold). One-off or sporadic alerts no longer produce "Resolved" messages.
- NestJS exception filter uses `{METHOD} {PATH}` as the alert title instead of the full error message.
79 changes: 65 additions & 14 deletions src/core/aggregator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -170,16 +170,33 @@ describe('Aggregator', () => {
})

describe('resolution detection', () => {
it('detects entries past the cooldown as resolved', () => {
aggregator.process('err-resolve')
function processNTimes(fp: string, n: number) {
for (let i = 0; i < n; i++) {
aggregator.process(fp)
}
}

it('does not resolve alerts below rampThreshold', () => {
// Process a few times (below default rampThreshold of 64)
processNTimes('err-few', 10)

vi.advanceTimersByTime(DEFAULT_AGGREGATION.resolutionCooldownMs + 1)

const resolved = aggregator.checkResolutions()
expect(resolved).toHaveLength(0)
})

it('resolves alerts that exceeded rampThreshold (sustained crisis)', () => {
// Process more than rampThreshold (default 64)
processNTimes('err-resolve', 65)

// Advance past resolution cooldown (default 2 minutes)
vi.advanceTimersByTime(DEFAULT_AGGREGATION.resolutionCooldownMs + 1)

const resolved = aggregator.checkResolutions()
expect(resolved).toHaveLength(1)
expect(resolved[0].fingerprint).toBe('err-resolve')
expect(resolved[0].count).toBe(1)
expect(resolved[0].count).toBe(65)
})

it('does not resolve entries that are still active', () => {
Expand All @@ -193,7 +210,7 @@ describe('Aggregator', () => {
})

it('does not re-resolve already resolved entries', () => {
aggregator.process('err-once')
processNTimes('err-re-resolve', 65)

vi.advanceTimersByTime(DEFAULT_AGGREGATION.resolutionCooldownMs + 1)

Expand All @@ -207,10 +224,10 @@ describe('Aggregator', () => {

it('includes peakRate and timing info in resolved entries', () => {
const startTime = Date.now()
aggregator.process('err-info')
processNTimes('err-info', 64)

vi.advanceTimersByTime(1000)
aggregator.process('err-info')
aggregator.process('err-info') // count=65, exceeds rampThreshold

vi.advanceTimersByTime(DEFAULT_AGGREGATION.resolutionCooldownMs + 1)

Expand All @@ -222,8 +239,14 @@ describe('Aggregator', () => {
})

describe('state cleanup', () => {
function processNTimes(fp: string, n: number) {
for (let i = 0; i < n; i++) {
aggregator.process(fp)
}
}

it('evicts entries after resolution + grace period (5 minutes)', () => {
aggregator.process('err-evict')
processNTimes('err-evict', 65)

// Advance past cooldown to trigger resolution
vi.advanceTimersByTime(DEFAULT_AGGREGATION.resolutionCooldownMs + 1)
Expand All @@ -239,8 +262,25 @@ describe('Aggregator', () => {
expect(result.count).toBe(1)
})

it('evicts low-count entries after cooldown + grace period', () => {
aggregator.process('err-low-evict')

// Advance past cooldown — marks as resolution (but no resolved notification)
vi.advanceTimersByTime(DEFAULT_AGGREGATION.resolutionCooldownMs + 1)
aggregator.checkResolutions()

// Advance past grace period
vi.advanceTimersByTime(5 * 60_000 + 1)
aggregator.checkResolutions()

// Should create a fresh onset
const result = aggregator.process('err-low-evict')
expect(result.phase).toBe('onset')
expect(result.count).toBe(1)
})

it('does not evict entries before grace period expires', () => {
aggregator.process('err-keep')
processNTimes('err-keep', 65)

vi.advanceTimersByTime(DEFAULT_AGGREGATION.resolutionCooldownMs + 1)
aggregator.checkResolutions()
Expand All @@ -250,17 +290,15 @@ describe('Aggregator', () => {
aggregator.checkResolutions()

// Processing should NOT create a fresh onset (state still exists in resolution)
// The entry is still in the map with phase 'resolution', but process() will
// increment the existing state
const result = aggregator.process('err-keep')
expect(result.count).toBe(2)
expect(result.count).toBe(66)
})
})

describe('startResolutionTimer', () => {
it('calls onResolved callback for resolved entries', () => {
it('calls onResolved callback for sustained alerts', () => {
const onResolved = vi.fn()
aggregator.process('err-timer')
for (let i = 0; i < 65; i++) aggregator.process('err-timer')
aggregator.startResolutionTimer(onResolved)

// Advance past cooldown + the 30s check interval
Expand All @@ -269,14 +307,27 @@ describe('Aggregator', () => {
expect(onResolved).toHaveBeenCalledWith(expect.objectContaining({ fingerprint: 'err-timer' }))
})

it('does not call onResolved for low-count entries', () => {
const onResolved = vi.fn()
aggregator.process('err-low')
aggregator.process('err-low')
aggregator.process('err-low')
aggregator.startResolutionTimer(onResolved)

// Advance past cooldown + the 30s check interval
vi.advanceTimersByTime(DEFAULT_AGGREGATION.resolutionCooldownMs + 30_000 + 1)

expect(onResolved).not.toHaveBeenCalled()
})

it('does not start multiple timers', () => {
const onResolved1 = vi.fn()
const onResolved2 = vi.fn()

aggregator.startResolutionTimer(onResolved1)
aggregator.startResolutionTimer(onResolved2)

aggregator.process('err-multi')
for (let i = 0; i < 65; i++) aggregator.process('err-multi')

vi.advanceTimersByTime(DEFAULT_AGGREGATION.resolutionCooldownMs + 30_000 + 1)

Expand Down
20 changes: 13 additions & 7 deletions src/core/aggregator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -144,14 +144,20 @@ export class Aggregator {
}

if (now - state.lastSeen >= this.config.resolutionCooldownMs) {
// Only send resolution for alerts that reached the sustained phase
// (count > rampThreshold). A few sporadic failures aren't a "crisis"
// worth announcing as resolved — resolution is for ongoing incidents
// that generated a flood of alerts and then stopped.
if (state.count > this.config.rampThreshold) {
resolved.push({
fingerprint,
count: state.count,
firstSeen: state.firstSeen,
lastSeen: state.lastSeen,
peakRate: state.peakRate,
})
}
Comment on lines +151 to +159
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This conditional logic introduces a memory leak in AlertLogger. The AlertLogger maintains an alertMeta map that is only cleaned up when handleResolution is called. By filtering out low-count resolutions here, the corresponding metadata for sporadic or one-off alerts will never be removed from memory, causing the map to grow indefinitely over time.

To fix this, checkResolutions should return all resolved fingerprints so the logger can prune its metadata, while the logger itself should decide whether to actually send a notification based on the count (e.g., by adding a sustained flag to the ResolvedEntry interface).

state.phase = 'resolution'
resolved.push({
fingerprint,
count: state.count,
firstSeen: state.firstSeen,
lastSeen: state.lastSeen,
peakRate: state.peakRate,
})
}
})

Expand Down
66 changes: 66 additions & 0 deletions src/core/alert-logger.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,72 @@ describe('AlertLogger integration', () => {
})
})

describe('description option', () => {
it('uses description as message when provided on error()', async () => {
const adapter = new MockAdapter()
const logger = AlertLogger.init({
adapters: [adapter],
serviceName: 'test-service',
environment: 'production',
})

logger.error('Market Resolution Failed', {
description: 'Market abc-123 resolution failed, state reset to NONE',
})

await vi.waitFor(() => expect(adapter.sent.length).toBe(1))
expect(adapter.sent[0].title).toBe('Market Resolution Failed')
expect(adapter.sent[0].message).toBe('Market abc-123 resolution failed, state reset to NONE')
})

it('uses description as message when provided on warn()', async () => {
const adapter = new MockAdapter()
const logger = AlertLogger.init({
adapters: [adapter],
serviceName: 'test-service',
environment: 'production',
})

logger.warn('Indexer Health Changed', {
description: 'Status: healthy → unhealthy',
})

await vi.waitFor(() => expect(adapter.sent.length).toBe(1))
expect(adapter.sent[0].title).toBe('Indexer Health Changed')
expect(adapter.sent[0].message).toBe('Status: healthy → unhealthy')
})

it('falls back to title when no description or error', async () => {
const adapter = new MockAdapter()
const logger = AlertLogger.init({
adapters: [adapter],
serviceName: 'test-service',
environment: 'production',
})

logger.error('Something broke')

await vi.waitFor(() => expect(adapter.sent.length).toBe(1))
expect(adapter.sent[0].message).toBe('Something broke')
})

it('description takes priority over error.message', async () => {
const adapter = new MockAdapter()
const logger = AlertLogger.init({
adapters: [adapter],
serviceName: 'test-service',
environment: 'production',
})

logger.error('DB Failed', new Error('ECONNREFUSED'), {
description: 'Custom description here',
})

await vi.waitFor(() => expect(adapter.sent.length).toBe(1))
expect(adapter.sent[0].message).toBe('Custom description here')
})
})

describe('singleton', () => {
it('init() sets instance and getInstance() retrieves it', () => {
const adapter = new MockAdapter()
Expand Down
20 changes: 13 additions & 7 deletions src/core/alert-logger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,22 +76,24 @@ export class AlertLogger {
}

info(title: string, options?: AlertOptions): void {
this.log('info', title, title, undefined, options)
const message = options?.description ?? title
this.log('info', title, message, undefined, options)
}

warn(title: string, options?: AlertOptions): void {
this.log('warning', title, title, undefined, options)
const message = options?.description ?? title
this.log('warning', title, message, undefined, options)
}

error(title: string, error?: Error | string, options?: AlertOptions): void {
error(title: string, error?: Error | string | AlertOptions, options?: AlertOptions): void {
const [err, opts] = this.normalizeErrorArgs(error, options)
const message = err?.message ?? title
const message = opts?.description ?? err?.message ?? title
this.log('critical', title, message, err, opts)
}

critical(title: string, error?: Error | string, options?: AlertOptions): void {
critical(title: string, error?: Error | string | AlertOptions, options?: AlertOptions): void {
const [err, opts] = this.normalizeErrorArgs(error, options)
const message = err?.message ?? title
const message = opts?.description ?? err?.message ?? title
this.log('critical', title, message, err, opts)
}

Expand Down Expand Up @@ -227,9 +229,13 @@ export class AlertLogger {
}

private normalizeErrorArgs(
error?: Error | string,
error?: Error | string | AlertOptions,
options?: AlertOptions,
): [Error | undefined, AlertOptions | undefined] {
// Allow error("title", { description: ... }) without an intermediate undefined
if (error != null && typeof error === 'object' && !(error instanceof Error)) {
return [undefined, error]
}
Comment on lines +236 to +238
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current implementation of the overload ignores the options argument if the error argument is an object. This means calls like logger.error("title", { description: "..." }, { tags: ["foo"] }) will lose the tags and other options provided in the third argument.

Suggested change
if (error != null && typeof error === 'object' && !(error instanceof Error)) {
return [undefined, error]
}
if (error != null && typeof error === 'object' && !(error instanceof Error)) {
return [undefined, { ...error, ...options }]
}

if (typeof error === 'string') {
return [new Error(error), options]
}
Expand Down
2 changes: 2 additions & 0 deletions src/core/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ export type AlertLevel = 'info' | 'warning' | 'critical'
export type AggregationPhase = 'onset' | 'ramp' | 'sustained' | 'resolution'

export interface AlertOptions {
/** Detailed message shown in the embed body/description. When omitted, the title is used. */
description?: string
fields?: Record<string, string | number | boolean>
tags?: string[]
dedupKey?: string
Expand Down
17 changes: 13 additions & 4 deletions src/integrations/nestjs/alert-logger.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,21 @@ export class AlertLoggerService {
this.logger.warn(title, this.mergeContext(options))
}

error(title: string, error?: Error | string, options?: AlertOptions): void {
this.logger.error(title, error, this.mergeContext(options))
error(title: string, error?: Error | string | AlertOptions, options?: AlertOptions): void {
// When called as error("title", { ... }), pass through directly
if (error != null && typeof error === 'object' && !(error instanceof Error)) {
this.logger.error(title, this.mergeContext(error))
} else {
this.logger.error(title, error, this.mergeContext(options))
}
Comment on lines +21 to +25
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This overload implementation ignores the options argument when error is an object. Consider merging the options to ensure no metadata (like tags or custom fields) is lost when using the two-argument style alongside global context.

Suggested change
if (error != null && typeof error === 'object' && !(error instanceof Error)) {
this.logger.error(title, this.mergeContext(error))
} else {
this.logger.error(title, error, this.mergeContext(options))
}
if (error != null && typeof error === 'object' && !(error instanceof Error)) {
this.logger.error(title, this.mergeContext({ ...error, ...options }))
} else {
this.logger.error(title, error, this.mergeContext(options))
}

}

critical(title: string, error?: Error | string, options?: AlertOptions): void {
this.logger.critical(title, error, this.mergeContext(options))
critical(title: string, error?: Error | string | AlertOptions, options?: AlertOptions): void {
if (error != null && typeof error === 'object' && !(error instanceof Error)) {
this.logger.critical(title, this.mergeContext(error))
} else {
this.logger.critical(title, error, this.mergeContext(options))
}
Comment on lines +29 to +33
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Similar to the error method, this implementation ignores the options argument when error is an object. Merging them ensures consistency and prevents data loss.

Suggested change
if (error != null && typeof error === 'object' && !(error instanceof Error)) {
this.logger.critical(title, this.mergeContext(error))
} else {
this.logger.critical(title, error, this.mergeContext(options))
}
if (error != null && typeof error === 'object' && !(error instanceof Error)) {
this.logger.critical(title, this.mergeContext({ ...error, ...options }))
} else {
this.logger.critical(title, error, this.mergeContext(options))
}

}

private mergeContext(options?: AlertOptions): AlertOptions {
Expand Down
6 changes: 4 additions & 2 deletions src/integrations/nestjs/exception.filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,20 @@ export class AlertExceptionFilter extends BaseExceptionFilter {
const path = request?.url ?? 'UNKNOWN'
const ip = request?.ip ?? 'UNKNOWN'

const title = `${method} ${path}`
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using the raw request path in the alert title can significantly degrade deduplication quality if the path contains dynamic parameters (e.g., /users/123). Since the path is already included in the fields metadata (line 34), consider using a more stable title (like the exception name or a generic route pattern) to ensure similar errors are grouped correctly by the aggregator.


if (exception instanceof HttpException) {
const statusCode = exception.getStatus()

if (statusCode >= 500) {
this.alert.error(exception.message, exception, {
this.alert.error(title, exception, {
fields: { method, path, statusCode, ip },
})
}
} else {
const error = exception instanceof Error ? exception : new Error(String(exception))

this.alert.critical(error.message, error, {
this.alert.critical(title, error, {
fields: { method, path, statusCode: 500, ip },
})
}
Expand Down
Loading