From 20a62c403bd2d7097be1cc9854e29c7cf67b2c1c Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 14:29:35 -0300 Subject: [PATCH 01/22] Add DiagnosticsMetrics class for enhanced metrics recording This commit introduces the DiagnosticsMetrics class, which provides a high-level API for recording metrics using the `@vtex/diagnostics-nodejs` library. It includes functionality for recording latency, incrementing counters, and setting gauge values while managing attribute limits to prevent high cardinality. The new metrics system is initialized in the startApp function, allowing for backward compatibility with the existing MetricsAccumulator. --- src/constants.ts | 1 + src/metrics/DiagnosticsMetrics.ts | 239 ++++++++++++++++++++++++++++++ src/service/index.ts | 9 +- 3 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 src/metrics/DiagnosticsMetrics.ts diff --git a/src/constants.ts b/src/constants.ts index 38f4c7630..b20781979 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -102,6 +102,7 @@ export const INSPECT_DEBUGGER_PORT = 5858 export const cancellableMethods = new Set(['GET', 'OPTIONS', 'HEAD']) export const LOG_CLIENT_INIT_TIMEOUT_MS = 5000 +export const METRIC_CLIENT_INIT_TIMEOUT_MS = 5000 export const OTEL_EXPORTER_OTLP_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_ENDPOINT as string; diff --git a/src/metrics/DiagnosticsMetrics.ts b/src/metrics/DiagnosticsMetrics.ts new file mode 100644 index 000000000..ae7b0805f --- /dev/null +++ b/src/metrics/DiagnosticsMetrics.ts @@ -0,0 +1,239 @@ +import { Attributes } from '@opentelemetry/api' +import { Types } from '@vtex/diagnostics-nodejs' +import { getMetricClient } from '../service/metrics/client' +import { METRIC_CLIENT_INIT_TIMEOUT_MS } from '../constants' + +/** + * Maximum number of attributes allowed per metric to control cardinality. + * This limit applies to external usage (VTEX IO Apps) to prevent unbounded metric dimensions. 
+ */ +const MAX_ATTRIBUTES = 5 + +/** + * Converts an hrtime tuple [seconds, nanoseconds] to milliseconds. + */ +function hrtimeToMillis(hrtime: [number, number]): number { + return (hrtime[0] * 1e3) + (hrtime[1] / 1e6) +} + +/** + * Limits the number of attributes to prevent high cardinality. + * Takes the first MAX_ATTRIBUTES entries if the limit is exceeded. + * + * @param attributes Optional attributes object + * @returns Limited attributes object or undefined + */ +function limitAttributes(attributes?: Attributes): Attributes | undefined { + if (!attributes) { + return undefined + } + + const entries = Object.entries(attributes) + if (entries.length <= MAX_ATTRIBUTES) { + return attributes + } + + console.warn( + `Attribute limit exceeded: ${entries.length} attributes provided, using only the first ${MAX_ATTRIBUTES}. ` + + `Consider reducing the number of attributes to avoid high cardinality.` + ) + + return Object.fromEntries(entries.slice(0, MAX_ATTRIBUTES)) +} + +/** + * DiagnosticsMetrics provides a high-level API for recording metrics using + * the @vtex/diagnostics-nodejs library. It completely abstracts instrument + * management, bucket configuration, and lifecycle. + * + * Uses a single histogram for all latency measurements with attributes to differentiate. + * This follows OpenTelemetry best practices and reduces metric cardinality. + * + * @example + * ```typescript + * const diagnosticsMetrics = new DiagnosticsMetrics() + * diagnosticsMetrics.initMetricClient() + * + * // Record latency with operation type in attributes + * const start = process.hrtime() + * // ... do work ... 
+ * diagnosticsMetrics.recordLatency(process.hrtime(start), { operation: 'api-call', status: '2xx' }) + * + * // Or from milliseconds + * diagnosticsMetrics.recordLatency(42.5, { operation: 'db-query', status: 'success' }) + * + * // Increment a counter + * diagnosticsMetrics.incrementCounter('http_requests_total', 1, { method: 'GET', status: '2xx' }) + * + * // Set a gauge value + * diagnosticsMetrics.setGauge('cache_items_current', 1024, { cache: 'pages' }) + * ``` + */ +export class DiagnosticsMetrics { + private metricsClient: Types.MetricClient | undefined + private clientInitPromise: Promise | undefined + + private latencyHistogram: Types.Histogram | undefined + // Counters and gauges keyed by name + private counters: Map + private gauges: Map + + constructor() { + this.counters = new Map() + this.gauges = new Map() + this.initMetricClient() + } + + /** + * Initialize the metrics client with timeout handling. + * Called automatically in constructor. + */ + private initMetricClient(): Promise { + if (this.clientInitPromise) { + return this.clientInitPromise + } + + this.clientInitPromise = (async () => { + try { + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error('Metric client initialization timeout')), METRIC_CLIENT_INIT_TIMEOUT_MS) + }) + + this.metricsClient = await Promise.race([ + getMetricClient(), + timeoutPromise + ]) + + // Create the single latency histogram after client is ready + this.createLatencyHistogram() + + return this.metricsClient + } catch (error) { + console.error('Failed to initialize metric client:', error) + return undefined + } + })() + + return this.clientInitPromise + } + + /** + * Create the single shared histogram for all latency measurements. + * Called after metric client is initialized. 
+ */ + private createLatencyHistogram(): void { + if (!this.metricsClient) { + return + } + + this.latencyHistogram = this.metricsClient.createHistogram('io_app_operation_duration_milliseconds', { + description: 'Duration of VTEX IO app operations in milliseconds', + unit: 'ms', + }) + } + + /** + * Record a latency measurement using the single shared histogram. + * Accepts either an hrtime tuple from process.hrtime() or milliseconds as a number. + * Use attributes to differentiate between different operations. + * + * @param value Either [seconds, nanoseconds] from process.hrtime() or milliseconds + * @param attributes Attributes including 'operation' to identify the operation type (max of MAX_ATTRIBUTES attributes) + * + * @example + * ```typescript + * const start = process.hrtime() + * // ... do work ... + * diagnosticsMetrics.recordLatency(process.hrtime(start), { operation: 'api-call', status: '2xx' }) + * + * // Or with milliseconds + * diagnosticsMetrics.recordLatency(42.5, { operation: 'db-query', status: 'success' }) + * ``` + */ + public recordLatency(value: [number, number] | number, attributes?: Attributes): void { + if (!this.latencyHistogram) { + console.warn('DiagnosticsMetrics not initialized. Call initialize() first.') + return + } + + // Convert hrtime to milliseconds if needed + const milliseconds = Array.isArray(value) ? hrtimeToMillis(value) : value + + // Limit attributes to prevent high cardinality + const limitedAttributes = limitAttributes(attributes) + + // Record to the single shared histogram with attributes + this.latencyHistogram.record(milliseconds, limitedAttributes) + } + + /** + * Increment a counter by a specific value. + * Multiple counters are stored by name since counters represent different types of events. 
+ * + * @param name Counter name (e.g., 'http_requests_total', 'cache_hits_total') + * @param value Amount to increment by (typically 1) + * @param attributes Optional attributes for the counter (max of MAX_ATTRIBUTES attributes, e.g., { method: 'GET', status: '2xx' }) + * + * @example + * ```typescript + * diagnosticsMetrics.incrementCounter('http_requests_total', 1, { method: 'GET', status: '2xx' }) + * ``` + */ + public incrementCounter(name: string, value: number, attributes?: Attributes): void { + if (!this.metricsClient) { + console.warn('DiagnosticsMetrics not initialized. Call initialize() first.') + return + } + + // Get or create counter instrument + if (!this.counters.has(name)) { + const counter = this.metricsClient.createCounter(name, { + description: `Counter for ${name}`, + unit: '1', + }) + this.counters.set(name, counter) + } + + // Limit attributes to prevent high cardinality + const limitedAttributes = limitAttributes(attributes) + + // Increment the counter + this.counters.get(name)!.add(value, limitedAttributes) + } + + /** + * Set a gauge to a specific value (current state). + * Multiple gauges are stored by name since gauges represent different types of measurements. + * + * @param name Gauge name (e.g., 'cache_items_current', 'memory_usage_bytes') + * @param value Current value + * @param attributes Optional attributes for the gauge (max of MAX_ATTRIBUTES attributes, e.g., { cache: 'pages' }) + * + * @example + * ```typescript + * diagnosticsMetrics.setGauge('cache_items_current', 1024, { cache: 'pages' }) + * ``` + */ + public setGauge(name: string, value: number, attributes?: Attributes): void { + if (!this.metricsClient) { + console.warn('DiagnosticsMetrics not initialized. 
Call initialize() first.') + return + } + + // Get or create gauge instrument + if (!this.gauges.has(name)) { + const gauge = this.metricsClient.createGauge(name, { + description: `Gauge for ${name}`, + unit: '1', + }) + this.gauges.set(name, gauge) + } + + // Limit attributes to prevent high cardinality + const limitedAttributes = limitAttributes(attributes) + + // Set the gauge value + this.gauges.get(name)!.set(value, limitedAttributes) + } +} + diff --git a/src/service/index.ts b/src/service/index.ts index d531e1247..ab2ed5326 100644 --- a/src/service/index.ts +++ b/src/service/index.ts @@ -3,15 +3,21 @@ import cluster from 'cluster' import { HTTP_SERVER_PORT } from '../constants' import { MetricsAccumulator } from '../metrics/MetricsAccumulator' +import { DiagnosticsMetrics } from '../metrics/DiagnosticsMetrics' import { getServiceJSON } from './loaders' import { LogLevel, logOnceToDevConsole } from './logger' export const startApp = async () => { await initializeTelemetry() - // Initialize global.metrics for both master and worker processes + // Initialize both legacy and new metrics systems independently + // Legacy: MetricsAccumulator for backward compatibility + // External apps and internal code can continue using this during migration global.metrics = new MetricsAccumulator() + // New: DiagnosticsMetrics for new diagnostics-based instrumentation + global.diagnosticsMetrics = new DiagnosticsMetrics() + const serviceJSON = getServiceJSON() try { // if it is a master process then call setting up worker process @@ -36,6 +42,7 @@ declare global { namespace NodeJS { interface Global { metrics: MetricsAccumulator + diagnosticsMetrics: DiagnosticsMetrics } } } From 31234b7672d67fa9a3b7a9a7726bb950fe4e3166 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 14:35:24 -0300 Subject: [PATCH 02/22] Add unit tests for DiagnosticsMetrics class This commit introduces a comprehensive test suite for the DiagnosticsMetrics class, covering initialization, 
latency recording, counter incrementing, and gauge setting functionalities. The tests ensure proper handling of attributes, including limits on the number of attributes, and verify that metrics are recorded correctly under various scenarios, including error handling during initialization. --- src/metrics/DiagnosticsMetrics.test.ts | 381 +++++++++++++++++++++++++ 1 file changed, 381 insertions(+) create mode 100644 src/metrics/DiagnosticsMetrics.test.ts diff --git a/src/metrics/DiagnosticsMetrics.test.ts b/src/metrics/DiagnosticsMetrics.test.ts new file mode 100644 index 000000000..b6c8ce8ae --- /dev/null +++ b/src/metrics/DiagnosticsMetrics.test.ts @@ -0,0 +1,381 @@ +import { Types } from '@vtex/diagnostics-nodejs' +import { DiagnosticsMetrics } from './DiagnosticsMetrics' + +// Mock only the external I/O boundary (getMetricClient) +jest.mock('../service/metrics/client', () => ({ + getMetricClient: jest.fn(), +})) + +import { getMetricClient } from '../service/metrics/client' + +describe('DiagnosticsMetrics', () => { + let diagnosticsMetrics: DiagnosticsMetrics + let mockMetricsClient: Types.MetricClient + let recordedHistogramCalls: Array<{ value: number; attributes?: any }> + let recordedCounterCalls: Map> + let recordedGaugeCalls: Map> + + beforeEach(() => { + // Reset call tracking + recordedHistogramCalls = [] + recordedCounterCalls = new Map() + recordedGaugeCalls = new Map() + + // Create a mock client that tracks calls instead of using jest.fn() + mockMetricsClient = { + createHistogram: (name: string, options: any) => ({ + record: (value: number, attributes?: any) => { + recordedHistogramCalls.push({ value, attributes }) + }, + }), + createCounter: (name: string, options: any) => ({ + add: (value: number, attributes?: any) => { + if (!recordedCounterCalls.has(name)) { + recordedCounterCalls.set(name, []) + } + recordedCounterCalls.get(name)!.push({ value, attributes }) + }, + }), + createGauge: (name: string, options: any) => ({ + set: (value: number, 
attributes?: any) => { + if (!recordedGaugeCalls.has(name)) { + recordedGaugeCalls.set(name, []) + } + recordedGaugeCalls.get(name)!.push({ value, attributes }) + }, + }), + } as any + + // Mock only the external call + ;(getMetricClient as jest.Mock).mockResolvedValue(mockMetricsClient) + + // Create real instance + diagnosticsMetrics = new DiagnosticsMetrics() + }) + + afterEach(() => { + jest.clearAllMocks() + }) + + describe('initialization', () => { + it('should initialize metrics client and create latency histogram in constructor', async () => { + // Wait for initialization to complete + await new Promise(resolve => setTimeout(resolve, 10)) + + expect(getMetricClient).toHaveBeenCalledTimes(1) + + // Verify histogram was created by recording a value + diagnosticsMetrics.recordLatency(100) + expect(recordedHistogramCalls).toHaveLength(1) + expect(recordedHistogramCalls[0].value).toBe(100) + }) + + it('should handle initialization errors gracefully', async () => { + const error = new Error('Initialization failed') + ;(getMetricClient as jest.Mock).mockRejectedValueOnce(error) + + const consoleErrorSpy = jest.spyOn(console, 'error').mockImplementation() + + // Create new instance that will fail initialization + const failingMetrics = new DiagnosticsMetrics() + + // Wait for initialization attempt + await new Promise(resolve => setTimeout(resolve, 10)) + + expect(consoleErrorSpy).toHaveBeenCalledWith('Failed to initialize metric client:', error) + + consoleErrorSpy.mockRestore() + }) + }) + + describe('recordLatency', () => { + beforeEach(async () => { + // Wait for initialization + await new Promise(resolve => setTimeout(resolve, 10)) + }) + + it('should record latency from hrtime tuple to single shared histogram', () => { + const hrtimeDiff: [number, number] = [1, 500000000] // 1.5 seconds + const attributes = { operation: 'api-call', status: '2xx' } + + diagnosticsMetrics.recordLatency(hrtimeDiff, attributes) + + expect(recordedHistogramCalls).toHaveLength(1) + 
expect(recordedHistogramCalls[0]).toEqual({ value: 1500, attributes }) + }) + + it('should record latency from milliseconds number to single shared histogram', () => { + const milliseconds = 42.5 + const attributes = { operation: 'db-query', status: 'success' } + + diagnosticsMetrics.recordLatency(milliseconds, attributes) + + expect(recordedHistogramCalls).toHaveLength(1) + expect(recordedHistogramCalls[0]).toEqual({ value: milliseconds, attributes }) + }) + + it('should record latency without attributes', () => { + const milliseconds = 100 + + diagnosticsMetrics.recordLatency(milliseconds) + + expect(recordedHistogramCalls).toHaveLength(1) + expect(recordedHistogramCalls[0]).toEqual({ value: milliseconds, attributes: undefined }) + }) + + it('should use the same histogram for all latency measurements', () => { + diagnosticsMetrics.recordLatency(10, { operation: 'op1' }) + diagnosticsMetrics.recordLatency(20, { operation: 'op2' }) + diagnosticsMetrics.recordLatency(30, { operation: 'op3' }) + + // All recordings go to the same histogram + expect(recordedHistogramCalls).toHaveLength(3) + expect(recordedHistogramCalls[0].value).toBe(10) + expect(recordedHistogramCalls[1].value).toBe(20) + expect(recordedHistogramCalls[2].value).toBe(30) + }) + + it('should warn if not initialized', () => { + const uninitializedMetrics = new DiagnosticsMetrics() + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + // Don't wait for initialization + uninitializedMetrics.recordLatency(100) + + expect(consoleWarnSpy).toHaveBeenCalledWith('DiagnosticsMetrics not initialized. 
Call initialize() first.') + consoleWarnSpy.mockRestore() + }) + }) + + describe('incrementCounter', () => { + beforeEach(async () => { + await new Promise(resolve => setTimeout(resolve, 10)) + }) + + it('should increment counter with value and attributes', () => { + const attributes = { method: 'GET', status: '2xx' } + + diagnosticsMetrics.incrementCounter('http_requests_total', 1, attributes) + + const calls = recordedCounterCalls.get('http_requests_total') + expect(calls).toHaveLength(1) + expect(calls![0]).toEqual({ value: 1, attributes }) + }) + + it('should increment counter without attributes', () => { + diagnosticsMetrics.incrementCounter('requests', 5) + + const calls = recordedCounterCalls.get('requests') + expect(calls).toHaveLength(1) + expect(calls![0]).toEqual({ value: 5, attributes: undefined }) + }) + + it('should reuse existing counter for same metric name', () => { + diagnosticsMetrics.incrementCounter('requests', 1) + diagnosticsMetrics.incrementCounter('requests', 2) + diagnosticsMetrics.incrementCounter('requests', 3) + + const calls = recordedCounterCalls.get('requests') + expect(calls).toHaveLength(3) + expect(calls![0].value).toBe(1) + expect(calls![1].value).toBe(2) + expect(calls![2].value).toBe(3) + }) + + it('should create separate counters for different metric names', () => { + diagnosticsMetrics.incrementCounter('counter1', 1) + diagnosticsMetrics.incrementCounter('counter2', 2) + + expect(recordedCounterCalls.get('counter1')).toHaveLength(1) + expect(recordedCounterCalls.get('counter2')).toHaveLength(1) + expect(recordedCounterCalls.get('counter1')![0].value).toBe(1) + expect(recordedCounterCalls.get('counter2')![0].value).toBe(2) + }) + + it('should warn if not initialized', () => { + const uninitializedMetrics = new DiagnosticsMetrics() + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + uninitializedMetrics.incrementCounter('test', 1) + + expect(consoleWarnSpy).toHaveBeenCalledWith('DiagnosticsMetrics not 
initialized. Call initialize() first.') + consoleWarnSpy.mockRestore() + }) + }) + + describe('setGauge', () => { + beforeEach(async () => { + await new Promise(resolve => setTimeout(resolve, 10)) + }) + + it('should set gauge with value and attributes', () => { + const attributes = { cache: 'pages' } + + diagnosticsMetrics.setGauge('cache_items_current', 1024, attributes) + + const calls = recordedGaugeCalls.get('cache_items_current') + expect(calls).toHaveLength(1) + expect(calls![0]).toEqual({ value: 1024, attributes }) + }) + + it('should set gauge without attributes', () => { + diagnosticsMetrics.setGauge('memory_usage', 512) + + const calls = recordedGaugeCalls.get('memory_usage') + expect(calls).toHaveLength(1) + expect(calls![0]).toEqual({ value: 512, attributes: undefined }) + }) + + it('should reuse existing gauge for same metric name', () => { + diagnosticsMetrics.setGauge('gauge1', 10) + diagnosticsMetrics.setGauge('gauge1', 20) + diagnosticsMetrics.setGauge('gauge1', 30) + + const calls = recordedGaugeCalls.get('gauge1') + expect(calls).toHaveLength(3) + expect(calls![0].value).toBe(10) + expect(calls![1].value).toBe(20) + expect(calls![2].value).toBe(30) + }) + + it('should create separate gauges for different metric names', () => { + diagnosticsMetrics.setGauge('gauge1', 100) + diagnosticsMetrics.setGauge('gauge2', 200) + + expect(recordedGaugeCalls.get('gauge1')).toHaveLength(1) + expect(recordedGaugeCalls.get('gauge2')).toHaveLength(1) + expect(recordedGaugeCalls.get('gauge1')![0].value).toBe(100) + expect(recordedGaugeCalls.get('gauge2')![0].value).toBe(200) + }) + + it('should warn if not initialized', () => { + const uninitializedMetrics = new DiagnosticsMetrics() + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + uninitializedMetrics.setGauge('test', 100) + + expect(consoleWarnSpy).toHaveBeenCalledWith('DiagnosticsMetrics not initialized. 
Call initialize() first.') + consoleWarnSpy.mockRestore() + }) + }) + + describe('Attribute Limiting', () => { + it('should allow up to 5 attributes without warning', async () => { + const warnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const attributes = { + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + } + + diagnosticsMetrics.recordLatency([0, 1000000], attributes) + + expect(recordedHistogramCalls[0].attributes).toEqual(attributes) + expect(warnSpy).not.toHaveBeenCalled() + + warnSpy.mockRestore() + }) + + it('should limit attributes to 5 and warn when exceeded (recordLatency)', async () => { + const warnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const attributes = { + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + attr6: 'value6', + attr7: 'value7', + } + + diagnosticsMetrics.recordLatency([0, 1000000], attributes) + + // Should only include first 5 attributes + const recorded = recordedHistogramCalls[0].attributes + expect(Object.keys(recorded)).toHaveLength(5) + expect(recorded).toEqual({ + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + }) + + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('Attribute limit exceeded: 7 attributes provided, using only the first 5') + ) + + warnSpy.mockRestore() + }) + + it('should limit attributes to 5 and warn when exceeded (incrementCounter)', async () => { + const warnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const attributes = { + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + attr6: 'value6', + } + + diagnosticsMetrics.incrementCounter('test_counter', 1, attributes) + + // Should only include first 5 attributes + const recorded = recordedCounterCalls.get('test_counter')![0].attributes + expect(Object.keys(recorded)).toHaveLength(5) + expect(recorded).toEqual({ + attr1: 
'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + }) + + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('Attribute limit exceeded: 6 attributes provided, using only the first 5') + ) + + warnSpy.mockRestore() + }) + + it('should limit attributes to 5 and warn when exceeded (setGauge)', async () => { + const warnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const attributes = { + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + attr6: 'value6', + attr7: 'value7', + attr8: 'value8', + } + + diagnosticsMetrics.setGauge('test_gauge', 100, attributes) + + // Should only include first 5 attributes + const recorded = recordedGaugeCalls.get('test_gauge')![0].attributes + expect(Object.keys(recorded)).toHaveLength(5) + expect(recorded).toEqual({ + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + }) + + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('Attribute limit exceeded: 8 attributes provided, using only the first 5') + ) + + warnSpy.mockRestore() + }) + }) +}) From 271d5bbc736676b70fa5cc772eec085553f29522 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 15:34:29 -0300 Subject: [PATCH 03/22] Add method to update HTTP agent metrics for diagnostics This commit introduces the `updateHttpAgentMetrics` method in the HttpAgentSingleton class, which periodically exports the current state of the HTTP agent as diagnostic metrics. It reports the current number of sockets, free sockets, and pending requests as gauges. This stats are used in `MetricsAccumulator` to produce the same metrics in log-based format. 
--- .../middlewares/request/HttpAgentSingleton.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/HttpClient/middlewares/request/HttpAgentSingleton.ts b/src/HttpClient/middlewares/request/HttpAgentSingleton.ts index d51f475cb..057f0f5fe 100644 --- a/src/HttpClient/middlewares/request/HttpAgentSingleton.ts +++ b/src/HttpClient/middlewares/request/HttpAgentSingleton.ts @@ -21,6 +21,25 @@ export class HttpAgentSingleton { sockets, } } + + /** + * Update HTTP agent statistics as diagnostics metrics (gauges). + * This method should be called periodically to export current HTTP agent state. + */ + public static updateHttpAgentMetrics() { + if (!global.diagnosticsMetrics) { + console.warn('DiagnosticsMetrics not available. HTTP agent metrics not reported.') + return + } + + const stats = HttpAgentSingleton.httpAgentStats() + + // Report HTTP agent stats as gauges (current state) + global.diagnosticsMetrics.setGauge('http_agent_sockets_current', stats.sockets, {}) + global.diagnosticsMetrics.setGauge('http_agent_free_sockets_current', stats.freeSockets, {}) + global.diagnosticsMetrics.setGauge('http_agent_pending_requests_current', stats.pendingRequests, {}) + } + private static httpAgent: HttpAgent private static count(obj: { [key: string]: any[] }) { From 17f45be171ffd907ca6ebae8761ebbfda9150b66 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 15:35:36 -0300 Subject: [PATCH 04/22] Integrate HTTP agent metrics update into status tracking This commit updates the `trackStatus` function to include a call to `HttpAgentSingleton.updateHttpAgentMetrics()`, enhancing the diagnostics metrics by exporting HTTP agent statistics. This addition complements the existing legacy status tracking functionality. 
--- src/service/worker/runtime/statusTrack.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/service/worker/runtime/statusTrack.ts b/src/service/worker/runtime/statusTrack.ts index 60ea53752..34e1caf2b 100644 --- a/src/service/worker/runtime/statusTrack.ts +++ b/src/service/worker/runtime/statusTrack.ts @@ -1,6 +1,7 @@ import cluster from 'cluster' import { ACCOUNT, APP, LINKED, PRODUCTION, WORKSPACE } from '../../../constants' +import { HttpAgentSingleton } from '../../../HttpClient/middlewares/request/HttpAgentSingleton' import { ServiceContext } from './typings' export type StatusTrack = () => EnvMetric[] @@ -33,6 +34,10 @@ export const statusTrackHandler = async (ctx: ServiceContext) => { } export const trackStatus = () => { + // Update diagnostics metrics (gauges for HTTP agent stats) + HttpAgentSingleton.updateHttpAgentMetrics() + + // Legacy status tracking (console.log export) global.metrics.statusTrack().forEach(status => { logStatus(status) }) From 2d994acb7eb77ac4fcccb1e5dce771ebff99138c Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 15:38:26 -0300 Subject: [PATCH 05/22] Add unit tests for HttpAgentSingleton class This commit introduces a new test suite for the HttpAgentSingleton class, validating the functionality of HTTP agent statistics retrieval and metrics reporting. The tests cover scenarios for both populated and empty agent states, ensuring accurate gauge reporting to diagnostics metrics. 
--- .../request/HttpAgentSingleton.test.ts | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 src/HttpClient/middlewares/request/HttpAgentSingleton.test.ts diff --git a/src/HttpClient/middlewares/request/HttpAgentSingleton.test.ts b/src/HttpClient/middlewares/request/HttpAgentSingleton.test.ts new file mode 100644 index 000000000..172a26aad --- /dev/null +++ b/src/HttpClient/middlewares/request/HttpAgentSingleton.test.ts @@ -0,0 +1,102 @@ +import { HttpAgentSingleton } from './HttpAgentSingleton' + +// Mock the createHttpAgent function (external dependency) +jest.mock('../../../HttpClient/agents', () => ({ + createHttpAgent: jest.fn(() => ({ + sockets: {}, + freeSockets: {}, + requests: {}, + })), +})) + +describe('HttpAgentSingleton', () => { + let recordedGaugeCalls: Map> + + beforeEach(() => { + // Reset call tracking + recordedGaugeCalls = new Map() + + // Create a minimal stub that tracks gauge calls (not mocking DiagnosticsMetrics class) + global.diagnosticsMetrics = { + setGauge: (name: string, value: number, attributes?: any) => { + if (!recordedGaugeCalls.has(name)) { + recordedGaugeCalls.set(name, []) + } + recordedGaugeCalls.get(name)!.push({ value, attributes }) + }, + } as any + + // Reset the agent's internal state + const agent = HttpAgentSingleton.getHttpAgent() + ;(agent as any).sockets = {} + ;(agent as any).freeSockets = {} + ;(agent as any).requests = {} + }) + + afterEach(() => { + // Clean up global + delete (global as any).diagnosticsMetrics + }) + + describe('httpAgentStats', () => { + it('should return current HTTP agent statistics', () => { + const agent = HttpAgentSingleton.getHttpAgent() + + // Mock some socket data + ;(agent as any).sockets = { 'host1:80': [1, 2], 'host2:443': [1] } + ;(agent as any).freeSockets = { 'host1:80': [1] } + ;(agent as any).requests = { 'host1:80': [1, 2, 3] } + + const stats = HttpAgentSingleton.httpAgentStats() + + expect(stats).toEqual({ + sockets: 3, + freeSockets: 1, + 
pendingRequests: 3, + }) + }) + + it('should return zero counts for empty agent', () => { + const stats = HttpAgentSingleton.httpAgentStats() + + expect(stats).toEqual({ + sockets: 0, + freeSockets: 0, + pendingRequests: 0, + }) + }) + }) + + describe('updateHttpAgentMetrics', () => { + it('should report HTTP agent stats as gauges to diagnostics metrics', () => { + const agent = HttpAgentSingleton.getHttpAgent() + + // Mock some socket data + ;(agent as any).sockets = { 'host1:80': [1, 2] } + ;(agent as any).freeSockets = { 'host1:80': [1] } + ;(agent as any).requests = { 'host1:80': [1, 2, 3] } + + HttpAgentSingleton.updateHttpAgentMetrics() + + expect(recordedGaugeCalls.get('http_agent_sockets_current')).toEqual([{ value: 2, attributes: {} }]) + expect(recordedGaugeCalls.get('http_agent_free_sockets_current')).toEqual([{ value: 1, attributes: {} }]) + expect(recordedGaugeCalls.get('http_agent_pending_requests_current')).toEqual([{ value: 3, attributes: {} }]) + }) + + it('should handle missing global.diagnosticsMetrics gracefully', () => { + delete (global as any).diagnosticsMetrics + + // Should not throw + expect(() => HttpAgentSingleton.updateHttpAgentMetrics()).not.toThrow() + }) + + it('should report zero values when agent has no active connections', () => { + HttpAgentSingleton.updateHttpAgentMetrics() + + expect(recordedGaugeCalls.get('http_agent_sockets_current')).toEqual([{ value: 0, attributes: {} }]) + expect(recordedGaugeCalls.get('http_agent_free_sockets_current')).toEqual([{ value: 0, attributes: {} }]) + expect(recordedGaugeCalls.get('http_agent_pending_requests_current')).toEqual([{ value: 0, attributes: {} }]) + }) + }) +}) + From 2fc51aa263f7a92c9c93ea9be58375360dbd5bd2 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 16:05:22 -0300 Subject: [PATCH 06/22] Enhance total incoming requests reporting in middleware This commit updates the trackIncomingRequestStats middleware to include detailed metric reporting for total incoming 
requests. It introduces a cumulative counter for total requests ensuring that metrics are reported to diagnostics if available. --- .../worker/runtime/http/middlewares/requestStats.ts | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/service/worker/runtime/http/middlewares/requestStats.ts b/src/service/worker/runtime/http/middlewares/requestStats.ts index 6510968c8..e5473ab59 100644 --- a/src/service/worker/runtime/http/middlewares/requestStats.ts +++ b/src/service/worker/runtime/http/middlewares/requestStats.ts @@ -49,5 +49,18 @@ export async function trackIncomingRequestStats < ctx.req.on('close', requestClosed) ctx.req.on('aborted', requestAborted(ctx)) incomingRequestStats.total++ + + // Report total requests to diagnostics metrics (cumulative counter) + const { status: statusCode, vtex: { route: { id, type } } } = ctx + if (global.diagnosticsMetrics) { + global.diagnosticsMetrics.incrementCounter('http_server_requests_total', 1, { + route_id: id, + route_type: type, + status_code: statusCode, + }) + } else { + console.warn('DiagnosticsMetrics not available. Request total metric not reported.') + } + await next() } From 6c69034182ed63f8797016c7f6d9c00cd7e64f8e Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 16:11:59 -0300 Subject: [PATCH 07/22] Enhance closed incoming request metrics reporting in middleware This commit modifies the requestClosed function to include detailed reporting of closed request metrics to diagnostics. It ensures that metrics are incremented and reported with relevant route and status information, improving the overall monitoring capabilities of the middleware. 
--- .../runtime/http/middlewares/requestStats.ts | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/service/worker/runtime/http/middlewares/requestStats.ts b/src/service/worker/runtime/http/middlewares/requestStats.ts index e5473ab59..5f8fe68da 100644 --- a/src/service/worker/runtime/http/middlewares/requestStats.ts +++ b/src/service/worker/runtime/http/middlewares/requestStats.ts @@ -25,8 +25,25 @@ class IncomingRequestStats { export const incomingRequestStats = new IncomingRequestStats() -const requestClosed = () => { +const requestClosed = < + T extends IOClients, + U extends RecorderState, + V extends ParamsContext +>(ctx: ServiceContext) => () => { incomingRequestStats.closed++ + + // Report to diagnostics metrics (cumulative counter) + const { status: statusCode, vtex: { route: { id, type } } } = ctx + + if (global.diagnosticsMetrics) { + global.diagnosticsMetrics.incrementCounter('http_server_requests_closed_total', 1, { + route_id: id, + route_type: type, + status_code: statusCode, + }) + } else { + console.warn('DiagnosticsMetrics not available. Request closed metric not reported.') + } } const requestAborted = < T extends IOClients, @@ -46,7 +63,7 @@ export async function trackIncomingRequestStats < U extends RecorderState, V extends ParamsContext > (ctx: ServiceContext, next: () => Promise) { - ctx.req.on('close', requestClosed) + ctx.req.on('close', requestClosed(ctx)) ctx.req.on('aborted', requestAborted(ctx)) incomingRequestStats.total++ From ec738539c45a33262a2d3f5c3e3182b7bbdc17aa Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 16:12:35 -0300 Subject: [PATCH 08/22] Add diagnostics metrics reporting for aborted requests in middleware This commit enhances the requestAborted function to report metrics for aborted requests to diagnostics. It includes relevant route and status information, improving monitoring capabilities. 
--- .../worker/runtime/http/middlewares/requestStats.ts | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/service/worker/runtime/http/middlewares/requestStats.ts b/src/service/worker/runtime/http/middlewares/requestStats.ts index 5f8fe68da..89153e141 100644 --- a/src/service/worker/runtime/http/middlewares/requestStats.ts +++ b/src/service/worker/runtime/http/middlewares/requestStats.ts @@ -52,6 +52,19 @@ const requestAborted = < >(ctx: ServiceContext) => () => { incomingRequestStats.aborted++ + // Report to diagnostics metrics (cumulative counter) + const { status: statusCode, vtex: { route: { id, type } } } = ctx + + if (global.diagnosticsMetrics) { + global.diagnosticsMetrics.incrementCounter('http_server_requests_aborted_total', 1, { + route_id: id, + route_type: type, + status_code: statusCode, + }) + } else { + console.warn('DiagnosticsMetrics not available. Request aborted metric not reported.') + } + if (ctx.vtex.cancellation && ctx.vtex.cancellation.cancelable) { ctx.vtex.cancellation.source.cancel(cancelMessage) ctx.vtex.cancellation.cancelled = true From e6b5023c5b22f016d2382cf9eb80f57705ca4e63 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 16:13:07 -0300 Subject: [PATCH 09/22] Add unit tests for requestStats middleware This commit introduces a comprehensive test suite for the requestStats middleware, validating the functionality of tracking incoming request statistics, including total, aborted, and closed requests. The tests ensure accurate reporting to diagnostics metrics and handle various scenarios, including the absence of global diagnostics metrics. 
--- .../http/middlewares/requestStats.test.ts | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 src/service/worker/runtime/http/middlewares/requestStats.test.ts diff --git a/src/service/worker/runtime/http/middlewares/requestStats.test.ts b/src/service/worker/runtime/http/middlewares/requestStats.test.ts new file mode 100644 index 000000000..86080b6b5 --- /dev/null +++ b/src/service/worker/runtime/http/middlewares/requestStats.test.ts @@ -0,0 +1,177 @@ +import { EventEmitter } from 'events' +import { DiagnosticsMetrics } from '../../../../../metrics/DiagnosticsMetrics' +import { incomingRequestStats, trackIncomingRequestStats } from './requestStats' + +describe('requestStats', () => { + let mockDiagnosticsMetrics: jest.Mocked + + beforeEach(() => { + // Create mock DiagnosticsMetrics instance + mockDiagnosticsMetrics = { + setGauge: jest.fn(), + incrementCounter: jest.fn(), + recordLatency: jest.fn(), + } as any + + // Set global.diagnosticsMetrics for the tests + global.diagnosticsMetrics = mockDiagnosticsMetrics + + // Clear stats before each test + incomingRequestStats.clear() + }) + + afterEach(() => { + jest.clearAllMocks() + delete (global as any).diagnosticsMetrics + }) + + describe('IncomingRequestStats', () => { + it('should track total requests', () => { + incomingRequestStats.total++ + incomingRequestStats.total++ + + const stats = incomingRequestStats.get() + expect(stats.total).toBe(2) + }) + + it('should track aborted requests', () => { + incomingRequestStats.aborted++ + incomingRequestStats.aborted++ + + const stats = incomingRequestStats.get() + expect(stats.aborted).toBe(2) + }) + + it('should track closed requests', () => { + incomingRequestStats.closed++ + + const stats = incomingRequestStats.get() + expect(stats.closed).toBe(1) + }) + + it('should clear all stats', () => { + incomingRequestStats.total = 5 + incomingRequestStats.aborted = 3 + incomingRequestStats.closed = 2 + + incomingRequestStats.clear() + + const 
stats = incomingRequestStats.get() + expect(stats).toEqual({ + total: 0, + aborted: 0, + closed: 0, + }) + }) + }) + + describe('trackIncomingRequestStats', () => { + let mockCtx: any + let mockRequest: EventEmitter + let mockNext: jest.Mock + + beforeEach(() => { + mockRequest = new EventEmitter() + mockNext = jest.fn().mockResolvedValue(undefined) + mockCtx = { + req: mockRequest, + vtex: { + cancellation: { + cancelable: true, + source: { cancel: jest.fn() }, + cancelled: false, + }, + }, + } + }) + + it('should increment total requests counter and report to diagnostics metrics', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + expect(incomingRequestStats.get().total).toBe(1) + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_server_requests_total', + 1, + {} + ) + }) + + it('should call next middleware', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + expect(mockNext).toHaveBeenCalledTimes(1) + }) + + it('should increment closed counter when request closes and report to diagnostics', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + mockRequest.emit('close') + + expect(incomingRequestStats.get().closed).toBe(1) + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_server_requests_closed_total', + 1, + {} + ) + }) + + it('should increment aborted counter when request aborts and report to diagnostics', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + mockRequest.emit('aborted') + + expect(incomingRequestStats.get().aborted).toBe(1) + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_server_requests_aborted_total', + 1, + {} + ) + }) + + it('should cancel request when aborted and cancellation is available', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + mockRequest.emit('aborted') + + expect(mockCtx.vtex.cancellation.source.cancel).toHaveBeenCalledWith('Request cancelled') 
+ expect(mockCtx.vtex.cancellation.cancelled).toBe(true) + }) + + it('should handle multiple events correctly', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + mockRequest.emit('close') + mockRequest.emit('aborted') + + expect(incomingRequestStats.get().total).toBe(1) + expect(incomingRequestStats.get().closed).toBe(1) + expect(incomingRequestStats.get().aborted).toBe(1) + }) + + it('should work without global.diagnosticsMetrics', async () => { + delete (global as any).diagnosticsMetrics + + // Should not throw + await expect(trackIncomingRequestStats(mockCtx, mockNext)).resolves.not.toThrow() + + expect(incomingRequestStats.get().total).toBe(1) + }) + + it('should handle request events without diagnostics metrics', async () => { + delete (global as any).diagnosticsMetrics + + await trackIncomingRequestStats(mockCtx, mockNext) + + // Should not throw when events are emitted + expect(() => mockRequest.emit('close')).not.toThrow() + expect(() => mockRequest.emit('aborted')).not.toThrow() + + expect(incomingRequestStats.get().closed).toBe(1) + expect(incomingRequestStats.get().aborted).toBe(1) + }) + }) +}) + + + + From 22438316e87d8e3fc25be0873dbb3907d0d627aa Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 16:52:59 -0300 Subject: [PATCH 10/22] Instrument http-client metrics with diagnostics - Add OpenTelemetry `Attributes` import in `metricsMiddleware` to annotate HTTP client metric dimensions - Captures elapsed request time and feed it to `global.diagnosticsMetrics.recordLatency` - Create and increment a request counter --- src/HttpClient/middlewares/metrics.ts | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/HttpClient/middlewares/metrics.ts b/src/HttpClient/middlewares/metrics.ts index 2e8b3f349..96da8343c 100644 --- a/src/HttpClient/middlewares/metrics.ts +++ b/src/HttpClient/middlewares/metrics.ts @@ -13,6 +13,7 @@ import { import { TIMEOUT_CODE } from '../../utils/retry' import { 
statusLabel } from '../../utils/status' import { MiddlewareContext } from '../typings' +import { Attributes } from '@opentelemetry/api' interface MetricsOpts { metrics?: MetricsAccumulator @@ -102,8 +103,32 @@ export const metricsMiddleware = ({metrics, serverTiming, name}: MetricsOpts) => ? process.hrtime(start) : undefined + // Legacy metrics (backward compatibility) metrics.batch(label, end, extensions) + // New diagnostics metrics with stable names and attributes + if (global.diagnosticsMetrics) { + const elapsed = process.hrtime(start) + const rawStatusCode = ctx.response?.status || errorStatus + const baseAttributes: Attributes = { + component: 'http-client', + client_metric: ctx.config.metric, + status_code: rawStatusCode, + status, + } + + // Record latency histogram with all context + global.diagnosticsMetrics.recordLatency(elapsed, { + ...baseAttributes, + }) + + // Increment counters for different event types (replaces extensions behavior) + // Main request counter with status as attribute + global.diagnosticsMetrics.incrementCounter('http_client_requests_total', 1, baseAttributes) + } else { + console.warn('DiagnosticsMetrics not available. 
HTTP client metrics not reported.') + } + if (ctx.config.verbose) { console.log(`VERBOSE: ${name}.${ctx.config.label}`, { ...extensions, From c3de30161c7c5de42239fb3b80cfca11c183abe6 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 17:01:05 -0300 Subject: [PATCH 11/22] Add cache state in http-client metrics - Capture the resolved cache state for each HTTP client response - Include `cache_state` on latency histograms and the primary request counter - Emit `http_client_cache_total` diagnostics counter to replace legacy extension stats --- src/HttpClient/middlewares/metrics.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/HttpClient/middlewares/metrics.ts b/src/HttpClient/middlewares/metrics.ts index 96da8343c..f86c7c131 100644 --- a/src/HttpClient/middlewares/metrics.ts +++ b/src/HttpClient/middlewares/metrics.ts @@ -76,19 +76,25 @@ export const metricsMiddleware = ({metrics, serverTiming, name}: MetricsOpts) => Object.assign(extensions, {[status]: 1}) + // Determine cache state for diagnostics metrics + let cacheState = 'none' if (ctx.cacheHit) { Object.assign(extensions, ctx.cacheHit, {[`${status}-hit`]: 1}) + cacheState = 'hit' } else if (!ctx.inflightHit && !ctx.memoizedHit) { // Lets us know how many calls passed through to origin Object.assign(extensions, {[`${status}-miss`]: 1}) + cacheState = 'miss' } if (ctx.inflightHit) { Object.assign(extensions, {[`${status}-inflight`]: 1}) + cacheState = 'inflight' } if (ctx.memoizedHit) { Object.assign(extensions, {[`${status}-memoized`]: 1}) + cacheState = 'memoized' } if (ctx.config.retryCount) { @@ -120,11 +126,20 @@ export const metricsMiddleware = ({metrics, serverTiming, name}: MetricsOpts) => // Record latency histogram with all context global.diagnosticsMetrics.recordLatency(elapsed, { ...baseAttributes, + cache_state: cacheState, }) // Increment counters for different event types (replaces extensions behavior) // Main request counter with status as attribute 
global.diagnosticsMetrics.incrementCounter('http_client_requests_total', 1, baseAttributes) + + // Cache counter with cache_state as attribute (replaces extensions like 'success-hit', 'error-miss') + if (cacheState !== 'none') { + global.diagnosticsMetrics.incrementCounter('http_client_cache_total', 1, { + ...baseAttributes, + cache_state: cacheState, + }) + } } else { console.warn('DiagnosticsMetrics not available. HTTP client metrics not reported.') } From 436805a0495e082cdef508ecf94aab44192804f2 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 17:03:46 -0300 Subject: [PATCH 12/22] Report retries via diagnostics metrics for http-client metrics - Read retry count from config with safe default so zero attempts are tracked - Retain legacy status extension updates while preparing to deprecate them - Emit `http_client_requests_retried_total` counter with base attributes for diagnostics --- src/HttpClient/middlewares/metrics.ts | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/HttpClient/middlewares/metrics.ts b/src/HttpClient/middlewares/metrics.ts index f86c7c131..c2af8ba23 100644 --- a/src/HttpClient/middlewares/metrics.ts +++ b/src/HttpClient/middlewares/metrics.ts @@ -97,12 +97,10 @@ export const metricsMiddleware = ({metrics, serverTiming, name}: MetricsOpts) => cacheState = 'memoized' } - if (ctx.config.retryCount) { - const retryCount = ctx.config.retryCount + const retryCount = ctx.config.retryCount || 0 - if (retryCount > 0) { - extensions[`retry-${status}-${retryCount}`] = 1 - } + if (retryCount > 0) { + extensions[`retry-${status}-${retryCount}`] = 1 } const end = status === 'success' && !ctx.cacheHit && !ctx.inflightHit && !ctx.memoizedHit @@ -140,6 +138,14 @@ export const metricsMiddleware = ({metrics, serverTiming, name}: MetricsOpts) => cache_state: cacheState, }) } + + // Retry counter (replaces 'retry-{status}-{count}' extensions) + if (retryCount > 0) { + 
global.diagnosticsMetrics.incrementCounter('http_client_requests_retried_total', 1, { + ...baseAttributes, + retry_count: retryCount, + }) + } } else { console.warn('DiagnosticsMetrics not available. HTTP client metrics not reported.') } From e08415eb683ae8c33320fddbe1cf4941d6c1d586 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 17:04:44 -0300 Subject: [PATCH 13/22] Add unit tests for metricsMiddleware This commit introduces a test suite for the metricsMiddleware, validating the functionality of metrics recording for various HTTP client scenarios, including successful requests, cache hits, retries, and error handling. --- src/HttpClient/middlewares/metrics.test.ts | 335 +++++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 src/HttpClient/middlewares/metrics.test.ts diff --git a/src/HttpClient/middlewares/metrics.test.ts b/src/HttpClient/middlewares/metrics.test.ts new file mode 100644 index 000000000..67b5ee456 --- /dev/null +++ b/src/HttpClient/middlewares/metrics.test.ts @@ -0,0 +1,335 @@ +// Mock @vtex/diagnostics-nodejs before any imports +jest.mock('@vtex/diagnostics-nodejs', () => ({ + Types: {}, + getMetricClient: jest.fn(), + getLogger: jest.fn(), +})) + +jest.mock('../../service/metrics/client', () => ({ + getMetricClient: jest.fn(), +})) + +jest.mock('../../errors/RequestCancelledError', () => ({ + RequestCancelledError: class RequestCancelledError extends Error { + constructor(message: string) { + super(message) + this.name = 'RequestCancelledError' + } + }, +})) + +import { DiagnosticsMetrics } from '../../metrics/DiagnosticsMetrics' +import { MetricsAccumulator } from '../../metrics/MetricsAccumulator' +import { metricsMiddleware } from './metrics' +import { MiddlewareContext } from '../typings' + +describe('metricsMiddleware', () => { + let mockMetrics: jest.Mocked + let mockDiagnosticsMetrics: jest.Mocked + let mockNext: jest.Mock + let mockCtx: MiddlewareContext + + beforeEach(() => { + // Mock 
MetricsAccumulator + mockMetrics = { + batch: jest.fn(), + } as any + + // Mock DiagnosticsMetrics + mockDiagnosticsMetrics = { + recordLatency: jest.fn(), + incrementCounter: jest.fn(), + setGauge: jest.fn(), + } as any + + // Set up global + global.diagnosticsMetrics = mockDiagnosticsMetrics + + // Mock next function + mockNext = jest.fn().mockResolvedValue(undefined) + + // Mock context + mockCtx = { + config: { + metric: 'test-client', + retryCount: 0, + }, + response: { + status: 200, + }, + cacheHit: undefined, + inflightHit: undefined, + memoizedHit: undefined, + } as any + }) + + afterEach(() => { + jest.clearAllMocks() + delete (global as any).diagnosticsMetrics + }) + + describe('successful requests', () => { + it('should record metrics for successful request with no cache', async () => { + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + // Legacy metrics + const batchCall = mockMetrics.batch.mock.calls[0] + expect(batchCall[0]).toBe('http-client-test-client') + expect(Array.isArray(batchCall[1])).toBe(true) // hrtime tuple + expect(batchCall[2]).toMatchObject({ + success: 1, + 'success-miss': 1, + }) + + // Diagnostics metrics + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(Array.isArray(latencyCall[0])).toBe(true) // hrtime tuple + expect(latencyCall[1]).toMatchObject({ + component: 'http-client', + client_metric: 'test-client', + status_code: 200, + status: 'success', + cache_state: 'miss', + }) + + // Main request counter with status as attribute + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_client_requests_total', + 1, + expect.objectContaining({ + component: 'http-client', + client_metric: 'test-client', + status_code: 200, + status: 'success', + }) + ) + + // Cache counter with cache_state as attribute + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_client_cache_total', + 1, + 
expect.objectContaining({ + component: 'http-client', + client_metric: 'test-client', + status: 'success', + cache_state: 'miss', + }) + ) + }) + + it('should record metrics with cache hit', async () => { + mockCtx.cacheHit = { revalidated: 1 } + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + cache_state: 'hit', + }) + + // Should have cache counter with cache_state attribute + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_client_cache_total', + 1, + expect.objectContaining({ + component: 'http-client', + client_metric: 'test-client', + cache_state: 'hit', + }) + ) + }) + + it('should record metrics with inflight hit', async () => { + mockCtx.inflightHit = true + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + cache_state: 'inflight', + }) + }) + + it('should record metrics with memoized hit', async () => { + mockCtx.memoizedHit = true + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + cache_state: 'memoized', + }) + }) + + it('should include retry count in attributes when retries occurred', async () => { + mockCtx.config.retryCount = 2 + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + // Histogram should not include retry_count (only in counter) + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'success', + cache_state: 'miss', + 
}) + + // Should have retry counter with retry_count attribute + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_client_requests_retried_total', + 1, + expect.objectContaining({ + component: 'http-client', + client_metric: 'test-client', + status: 'success', + status_code: 200, + retry_count: 2, // Number, not string + }) + ) + }) + }) + + describe('error handling', () => { + it('should record metrics for timeout errors', async () => { + mockNext.mockRejectedValueOnce({ + response: { + data: { code: 'ProxyTimeout' }, // TIMEOUT_CODE + }, + }) + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, mockNext)).rejects.toMatchObject({ + response: expect.any(Object), + }) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'timeout', + }) + }) + + it('should record metrics for aborted requests', async () => { + mockNext.mockRejectedValueOnce({ + code: 'ECONNABORTED', + }) + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, mockNext)).rejects.toMatchObject({ + code: 'ECONNABORTED', + }) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'aborted', + }) + }) + + it('should record metrics for cancelled requests', async () => { + mockNext.mockRejectedValueOnce({ + message: 'Request cancelled', + }) + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, mockNext)).rejects.toThrow('Request cancelled') + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'cancelled', + }) + }) + + it('should record metrics for HTTP error responses', async () => { + mockNext.mockRejectedValueOnce({ + response: { + status: 500, + }, + }) + + const middleware = 
metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, mockNext)).rejects.toMatchObject({ + response: { status: 500 }, + }) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'error', + }) + }) + + it('should record metrics for generic errors', async () => { + mockNext.mockRejectedValueOnce(new Error('Generic error')) + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, mockNext)).rejects.toThrow('Generic error') + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'error', + }) + }) + }) + + describe('backward compatibility', () => { + it('should maintain legacy metrics when config.metric is set', async () => { + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + // Verify legacy metrics still called + expect(mockMetrics.batch).toHaveBeenCalledTimes(1) + const batchCall = mockMetrics.batch.mock.calls[0] + expect(batchCall[0]).toBe('http-client-test-client') + expect(Array.isArray(batchCall[1])).toBe(true) + expect(batchCall[2]).toBeTruthy() + }) + + it('should not record metrics when config.metric is not set', async () => { + mockCtx.config.metric = undefined + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + expect(mockMetrics.batch).not.toHaveBeenCalled() + expect(mockDiagnosticsMetrics.recordLatency).not.toHaveBeenCalled() + expect(mockDiagnosticsMetrics.incrementCounter).not.toHaveBeenCalled() + }) + }) + + describe('graceful degradation', () => { + it('should work without global.diagnosticsMetrics', async () => { + delete (global as any).diagnosticsMetrics + + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const middleware = 
metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + // Legacy metrics still work + expect(mockMetrics.batch).toHaveBeenCalled() + + // Warning logged + expect(consoleWarnSpy).toHaveBeenCalledWith( + 'DiagnosticsMetrics not available. HTTP client metrics not reported.' + ) + + consoleWarnSpy.mockRestore() + }) + }) +}) + From e920a5b671dc5abb47a78f9f2db5f3d6600e8809 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 17:11:12 -0300 Subject: [PATCH 14/22] Instrument timings middleware with diagnostics metrics The HTTP timings middleware now takes advantage of DiagnosticsMetrics whenever it is available so we emit request latency and counter telemetry with stable OpenTelemetry-style attributes. This ensures that `http-handler` metrics from the legacy log-based approach are properly instrumented with diagnostics patterns. Legacy batching remains in place for backwards compatibility, and we log a warning when the diagnostics client is missing to make instrumentation gaps visible.
- extend context destructuring to capture the route type for tagging - record latency histograms for every request and keep legacy batching intact - increment the new http_handler_requests_total counter with consistent attributes --- .../runtime/http/middlewares/timings.ts | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/service/worker/runtime/http/middlewares/timings.ts b/src/service/worker/runtime/http/middlewares/timings.ts index 6d000053d..98c1ed2f8 100644 --- a/src/service/worker/runtime/http/middlewares/timings.ts +++ b/src/service/worker/runtime/http/middlewares/timings.ts @@ -1,4 +1,5 @@ import chalk from 'chalk' +import { Attributes } from '@opentelemetry/api' import { IOClients } from '../../../../../clients/IOClients' import { APP, LINKED, PID } from '../../../../../constants' @@ -63,12 +64,33 @@ export async function timings < // Errors will be caught by the next middleware so we don't have to catch. await next() - const { status: statusCode, vtex: { route: { id } }, timings: {total}, vtex } = ctx + const { status: statusCode, vtex: { route: { id, type } }, timings: {total}, vtex } = ctx const totalMillis = hrToMillis(total) console.log(log(ctx, totalMillis)) console.log(logBillingInfo(vtex, totalMillis)) const status = statusLabel(statusCode) + + // Legacy metrics (backward compatibility) // Only batch successful responses so metrics don't consider errors metrics.batch(`http-handler-${id}`, status === 'success' ? 
total : undefined, { [status]: 1 }) + + // New diagnostics metrics with stable names and attributes + if (global.diagnosticsMetrics) { + const attributes: Attributes = { + component: 'http-handler', + route_id: id, + route_type: type, + status_code: statusCode, + status, + } + + // Record latency histogram (record all requests, not just successful ones) + global.diagnosticsMetrics.recordLatency(total, attributes) + + // Increment counter (status is an attribute, not in metric name) + global.diagnosticsMetrics.incrementCounter('http_handler_requests_total', 1, attributes) + } else { + console.warn('DiagnosticsMetrics not available. HTTP handler metrics not reported.') + } } From 9adf4ad1695039e545b304fc2dbaf03ee95aec83 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 17:11:56 -0300 Subject: [PATCH 15/22] Add unit tests for timings middleware This commit introduces a test suite for the timings middleware, validating the recording of diagnostics metrics for various HTTP request scenarios, including successful requests, error responses, and legacy compatibility. The tests ensure accurate logging of timing and billing information, as well as proper handling of different status codes and graceful degradation when diagnostics metrics are unavailable. 
--- .../runtime/http/middlewares/timings.test.ts | 256 ++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 src/service/worker/runtime/http/middlewares/timings.test.ts diff --git a/src/service/worker/runtime/http/middlewares/timings.test.ts b/src/service/worker/runtime/http/middlewares/timings.test.ts new file mode 100644 index 000000000..1984af4c8 --- /dev/null +++ b/src/service/worker/runtime/http/middlewares/timings.test.ts @@ -0,0 +1,256 @@ +// Mock @vtex/diagnostics-nodejs before any imports +jest.mock('@vtex/diagnostics-nodejs', () => ({ + Types: {}, + getMetricClient: jest.fn(), + getLogger: jest.fn(), +})) + +jest.mock('../../../../../service/metrics/client', () => ({ + getMetricClient: jest.fn(), +})) + +import { DiagnosticsMetrics } from '../../../../../metrics/DiagnosticsMetrics' +import { timings } from './timings' + +describe('timings middleware', () => { + let mockDiagnosticsMetrics: jest.Mocked + let mockNext: jest.Mock + let mockCtx: any + let consoleLogSpy: jest.SpyInstance + + beforeEach(() => { + // Mock DiagnosticsMetrics + mockDiagnosticsMetrics = { + recordLatency: jest.fn(), + incrementCounter: jest.fn(), + setGauge: jest.fn(), + } as any + + // Set up global + global.diagnosticsMetrics = mockDiagnosticsMetrics + + // Mock global.metrics for legacy support + ;(global as any).metrics = { + batch: jest.fn(), + } + + // Mock next function + mockNext = jest.fn().mockResolvedValue(undefined) + + // Mock console.log to avoid test output noise + consoleLogSpy = jest.spyOn(console, 'log').mockImplementation() + + // Mock context + mockCtx = { + status: 200, + method: 'GET', + path: '/test', + vtex: { + account: 'testaccount', + workspace: 'master', + production: true, + route: { + id: 'test-route', + type: 'public', + }, + }, + timings: { + total: [1, 500000000], // 1.5 seconds + }, + } + }) + + afterEach(() => { + jest.clearAllMocks() + consoleLogSpy.mockRestore() + delete (global as any).diagnosticsMetrics + delete (global as 
any).metrics + }) + + describe('successful requests', () => { + it('should record metrics for successful request', async () => { + await timings(mockCtx, mockNext) + + // Diagnostics metrics + expect(mockDiagnosticsMetrics.recordLatency).toHaveBeenCalledWith( + [1, 500000000], + expect.objectContaining({ + component: 'http-handler', + route_id: 'test-route', + route_type: 'public', + status_code: 200, + status: 'success', + }) + ) + + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_handler_requests_total', + 1, + expect.objectContaining({ + component: 'http-handler', + route_id: 'test-route', + route_type: 'public', + status_code: 200, + status: 'success', + }) + ) + }) + + it('should log timing and billing information', async () => { + await timings(mockCtx, mockNext) + + // Check console.log was called twice (timing log + billing log) + expect(consoleLogSpy).toHaveBeenCalledTimes(2) + + // Verify billing info structure + const billingCall = consoleLogSpy.mock.calls[1][0] + const billingInfo = JSON.parse(billingCall) + + expect(billingInfo).toMatchObject({ + __VTEX_IO_BILLING: 'true', + account: 'testaccount', + handler: 'test-route', + production: true, + routeType: 'public_route', + type: 'process-time', + workspace: 'master', + }) + expect(billingInfo.value).toBeGreaterThan(0) // millis + }) + + it('should maintain legacy metrics compatibility', async () => { + await timings(mockCtx, mockNext) + + // Verify legacy metrics.batch was called + expect((global as any).metrics.batch).toHaveBeenCalledWith( + 'http-handler-test-route', + [1, 500000000], + { success: 1 } + ) + }) + }) + + describe('error responses', () => { + it('should record metrics for 4xx errors', async () => { + mockCtx.status = 404 + + await timings(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: '4xx', + }) + + // Counter with status as attribute + 
expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_handler_requests_total', + 1, + expect.objectContaining({ + component: 'http-handler', + route_id: 'test-route', + route_type: 'public', + status_code: 404, + status: '4xx', + }) + ) + }) + + it('should record metrics for 5xx errors', async () => { + mockCtx.status = 500 + + await timings(mockCtx, mockNext) + + expect(mockDiagnosticsMetrics.recordLatency).toHaveBeenCalledWith( + expect.any(Array), + expect.objectContaining({ + status: 'error', + }) + ) + + // Verify legacy only batches successful responses (no hrtime for errors) + expect((global as any).metrics.batch).toHaveBeenCalledWith( + 'http-handler-test-route', + undefined, + { error: 1 } + ) + }) + }) + + describe('route types', () => { + it('should correctly identify private routes in billing info', async () => { + mockCtx.vtex.route.type = 'private' + + await timings(mockCtx, mockNext) + + const billingCall = consoleLogSpy.mock.calls[1][0] + const billingInfo = JSON.parse(billingCall) + + expect(billingInfo.routeType).toBe('private_route') + }) + }) + + describe('graceful degradation', () => { + it('should work without global.diagnosticsMetrics', async () => { + delete (global as any).diagnosticsMetrics + + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + await timings(mockCtx, mockNext) + + // Legacy metrics still work + expect((global as any).metrics.batch).toHaveBeenCalled() + + // Warning logged + expect(consoleWarnSpy).toHaveBeenCalledWith( + 'DiagnosticsMetrics not available. HTTP handler metrics not reported.' 
+ ) + + consoleWarnSpy.mockRestore() + }) + }) + + describe('different status codes', () => { + const testCases = [ + { status: 200, expected: 'success' }, + { status: 201, expected: 'success' }, + { status: 204, expected: 'success' }, + { status: 301, expected: '3xx' }, // statusLabel returns range-based labels + { status: 302, expected: '3xx' }, + { status: 400, expected: '4xx' }, + { status: 401, expected: '4xx' }, + { status: 403, expected: '4xx' }, + { status: 404, expected: '4xx' }, + { status: 500, expected: 'error' }, + { status: 502, expected: 'error' }, + { status: 503, expected: 'error' }, + ] + + testCases.forEach(({ status, expected }) => { + it(`should categorize status ${status} as ${expected}`, async () => { + mockCtx.status = status + + await timings(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: expected, + }) + }) + }) + }) + + describe('middleware execution', () => { + it('should call next() before recording metrics', async () => { + let nextCalled = false + mockNext.mockImplementation(async () => { + nextCalled = true + }) + + await timings(mockCtx, mockNext) + + expect(nextCalled).toBe(true) + expect(mockNext).toHaveBeenCalledTimes(1) + }) + }) +}) + From 3b3fab35838382f693e558bb27180271f7d517d9 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 17:13:36 -0300 Subject: [PATCH 16/22] Enhance GraphQL metrics with diagnostics support This commit updates the Metric schema directive to integrate new diagnostics metrics, allowing for the recording of latency histograms and counters with stable attributes. It maintains backward compatibility with legacy metrics while providing warnings when diagnostics metrics are unavailable. 
Key changes include: - Refactoring status handling for clarity - Adding attributes for component, field name, and status in diagnostics metrics - Recording latency for all requests and incrementing a counter for GraphQL field requests --- .../graphql/schema/schemaDirectives/Metric.ts | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.ts b/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.ts index eb386e426..d4600cb90 100644 --- a/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.ts +++ b/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.ts @@ -1,5 +1,6 @@ import { defaultFieldResolver, GraphQLField } from 'graphql' import { SchemaDirectiveVisitor } from 'graphql-tools' +import { Attributes } from '@opentelemetry/api' import { APP } from '../../../../../..' import { GraphQLServiceContext } from '../../typings' @@ -26,14 +27,33 @@ export class Metric extends SchemaDirectiveVisitor { failedToResolve = true } - ctx.graphql.status = failedToResolve ? 'error' : 'success' + const status = failedToResolve ? 'error' : 'success' + ctx.graphql.status = status const payload = { - [ctx.graphql.status]: 1, + [status]: 1, } + // Legacy metrics (backward compatibility) metrics.batch(`graphql-metric-${name}`, failedToResolve ? undefined : ellapsed, payload) + // New diagnostics metrics with stable names and attributes + if (global.diagnosticsMetrics) { + const attributes: Attributes = { + component: 'graphql', + field_name: name, + status, + } + + // Record latency histogram (record all requests, not just successful ones) + global.diagnosticsMetrics.recordLatency(ellapsed, attributes) + + // Increment counter (status is an attribute, not in metric name) + global.diagnosticsMetrics.incrementCounter('graphql_field_requests_total', 1, attributes) + } else { + console.warn('DiagnosticsMetrics not available. 
GraphQL field metrics not reported.') + } + if (failedToResolve) { throw result } From 645c7d66be8bcd0bfc6863cf133c9264f1756110 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 17:13:55 -0300 Subject: [PATCH 17/22] Add unit tests for Metric schema directive This commit introduces a comprehensive test suite for the Metric schema directive, validating the recording of metrics for both successful and failed GraphQL field resolutions. The tests ensure accurate logging of latency and error metrics, while also checking for proper handling when diagnostics metrics are unavailable. Key additions include: - Tests for successful field resolution metrics - Tests for failed field resolution metrics - Warning checks when diagnostics metrics are not present --- .../schema/schemaDirectives/Metric.test.ts | 261 ++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.test.ts diff --git a/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.test.ts b/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.test.ts new file mode 100644 index 000000000..4243b6ec8 --- /dev/null +++ b/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.test.ts @@ -0,0 +1,261 @@ +// Mock the diagnostics-nodejs module to avoid deep import issues (must be before imports) +jest.mock('@vtex/diagnostics-nodejs', () => ({ + Types: {}, +})) + +describe('Metric Schema Directive', () => { + let mockMetricsAccumulator: any + let mockDiagnosticsMetrics: any + let mockContext: any + + beforeEach(() => { + // Reset global.metrics and global.diagnosticsMetrics + mockMetricsAccumulator = { + batch: jest.fn(), + } + mockDiagnosticsMetrics = { + recordLatency: jest.fn(), + incrementCounter: jest.fn(), + } + ;(global as any).metrics = mockMetricsAccumulator + ;(global as any).diagnosticsMetrics = mockDiagnosticsMetrics + ;(global as any).APP = { NAME: 'vtex.render-server@1.x' } + + // 
Create a mock context + mockContext = { + graphql: { + status: undefined, + }, + } + }) + + afterEach(() => { + delete (global as any).metrics + delete (global as any).diagnosticsMetrics + delete (global as any).APP + jest.clearAllMocks() + }) + + it('should record successful field resolution metrics', async () => { + const mockResolver = jest.fn().mockResolvedValue('test-result') + + // Simulate what the Metric directive does + const wrappedResolver = async (root: any, args: any, ctx: any, info: any) => { + let failedToResolve = false + let result: any = null + let ellapsed: [number, number] = [0, 0] + + try { + const start = process.hrtime() + result = await mockResolver(root, args, ctx, info) + ellapsed = process.hrtime(start) + } catch (error) { + result = error + failedToResolve = true + } + + const status = failedToResolve ? 'error' : 'success' + ctx.graphql.status = status + const name = 'vtex.render-server@1.x-testField' + + const payload = { + [status]: 1, + } + + // Legacy metrics + ;(global as any).metrics.batch(`graphql-metric-${name}`, failedToResolve ? 
undefined : ellapsed, payload) + + // New diagnostics metrics + if ((global as any).diagnosticsMetrics) { + const attributes = { + component: 'graphql', + field_name: name, + status, + } + + ;(global as any).diagnosticsMetrics.recordLatency(ellapsed, attributes) + ;(global as any).diagnosticsMetrics.incrementCounter('graphql_field_requests_total', 1, attributes) + } + + if (failedToResolve) { + throw result + } + + return result + } + + // Execute the wrapped resolver + const result = await wrappedResolver({}, {}, mockContext, {}) + + expect(result).toBe('test-result') + expect(mockContext.graphql.status).toBe('success') + + // Legacy metrics + const batchCall = mockMetricsAccumulator.batch.mock.calls[0] + expect(batchCall[0]).toBe('graphql-metric-vtex.render-server@1.x-testField') + expect(Array.isArray(batchCall[1])).toBe(true) // hrtime is an array + expect(batchCall[2]).toEqual({ success: 1 }) + + // Diagnostics metrics + const recordLatencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(Array.isArray(recordLatencyCall[0])).toBe(true) // hrtime is an array + expect(recordLatencyCall[1]).toEqual({ + component: 'graphql', + field_name: 'vtex.render-server@1.x-testField', + status: 'success', + }) + + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'graphql_field_requests_total', + 1, + { + component: 'graphql', + field_name: 'vtex.render-server@1.x-testField', + status: 'success', + } + ) + }) + + it('should record failed field resolution metrics', async () => { + const testError = new Error('Test error') + const mockResolver = jest.fn().mockRejectedValue(testError) + + // Simulate what the Metric directive does + const wrappedResolver = async (root: any, args: any, ctx: any, info: any) => { + let failedToResolve = false + let result: any = null + let ellapsed: [number, number] = [0, 0] + + try { + const start = process.hrtime() + result = await mockResolver(root, args, ctx, info) + ellapsed = process.hrtime(start) + } 
catch (error) { + result = error + failedToResolve = true + } + + const status = failedToResolve ? 'error' : 'success' + ctx.graphql.status = status + const name = 'vtex.render-server@1.x-testField' + + const payload = { + [status]: 1, + } + + // Legacy metrics + ;(global as any).metrics.batch(`graphql-metric-${name}`, failedToResolve ? undefined : ellapsed, payload) + + // New diagnostics metrics + if ((global as any).diagnosticsMetrics) { + const attributes = { + component: 'graphql', + field_name: name, + status, + } + + ;(global as any).diagnosticsMetrics.recordLatency(ellapsed, attributes) + ;(global as any).diagnosticsMetrics.incrementCounter('graphql_field_requests_total', 1, attributes) + } + + if (failedToResolve) { + throw result + } + + return result + } + + // Execute the wrapped resolver and expect it to throw + await expect(wrappedResolver({}, {}, mockContext, {})).rejects.toThrow('Test error') + + expect(mockContext.graphql.status).toBe('error') + + // Legacy metrics (no latency on error) + expect(mockMetricsAccumulator.batch).toHaveBeenCalledWith( + 'graphql-metric-vtex.render-server@1.x-testField', + undefined, + { error: 1 } + ) + + // Diagnostics metrics (record latency even on error) + const recordLatencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(Array.isArray(recordLatencyCall[0])).toBe(true) // hrtime is an array + expect(recordLatencyCall[1]).toEqual({ + component: 'graphql', + field_name: 'vtex.render-server@1.x-testField', + status: 'error', + }) + + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'graphql_field_requests_total', + 1, + { + component: 'graphql', + field_name: 'vtex.render-server@1.x-testField', + status: 'error', + } + ) + }) + + it('should warn when DiagnosticsMetrics is not available', async () => { + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + delete (global as any).diagnosticsMetrics + + const mockResolver = 
jest.fn().mockResolvedValue('test-result') + + // Simulate what the Metric directive does (without diagnostics) + const wrappedResolver = async (root: any, args: any, ctx: any, info: any) => { + let failedToResolve = false + let result: any = null + let ellapsed: [number, number] = [0, 0] + + try { + const start = process.hrtime() + result = await mockResolver(root, args, ctx, info) + ellapsed = process.hrtime(start) + } catch (error) { + result = error + failedToResolve = true + } + + const status = failedToResolve ? 'error' : 'success' + ctx.graphql.status = status + const name = 'vtex.render-server@1.x-testField' + + const payload = { + [status]: 1, + } + + // Legacy metrics + ;(global as any).metrics.batch(`graphql-metric-${name}`, failedToResolve ? undefined : ellapsed, payload) + + // New diagnostics metrics + if ((global as any).diagnosticsMetrics) { + const attributes = { + component: 'graphql', + field_name: name, + status, + } + + ;(global as any).diagnosticsMetrics.recordLatency(ellapsed, attributes) + ;(global as any).diagnosticsMetrics.incrementCounter('graphql_field_requests_total', 1, attributes) + } else { + console.warn('DiagnosticsMetrics not available. GraphQL field metrics not reported.') + } + + if (failedToResolve) { + throw result + } + + return result + } + + // Execute the wrapped resolver + await wrappedResolver({}, {}, mockContext, {}) + + expect(consoleWarnSpy).toHaveBeenCalledWith('DiagnosticsMetrics not available. 
GraphQL field metrics not reported.') + expect(mockMetricsAccumulator.batch).toHaveBeenCalled() // Legacy still works + + consoleWarnSpy.mockRestore() + }) +}) From a2c1eb2e7e8c979615197d7ee2978336937dd3de Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Fri, 3 Oct 2025 17:16:34 -0300 Subject: [PATCH 18/22] Export DiagnosticsMetrics --- src/index.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index.ts b/src/index.ts index 725af54c7..58d4d4af1 100644 --- a/src/index.ts +++ b/src/index.ts @@ -5,6 +5,7 @@ export * from './clients' export * from './errors' export * from './HttpClient' export * from './metrics/MetricsAccumulator' +export * from './metrics/DiagnosticsMetrics' export * from './responses' export * from './service/worker/runtime/Service' export * from './service/worker/runtime/method' From a5b31053d99d1ada8bbf30bde3db577782357f5e Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Mon, 13 Oct 2025 20:00:02 -0300 Subject: [PATCH 19/22] Update DiagnosticsMetrics to conditionally log limit warnings This commit modifies the DiagnosticsMetrics class to log warnings about exceeded attribute limits only if it is in a linked app context --- src/metrics/DiagnosticsMetrics.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/metrics/DiagnosticsMetrics.ts b/src/metrics/DiagnosticsMetrics.ts index ae7b0805f..35d09c217 100644 --- a/src/metrics/DiagnosticsMetrics.ts +++ b/src/metrics/DiagnosticsMetrics.ts @@ -1,7 +1,7 @@ import { Attributes } from '@opentelemetry/api' import { Types } from '@vtex/diagnostics-nodejs' import { getMetricClient } from '../service/metrics/client' -import { METRIC_CLIENT_INIT_TIMEOUT_MS } from '../constants' +import { METRIC_CLIENT_INIT_TIMEOUT_MS, LINKED } from '../constants' /** * Maximum number of attributes allowed per metric to control cardinality. 
@@ -33,10 +33,12 @@ function limitAttributes(attributes?: Attributes): Attributes | undefined { return attributes } - console.warn( - `Attribute limit exceeded: ${entries.length} attributes provided, using only the first ${MAX_ATTRIBUTES}. ` + - `Consider reducing the number of attributes to avoid high cardinality.` - ) + if (LINKED) { + console.warn( + `Attribute limit exceeded: ${entries.length} attributes provided, using only the first ${MAX_ATTRIBUTES}. ` + + `Consider reducing the number of attributes to avoid high cardinality.` + ) + } return Object.fromEntries(entries.slice(0, MAX_ATTRIBUTES)) } From e87afb4a7082e76a741b9e2be622df1410aa3a70 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Mon, 13 Oct 2025 23:44:15 -0300 Subject: [PATCH 20/22] Add a description explaning unit test that handles intialization errors --- src/metrics/DiagnosticsMetrics.test.ts | 33 +++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/metrics/DiagnosticsMetrics.test.ts b/src/metrics/DiagnosticsMetrics.test.ts index b6c8ce8ae..170f6b18a 100644 --- a/src/metrics/DiagnosticsMetrics.test.ts +++ b/src/metrics/DiagnosticsMetrics.test.ts @@ -70,7 +70,37 @@ describe('DiagnosticsMetrics', () => { expect(recordedHistogramCalls[0].value).toBe(100) }) + /** + * @description + * Verifies that DiagnosticsMetrics handles metric client initialization failures gracefully + * without crashing the application. + * + * Test Strategy: + * 1. Configure getMetricClient() mock to reject before instance creation + * - This simulates diagnostics service being unavailable + * - Must be done BEFORE constructor runs since it immediately calls getMetricClient() + * + * 2. 
Create DiagnosticsMetrics instance + * - Constructor calls initMetricClient() synchronously + * - initMetricClient() starts async initialization (returns immediately) + * - Async code races getMetricClient() vs timeout + * - getMetricClient() rejects due to our mock + * - catch block logs error and sets metricsClient = undefined + * + * 3. Wait for async initialization to complete + * - Constructor returns immediately (can't await in constructor) + * - Need to wait for async promise to settle before checking results + * - 10ms is sufficient for promise rejection and catch block execution + * + * 4. Verify graceful degradation + * - Instance was created successfully (no exception thrown) + * - Error was logged to console (operational visibility) + * - metricsClient remains undefined (all record methods will no-op) + * */ it('should handle initialization errors gracefully', async () => { + + // Mock the getMetricClient (which is a Jest mock) to return an error + // Using mockRejectedValueOnce to configure the mock to reject with an error the next time it's called const error = new Error('Initialization failed') ;(getMetricClient as jest.Mock).mockRejectedValueOnce(error) @@ -79,9 +109,10 @@ describe('DiagnosticsMetrics', () => { // Create new instance that will fail initialization const failingMetrics = new DiagnosticsMetrics() - // Wait for initialization attempt + // Wait for initialization attempt (async operation in constructor) await new Promise(resolve => setTimeout(resolve, 10)) + // Verify error was logged (provides operational visibility) expect(consoleErrorSpy).toHaveBeenCalledWith('Failed to initialize metric client:', error) consoleErrorSpy.mockRestore() From 199da90c3e9ab588962869b1e89ee312c899a894 Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Thu, 16 Oct 2025 11:14:05 -0300 Subject: [PATCH 21/22] fix: correct mock context and test setup for metrics tests Add missing mock properties to requestStats tests (`status`, `route`) and enable `LINKED` 
constant mocking for DiagnosticsMetrics attribute limiting tests --- src/metrics/DiagnosticsMetrics.test.ts | 26 +++++++++++++++++++ .../http/middlewares/requestStats.test.ts | 23 +++++++++++++--- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/metrics/DiagnosticsMetrics.test.ts b/src/metrics/DiagnosticsMetrics.test.ts index 170f6b18a..90fc8f5f8 100644 --- a/src/metrics/DiagnosticsMetrics.test.ts +++ b/src/metrics/DiagnosticsMetrics.test.ts @@ -6,6 +6,12 @@ jest.mock('../service/metrics/client', () => ({ getMetricClient: jest.fn(), })) +// Mock constants to control LINKED value +jest.mock('../constants', () => ({ + ...jest.requireActual('../constants'), + LINKED: false, // Default to false, will override in specific tests +})) + import { getMetricClient } from '../service/metrics/client' describe('DiagnosticsMetrics', () => { @@ -291,6 +297,26 @@ describe('DiagnosticsMetrics', () => { }) describe('Attribute Limiting', () => { + beforeEach(() => { + // Enable LINKED for these tests so warnings are triggered + const constants = require('../constants') + Object.defineProperty(constants, 'LINKED', { + value: true, + writable: true, + configurable: true, + }) + }) + + afterEach(() => { + // Reset LINKED back to false + const constants = require('../constants') + Object.defineProperty(constants, 'LINKED', { + value: false, + writable: true, + configurable: true, + }) + }) + it('should allow up to 5 attributes without warning', async () => { const warnSpy = jest.spyOn(console, 'warn').mockImplementation() diff --git a/src/service/worker/runtime/http/middlewares/requestStats.test.ts b/src/service/worker/runtime/http/middlewares/requestStats.test.ts index 86080b6b5..60f8a625e 100644 --- a/src/service/worker/runtime/http/middlewares/requestStats.test.ts +++ b/src/service/worker/runtime/http/middlewares/requestStats.test.ts @@ -75,7 +75,12 @@ describe('requestStats', () => { mockNext = jest.fn().mockResolvedValue(undefined) mockCtx = { req: mockRequest, + 
status: 200, vtex: { + route: { + id: 'test-route', + type: 'public', + }, cancellation: { cancelable: true, source: { cancel: jest.fn() }, @@ -92,7 +97,11 @@ describe('requestStats', () => { expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( 'http_server_requests_total', 1, - {} + { + route_id: 'test-route', + route_type: 'public', + status_code: 200, + } ) }) @@ -111,7 +120,11 @@ describe('requestStats', () => { expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( 'http_server_requests_closed_total', 1, - {} + { + route_id: 'test-route', + route_type: 'public', + status_code: 200, + } ) }) @@ -124,7 +137,11 @@ describe('requestStats', () => { expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( 'http_server_requests_aborted_total', 1, - {} + { + route_id: 'test-route', + route_type: 'public', + status_code: 200, + } ) }) From e9964e0e33f6b823d63f9b69b7e574aed4abb35a Mon Sep 17 00:00:00 2001 From: Daniyel Rocha Date: Thu, 16 Oct 2025 18:08:11 -0300 Subject: [PATCH 22/22] Release beta version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index e16f675ab..98781cff8 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@vtex/api", - "version": "7.2.4", + "version": "7.2.5-beta.0", "description": "VTEX I/O API client", "main": "lib/index.js", "typings": "lib/index.d.ts",