diff --git a/package.json b/package.json
index e16f675ab..98781cff8 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@vtex/api",
-  "version": "7.2.4",
+  "version": "7.2.5-beta.0",
   "description": "VTEX I/O API client",
   "main": "lib/index.js",
   "typings": "lib/index.d.ts",
diff --git a/src/HttpClient/middlewares/metrics.test.ts b/src/HttpClient/middlewares/metrics.test.ts
new file mode 100644
index 000000000..67b5ee456
--- /dev/null
+++ b/src/HttpClient/middlewares/metrics.test.ts
@@ -0,0 +1,335 @@
+// Mock @vtex/diagnostics-nodejs before any imports
+jest.mock('@vtex/diagnostics-nodejs', () => ({
+  Types: {},
+  getMetricClient: jest.fn(),
+  getLogger: jest.fn(),
+}))
+
+jest.mock('../../service/metrics/client', () => ({
+  getMetricClient: jest.fn(),
+}))
+
+jest.mock('../../errors/RequestCancelledError', () => ({
+  RequestCancelledError: class RequestCancelledError extends Error {
+    constructor(message: string) {
+      super(message)
+      this.name = 'RequestCancelledError'
+    }
+  },
+}))
+
+import { DiagnosticsMetrics } from '../../metrics/DiagnosticsMetrics'
+import { MetricsAccumulator } from '../../metrics/MetricsAccumulator'
+import { metricsMiddleware } from './metrics'
+import { MiddlewareContext } from '../typings'
+
+describe('metricsMiddleware', () => {
+  let mockMetrics: jest.Mocked<MetricsAccumulator>
+  let mockDiagnosticsMetrics: jest.Mocked<DiagnosticsMetrics>
+  let mockNext: jest.Mock
+  let mockCtx: MiddlewareContext
+
+  beforeEach(() => {
+    // Mock MetricsAccumulator
+    mockMetrics = {
+      batch: jest.fn(),
+    } as any
+
+    // Mock DiagnosticsMetrics
+    mockDiagnosticsMetrics = {
+      recordLatency: jest.fn(),
+      incrementCounter: jest.fn(),
+      setGauge: jest.fn(),
+    } as any
+
+    // Set up global
+    global.diagnosticsMetrics = mockDiagnosticsMetrics
+
+    // Mock next function
+    mockNext = jest.fn().mockResolvedValue(undefined)
+
+    // Mock context
+    mockCtx = {
+      config: {
+        metric: 'test-client',
+        retryCount: 0,
+      },
+      response: {
+        status: 200,
+      },
+      cacheHit: undefined,
+      inflightHit: undefined,
+      memoizedHit: undefined,
+    } as any
+  })
+
+  afterEach(() => {
+    jest.clearAllMocks()
+    delete (global as any).diagnosticsMetrics
+  })
+
+  describe('successful requests', () => {
+    it('should record metrics for successful request with no cache', async () => {
+      const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' })
+
+      await middleware(mockCtx, mockNext)
+
+      // Legacy metrics
+      const batchCall = mockMetrics.batch.mock.calls[0]
+      expect(batchCall[0]).toBe('http-client-test-client')
+      expect(Array.isArray(batchCall[1])).toBe(true) // hrtime tuple
+      expect(batchCall[2]).toMatchObject({
+        success: 1,
+        'success-miss': 1,
+      })
+
+      // Diagnostics metrics
+      const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0]
+      expect(Array.isArray(latencyCall[0])).toBe(true) // hrtime tuple
+      expect(latencyCall[1]).toMatchObject({
+        component: 'http-client',
+        client_metric: 'test-client',
+        status_code: 200,
+        status: 'success',
+        cache_state: 'miss',
+      })
+
+      // Main request counter with status as attribute
+      expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith(
+        'http_client_requests_total',
+        1,
+        expect.objectContaining({
+          component: 'http-client',
+          client_metric: 'test-client',
+          status_code: 200,
+          status: 'success',
+        })
+      )
+
+      // Cache counter with cache_state as attribute
+      expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith(
+        'http_client_cache_total',
+        1,
+        expect.objectContaining({
+          component: 'http-client',
+          client_metric: 'test-client',
status: 'success', + cache_state: 'miss', + }) + ) + }) + + it('should record metrics with cache hit', async () => { + mockCtx.cacheHit = { revalidated: 1 } + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + cache_state: 'hit', + }) + + // Should have cache counter with cache_state attribute + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_client_cache_total', + 1, + expect.objectContaining({ + component: 'http-client', + client_metric: 'test-client', + cache_state: 'hit', + }) + ) + }) + + it('should record metrics with inflight hit', async () => { + mockCtx.inflightHit = true + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + cache_state: 'inflight', + }) + }) + + it('should record metrics with memoized hit', async () => { + mockCtx.memoizedHit = true + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + cache_state: 'memoized', + }) + }) + + it('should include retry count in attributes when retries occurred', async () => { + mockCtx.config.retryCount = 2 + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + // Histogram should not include retry_count (only in counter) + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'success', + cache_state: 'miss', + }) + + // Should have retry counter with retry_count attribute + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_client_requests_retried_total', + 1, + expect.objectContaining({ + component: 'http-client', + client_metric: 'test-client', + status: 'success', + status_code: 200, + retry_count: 2, // Number, not string + }) + ) + }) + }) + + describe('error handling', () => { + it('should record metrics for timeout errors', async () => { + mockNext.mockRejectedValueOnce({ + response: { + data: { code: 'ProxyTimeout' }, // TIMEOUT_CODE + }, + }) + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, mockNext)).rejects.toMatchObject({ + response: expect.any(Object), + }) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'timeout', + }) + }) + + it('should record metrics for aborted requests', async () => { + mockNext.mockRejectedValueOnce({ + code: 'ECONNABORTED', + }) + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, mockNext)).rejects.toMatchObject({ + code: 'ECONNABORTED', + }) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'aborted', + }) + }) + + it('should record metrics for cancelled requests', async () => { + mockNext.mockRejectedValueOnce({ + message: 'Request cancelled', + }) + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, 
mockNext)).rejects.toThrow('Request cancelled') + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'cancelled', + }) + }) + + it('should record metrics for HTTP error responses', async () => { + mockNext.mockRejectedValueOnce({ + response: { + status: 500, + }, + }) + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, mockNext)).rejects.toMatchObject({ + response: { status: 500 }, + }) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'error', + }) + }) + + it('should record metrics for generic errors', async () => { + mockNext.mockRejectedValueOnce(new Error('Generic error')) + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await expect(middleware(mockCtx, mockNext)).rejects.toThrow('Generic error') + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: 'error', + }) + }) + }) + + describe('backward compatibility', () => { + it('should maintain legacy metrics when config.metric is set', async () => { + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + // Verify legacy metrics still called + expect(mockMetrics.batch).toHaveBeenCalledTimes(1) + const batchCall = mockMetrics.batch.mock.calls[0] + expect(batchCall[0]).toBe('http-client-test-client') + expect(Array.isArray(batchCall[1])).toBe(true) + expect(batchCall[2]).toBeTruthy() + }) + + it('should not record metrics when config.metric is not set', async () => { + mockCtx.config.metric = undefined + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + expect(mockMetrics.batch).not.toHaveBeenCalled() + expect(mockDiagnosticsMetrics.recordLatency).not.toHaveBeenCalled() + expect(mockDiagnosticsMetrics.incrementCounter).not.toHaveBeenCalled() + }) + }) + + describe('graceful degradation', () => { + it('should work without global.diagnosticsMetrics', async () => { + delete (global as any).diagnosticsMetrics + + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const middleware = metricsMiddleware({ metrics: mockMetrics, name: 'test' }) + + await middleware(mockCtx, mockNext) + + // Legacy metrics still work + expect(mockMetrics.batch).toHaveBeenCalled() + + // Warning logged + expect(consoleWarnSpy).toHaveBeenCalledWith( + 'DiagnosticsMetrics not available. HTTP client metrics not reported.' 
+ ) + + consoleWarnSpy.mockRestore() + }) + }) +}) + diff --git a/src/HttpClient/middlewares/metrics.ts b/src/HttpClient/middlewares/metrics.ts index 2e8b3f349..c2af8ba23 100644 --- a/src/HttpClient/middlewares/metrics.ts +++ b/src/HttpClient/middlewares/metrics.ts @@ -13,6 +13,7 @@ import { import { TIMEOUT_CODE } from '../../utils/retry' import { statusLabel } from '../../utils/status' import { MiddlewareContext } from '../typings' +import { Attributes } from '@opentelemetry/api' interface MetricsOpts { metrics?: MetricsAccumulator @@ -75,35 +76,80 @@ export const metricsMiddleware = ({metrics, serverTiming, name}: MetricsOpts) => Object.assign(extensions, {[status]: 1}) + // Determine cache state for diagnostics metrics + let cacheState = 'none' if (ctx.cacheHit) { Object.assign(extensions, ctx.cacheHit, {[`${status}-hit`]: 1}) + cacheState = 'hit' } else if (!ctx.inflightHit && !ctx.memoizedHit) { // Lets us know how many calls passed through to origin Object.assign(extensions, {[`${status}-miss`]: 1}) + cacheState = 'miss' } if (ctx.inflightHit) { Object.assign(extensions, {[`${status}-inflight`]: 1}) + cacheState = 'inflight' } if (ctx.memoizedHit) { Object.assign(extensions, {[`${status}-memoized`]: 1}) + cacheState = 'memoized' } - if (ctx.config.retryCount) { - const retryCount = ctx.config.retryCount + const retryCount = ctx.config.retryCount || 0 - if (retryCount > 0) { - extensions[`retry-${status}-${retryCount}`] = 1 - } + if (retryCount > 0) { + extensions[`retry-${status}-${retryCount}`] = 1 } const end = status === 'success' && !ctx.cacheHit && !ctx.inflightHit && !ctx.memoizedHit ? process.hrtime(start) : undefined + // Legacy metrics (backward compatibility) metrics.batch(label, end, extensions) + // New diagnostics metrics with stable names and attributes + if (global.diagnosticsMetrics) { + const elapsed = process.hrtime(start) + const rawStatusCode = ctx.response?.status || errorStatus + const baseAttributes: Attributes = { + component: 'http-client', + client_metric: ctx.config.metric, + status_code: rawStatusCode, + status, + } + + // Record latency histogram with all context + global.diagnosticsMetrics.recordLatency(elapsed, { + ...baseAttributes, + cache_state: cacheState, + }) + + // Increment counters for different event types (replaces extensions behavior) + // Main request counter with status as attribute + global.diagnosticsMetrics.incrementCounter('http_client_requests_total', 1, baseAttributes) + + // Cache counter with cache_state as attribute (replaces extensions like 'success-hit', 'error-miss') + if (cacheState !== 'none') { + global.diagnosticsMetrics.incrementCounter('http_client_cache_total', 1, { + ...baseAttributes, + cache_state: cacheState, + }) + } + + // Retry counter (replaces 'retry-{status}-{count}' extensions) + if (retryCount > 0) { + global.diagnosticsMetrics.incrementCounter('http_client_requests_retried_total', 1, { + ...baseAttributes, + retry_count: retryCount, + }) + } + } else { + console.warn('DiagnosticsMetrics not available. 
HTTP client metrics not reported.') + } + if (ctx.config.verbose) { console.log(`VERBOSE: ${name}.${ctx.config.label}`, { ...extensions, diff --git a/src/HttpClient/middlewares/request/HttpAgentSingleton.test.ts b/src/HttpClient/middlewares/request/HttpAgentSingleton.test.ts new file mode 100644 index 000000000..172a26aad --- /dev/null +++ b/src/HttpClient/middlewares/request/HttpAgentSingleton.test.ts @@ -0,0 +1,102 @@ +import { HttpAgentSingleton } from './HttpAgentSingleton' + +// Mock the createHttpAgent function (external dependency) +jest.mock('../../../HttpClient/agents', () => ({ + createHttpAgent: jest.fn(() => ({ + sockets: {}, + freeSockets: {}, + requests: {}, + })), +})) + +describe('HttpAgentSingleton', () => { + let recordedGaugeCalls: Map> + + beforeEach(() => { + // Reset call tracking + recordedGaugeCalls = new Map() + + // Create a minimal stub that tracks gauge calls (not mocking DiagnosticsMetrics class) + global.diagnosticsMetrics = { + setGauge: (name: string, value: number, attributes?: any) => { + if (!recordedGaugeCalls.has(name)) { + recordedGaugeCalls.set(name, []) + } + recordedGaugeCalls.get(name)!.push({ value, attributes }) + }, + } as any + + // Reset the agent's internal state + const agent = HttpAgentSingleton.getHttpAgent() + ;(agent as any).sockets = {} + ;(agent as any).freeSockets = {} + ;(agent as any).requests = {} + }) + + afterEach(() => { + // Clean up global + delete (global as any).diagnosticsMetrics + }) + + describe('httpAgentStats', () => { + it('should return current HTTP agent statistics', () => { + const agent = HttpAgentSingleton.getHttpAgent() + + // Mock some socket data + ;(agent as any).sockets = { 'host1:80': [1, 2], 'host2:443': [1] } + ;(agent as any).freeSockets = { 'host1:80': [1] } + ;(agent as any).requests = { 'host1:80': [1, 2, 3] } + + const stats = HttpAgentSingleton.httpAgentStats() + + expect(stats).toEqual({ + sockets: 3, + freeSockets: 1, + pendingRequests: 3, + }) + }) + + it('should return zero counts for empty agent', () => { + const stats = HttpAgentSingleton.httpAgentStats() + + expect(stats).toEqual({ + sockets: 0, + freeSockets: 0, + pendingRequests: 0, + }) + }) + }) + + describe('updateHttpAgentMetrics', () => { + it('should report HTTP agent stats as gauges to diagnostics metrics', () => { + const agent = HttpAgentSingleton.getHttpAgent() + + // Mock some socket data + ;(agent as any).sockets = { 'host1:80': [1, 2] } + ;(agent as any).freeSockets = { 'host1:80': [1] } + ;(agent as any).requests = { 'host1:80': [1, 2, 3] } + + HttpAgentSingleton.updateHttpAgentMetrics() + + expect(recordedGaugeCalls.get('http_agent_sockets_current')).toEqual([{ value: 2, attributes: {} }]) + expect(recordedGaugeCalls.get('http_agent_free_sockets_current')).toEqual([{ value: 1, attributes: {} }]) + expect(recordedGaugeCalls.get('http_agent_pending_requests_current')).toEqual([{ value: 3, attributes: {} }]) + }) + + it('should handle missing global.diagnosticsMetrics gracefully', () => { + delete (global as any).diagnosticsMetrics + + // Should not throw + expect(() => HttpAgentSingleton.updateHttpAgentMetrics()).not.toThrow() + }) + + it('should report zero values when agent has no active connections', () => { + HttpAgentSingleton.updateHttpAgentMetrics() + + expect(recordedGaugeCalls.get('http_agent_sockets_current')).toEqual([{ value: 0, attributes: {} }]) + expect(recordedGaugeCalls.get('http_agent_free_sockets_current')).toEqual([{ value: 0, attributes: {} }]) + 
+      expect(recordedGaugeCalls.get('http_agent_pending_requests_current')).toEqual([{ value: 0, attributes: {} }])
+    })
+  })
+})
+
diff --git a/src/HttpClient/middlewares/request/HttpAgentSingleton.ts b/src/HttpClient/middlewares/request/HttpAgentSingleton.ts
index d51f475cb..057f0f5fe 100644
--- a/src/HttpClient/middlewares/request/HttpAgentSingleton.ts
+++ b/src/HttpClient/middlewares/request/HttpAgentSingleton.ts
@@ -21,6 +21,25 @@ export class HttpAgentSingleton {
       sockets,
     }
   }
+
+  /**
+   * Update HTTP agent statistics as diagnostics metrics (gauges).
+   * This method should be called periodically to export current HTTP agent state.
+   */
+  public static updateHttpAgentMetrics() {
+    if (!global.diagnosticsMetrics) {
+      console.warn('DiagnosticsMetrics not available. HTTP agent metrics not reported.')
+      return
+    }
+
+    const stats = HttpAgentSingleton.httpAgentStats()
+
+    // Report HTTP agent stats as gauges (current state)
+    global.diagnosticsMetrics.setGauge('http_agent_sockets_current', stats.sockets, {})
+    global.diagnosticsMetrics.setGauge('http_agent_free_sockets_current', stats.freeSockets, {})
+    global.diagnosticsMetrics.setGauge('http_agent_pending_requests_current', stats.pendingRequests, {})
+  }
+
   private static httpAgent: HttpAgent

   private static count(obj: { [key: string]: any[] }) {
diff --git a/src/constants.ts b/src/constants.ts
index cab1e5ba8..6cdb680fd 100644
--- a/src/constants.ts
+++ b/src/constants.ts
@@ -186,6 +186,7 @@ export const INSPECT_DEBUGGER_PORT = 5858

 export const cancellableMethods = new Set(['GET', 'OPTIONS', 'HEAD'])

 export const LOG_CLIENT_INIT_TIMEOUT_MS = 5000
+export const METRIC_CLIENT_INIT_TIMEOUT_MS = 5000

 export const OTEL_EXPORTER_OTLP_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_ENDPOINT as string;
diff --git a/src/index.ts b/src/index.ts
index 725af54c7..58d4d4af1 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -5,6 +5,7 @@ export * from './clients'
 export * from './errors'
 export * from './HttpClient'
 export * from './metrics/MetricsAccumulator'
+export * from './metrics/DiagnosticsMetrics'
 export * from './responses'
 export * from './service/worker/runtime/Service'
 export * from './service/worker/runtime/method'
diff --git a/src/metrics/DiagnosticsMetrics.test.ts b/src/metrics/DiagnosticsMetrics.test.ts
new file mode 100644
index 000000000..90fc8f5f8
--- /dev/null
+++ b/src/metrics/DiagnosticsMetrics.test.ts
@@ -0,0 +1,438 @@
+import { Types } from '@vtex/diagnostics-nodejs'
+import { DiagnosticsMetrics } from './DiagnosticsMetrics'
+
+// Mock only the external I/O boundary (getMetricClient)
+jest.mock('../service/metrics/client', () => ({
+  getMetricClient: jest.fn(),
+}))
+
+// Mock constants to control LINKED value
+jest.mock('../constants', () => ({
+  ...jest.requireActual('../constants'),
+  LINKED: false, // Default to false, will override in specific tests
+}))
+
+import { getMetricClient } from '../service/metrics/client'
+
+describe('DiagnosticsMetrics', () => {
+  let diagnosticsMetrics: DiagnosticsMetrics
+  let mockMetricsClient: Types.MetricClient
+  let recordedHistogramCalls: Array<{ value: number; attributes?: any }>
+  let recordedCounterCalls: Map<string, Array<{ value: number; attributes?: any }>>
+  let recordedGaugeCalls: Map<string, Array<{ value: number; attributes?: any }>>
+
+  beforeEach(() => {
+    // Reset call tracking
+    recordedHistogramCalls = []
+    recordedCounterCalls = new Map()
+    recordedGaugeCalls = new Map()
+
+    // Create a mock client that tracks calls instead of using jest.fn()
+    mockMetricsClient = {
+      createHistogram: (name: string, options: any) => ({
+        record: (value: number, attributes?: any) => {
recordedHistogramCalls.push({ value, attributes }) + }, + }), + createCounter: (name: string, options: any) => ({ + add: (value: number, attributes?: any) => { + if (!recordedCounterCalls.has(name)) { + recordedCounterCalls.set(name, []) + } + recordedCounterCalls.get(name)!.push({ value, attributes }) + }, + }), + createGauge: (name: string, options: any) => ({ + set: (value: number, attributes?: any) => { + if (!recordedGaugeCalls.has(name)) { + recordedGaugeCalls.set(name, []) + } + recordedGaugeCalls.get(name)!.push({ value, attributes }) + }, + }), + } as any + + // Mock only the external call + ;(getMetricClient as jest.Mock).mockResolvedValue(mockMetricsClient) + + // Create real instance + diagnosticsMetrics = new DiagnosticsMetrics() + }) + + afterEach(() => { + jest.clearAllMocks() + }) + + describe('initialization', () => { + it('should initialize metrics client and create latency histogram in constructor', async () => { + // Wait for initialization to complete + await new Promise(resolve => setTimeout(resolve, 10)) + + expect(getMetricClient).toHaveBeenCalledTimes(1) + + // Verify histogram was created by recording a value + diagnosticsMetrics.recordLatency(100) + expect(recordedHistogramCalls).toHaveLength(1) + expect(recordedHistogramCalls[0].value).toBe(100) + }) + + /** + * @description + * Verifies that DiagnosticsMetrics handles metric client initialization failures gracefully + * without crashing the application. + * + * Test Strategy: + * 1. Configure getMetricClient() mock to reject before instance creation + * - This simulates diagnostics service being unavailable + * - Must be done BEFORE constructor runs since it immediately calls getMetricClient() + * + * 2. Create DiagnosticsMetrics instance + * - Constructor calls initMetricClient() synchronously + * - initMetricClient() starts async initialization (returns immediately) + * - Async code races getMetricClient() vs timeout + * - getMetricClient() rejects due to our mock + * - catch block logs error and sets metricsClient = undefined + * + * 3. Wait for async initialization to complete + * - Constructor returns immediately (can't await in constructor) + * - Need to wait for async promise to settle before checking results + * - 10ms is sufficient for promise rejection and catch block execution + * + * 4. 
Verify graceful degradation + * - Instance was created successfully (no exception thrown) + * - Error was logged to console (operational visibility) + * - metricsClient remains undefined (all record methods will no-op) + * */ + it('should handle initialization errors gracefully', async () => { + + // Mock the getMetricClient (which is a Jest mock) to return an error + // Using mockRejectedValueOnce to configure the mock to reject with an error the next time it's called + const error = new Error('Initialization failed') + ;(getMetricClient as jest.Mock).mockRejectedValueOnce(error) + + const consoleErrorSpy = jest.spyOn(console, 'error').mockImplementation() + + // Create new instance that will fail initialization + const failingMetrics = new DiagnosticsMetrics() + + // Wait for initialization attempt (async operation in constructor) + await new Promise(resolve => setTimeout(resolve, 10)) + + // Verify error was logged (provides operational visibility) + expect(consoleErrorSpy).toHaveBeenCalledWith('Failed to initialize metric client:', error) + + consoleErrorSpy.mockRestore() + }) + }) + + describe('recordLatency', () => { + beforeEach(async () => { + // Wait for initialization + await new Promise(resolve => setTimeout(resolve, 10)) + }) + + it('should record latency from hrtime tuple to single shared histogram', () => { + const hrtimeDiff: [number, number] = [1, 500000000] // 1.5 seconds + const attributes = { operation: 'api-call', status: '2xx' } + + diagnosticsMetrics.recordLatency(hrtimeDiff, attributes) + + expect(recordedHistogramCalls).toHaveLength(1) + expect(recordedHistogramCalls[0]).toEqual({ value: 1500, attributes }) + }) + + it('should record latency from milliseconds number to single shared histogram', () => { + const milliseconds = 42.5 + const attributes = { operation: 'db-query', status: 'success' } + + diagnosticsMetrics.recordLatency(milliseconds, attributes) + + expect(recordedHistogramCalls).toHaveLength(1) + expect(recordedHistogramCalls[0]).toEqual({ value: milliseconds, attributes }) + }) + + it('should record latency without attributes', () => { + const milliseconds = 100 + + diagnosticsMetrics.recordLatency(milliseconds) + + expect(recordedHistogramCalls).toHaveLength(1) + expect(recordedHistogramCalls[0]).toEqual({ value: milliseconds, attributes: undefined }) + }) + + it('should use the same histogram for all latency measurements', () => { + diagnosticsMetrics.recordLatency(10, { operation: 'op1' }) + diagnosticsMetrics.recordLatency(20, { operation: 'op2' }) + diagnosticsMetrics.recordLatency(30, { operation: 'op3' }) + + // All recordings go to the same histogram + expect(recordedHistogramCalls).toHaveLength(3) + expect(recordedHistogramCalls[0].value).toBe(10) + expect(recordedHistogramCalls[1].value).toBe(20) + expect(recordedHistogramCalls[2].value).toBe(30) + }) + + it('should warn if not initialized', () => { + const uninitializedMetrics = new DiagnosticsMetrics() + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + // Don't wait for initialization + uninitializedMetrics.recordLatency(100) + + expect(consoleWarnSpy).toHaveBeenCalledWith('DiagnosticsMetrics not initialized. 
Call initialize() first.') + consoleWarnSpy.mockRestore() + }) + }) + + describe('incrementCounter', () => { + beforeEach(async () => { + await new Promise(resolve => setTimeout(resolve, 10)) + }) + + it('should increment counter with value and attributes', () => { + const attributes = { method: 'GET', status: '2xx' } + + diagnosticsMetrics.incrementCounter('http_requests_total', 1, attributes) + + const calls = recordedCounterCalls.get('http_requests_total') + expect(calls).toHaveLength(1) + expect(calls![0]).toEqual({ value: 1, attributes }) + }) + + it('should increment counter without attributes', () => { + diagnosticsMetrics.incrementCounter('requests', 5) + + const calls = recordedCounterCalls.get('requests') + expect(calls).toHaveLength(1) + expect(calls![0]).toEqual({ value: 5, attributes: undefined }) + }) + + it('should reuse existing counter for same metric name', () => { + diagnosticsMetrics.incrementCounter('requests', 1) + diagnosticsMetrics.incrementCounter('requests', 2) + diagnosticsMetrics.incrementCounter('requests', 3) + + const calls = recordedCounterCalls.get('requests') + expect(calls).toHaveLength(3) + expect(calls![0].value).toBe(1) + expect(calls![1].value).toBe(2) + expect(calls![2].value).toBe(3) + }) + + it('should create separate counters for different metric names', () => { + diagnosticsMetrics.incrementCounter('counter1', 1) + diagnosticsMetrics.incrementCounter('counter2', 2) + + expect(recordedCounterCalls.get('counter1')).toHaveLength(1) + expect(recordedCounterCalls.get('counter2')).toHaveLength(1) + expect(recordedCounterCalls.get('counter1')![0].value).toBe(1) + expect(recordedCounterCalls.get('counter2')![0].value).toBe(2) + }) + + it('should warn if not initialized', () => { + const uninitializedMetrics = new DiagnosticsMetrics() + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + uninitializedMetrics.incrementCounter('test', 1) + + expect(consoleWarnSpy).toHaveBeenCalledWith('DiagnosticsMetrics not initialized. 
Call initialize() first.') + consoleWarnSpy.mockRestore() + }) + }) + + describe('setGauge', () => { + beforeEach(async () => { + await new Promise(resolve => setTimeout(resolve, 10)) + }) + + it('should set gauge with value and attributes', () => { + const attributes = { cache: 'pages' } + + diagnosticsMetrics.setGauge('cache_items_current', 1024, attributes) + + const calls = recordedGaugeCalls.get('cache_items_current') + expect(calls).toHaveLength(1) + expect(calls![0]).toEqual({ value: 1024, attributes }) + }) + + it('should set gauge without attributes', () => { + diagnosticsMetrics.setGauge('memory_usage', 512) + + const calls = recordedGaugeCalls.get('memory_usage') + expect(calls).toHaveLength(1) + expect(calls![0]).toEqual({ value: 512, attributes: undefined }) + }) + + it('should reuse existing gauge for same metric name', () => { + diagnosticsMetrics.setGauge('gauge1', 10) + diagnosticsMetrics.setGauge('gauge1', 20) + diagnosticsMetrics.setGauge('gauge1', 30) + + const calls = recordedGaugeCalls.get('gauge1') + expect(calls).toHaveLength(3) + expect(calls![0].value).toBe(10) + expect(calls![1].value).toBe(20) + expect(calls![2].value).toBe(30) + }) + + it('should create separate gauges for different metric names', () => { + diagnosticsMetrics.setGauge('gauge1', 100) + diagnosticsMetrics.setGauge('gauge2', 200) + + expect(recordedGaugeCalls.get('gauge1')).toHaveLength(1) + expect(recordedGaugeCalls.get('gauge2')).toHaveLength(1) + expect(recordedGaugeCalls.get('gauge1')![0].value).toBe(100) + expect(recordedGaugeCalls.get('gauge2')![0].value).toBe(200) + }) + + it('should warn if not initialized', () => { + const uninitializedMetrics = new DiagnosticsMetrics() + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + uninitializedMetrics.setGauge('test', 100) + + expect(consoleWarnSpy).toHaveBeenCalledWith('DiagnosticsMetrics not initialized. 
Call initialize() first.') + consoleWarnSpy.mockRestore() + }) + }) + + describe('Attribute Limiting', () => { + beforeEach(() => { + // Enable LINKED for these tests so warnings are triggered + const constants = require('../constants') + Object.defineProperty(constants, 'LINKED', { + value: true, + writable: true, + configurable: true, + }) + }) + + afterEach(() => { + // Reset LINKED back to false + const constants = require('../constants') + Object.defineProperty(constants, 'LINKED', { + value: false, + writable: true, + configurable: true, + }) + }) + + it('should allow up to 5 attributes without warning', async () => { + const warnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const attributes = { + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + } + + diagnosticsMetrics.recordLatency([0, 1000000], attributes) + + expect(recordedHistogramCalls[0].attributes).toEqual(attributes) + expect(warnSpy).not.toHaveBeenCalled() + + warnSpy.mockRestore() + }) + + it('should limit attributes to 5 and warn when exceeded (recordLatency)', async () => { + const warnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const attributes = { + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + attr6: 'value6', + attr7: 'value7', + } + + diagnosticsMetrics.recordLatency([0, 1000000], attributes) + + // Should only include first 5 attributes + const recorded = recordedHistogramCalls[0].attributes + expect(Object.keys(recorded)).toHaveLength(5) + expect(recorded).toEqual({ + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + }) + + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('Attribute limit exceeded: 7 attributes provided, using only the first 5') + ) + + warnSpy.mockRestore() + }) + + it('should limit attributes to 5 and warn when exceeded (incrementCounter)', async () => { + const warnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const attributes = { + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + attr6: 'value6', + } + + diagnosticsMetrics.incrementCounter('test_counter', 1, attributes) + + // Should only include first 5 attributes + const recorded = recordedCounterCalls.get('test_counter')![0].attributes + expect(Object.keys(recorded)).toHaveLength(5) + expect(recorded).toEqual({ + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + }) + + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('Attribute limit exceeded: 6 attributes provided, using only the first 5') + ) + + warnSpy.mockRestore() + }) + + it('should limit attributes to 5 and warn when exceeded (setGauge)', async () => { + const warnSpy = jest.spyOn(console, 'warn').mockImplementation() + + const attributes = { + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + attr6: 'value6', + attr7: 'value7', + attr8: 'value8', + } + + diagnosticsMetrics.setGauge('test_gauge', 100, attributes) + + // Should only include first 5 attributes + const recorded = recordedGaugeCalls.get('test_gauge')![0].attributes + expect(Object.keys(recorded)).toHaveLength(5) + expect(recorded).toEqual({ + attr1: 'value1', + attr2: 'value2', + attr3: 'value3', + attr4: 'value4', + attr5: 'value5', + }) + + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('Attribute limit exceeded: 8 attributes provided, using only the first 5') 
+ ) + + warnSpy.mockRestore() + }) + }) +}) diff --git a/src/metrics/DiagnosticsMetrics.ts b/src/metrics/DiagnosticsMetrics.ts new file mode 100644 index 000000000..35d09c217 --- /dev/null +++ b/src/metrics/DiagnosticsMetrics.ts @@ -0,0 +1,241 @@ +import { Attributes } from '@opentelemetry/api' +import { Types } from '@vtex/diagnostics-nodejs' +import { getMetricClient } from '../service/metrics/client' +import { METRIC_CLIENT_INIT_TIMEOUT_MS, LINKED } from '../constants' + +/** + * Maximum number of attributes allowed per metric to control cardinality. + * This limit applies to external usage (VTEX IO Apps) to prevent unbounded metric dimensions. + */ +const MAX_ATTRIBUTES = 5 + +/** + * Converts an hrtime tuple [seconds, nanoseconds] to milliseconds. + */ +function hrtimeToMillis(hrtime: [number, number]): number { + return (hrtime[0] * 1e3) + (hrtime[1] / 1e6) +} + +/** + * Limits the number of attributes to prevent high cardinality. + * Takes the first MAX_ATTRIBUTES entries if the limit is exceeded. + * + * @param attributes Optional attributes object + * @returns Limited attributes object or undefined + */ +function limitAttributes(attributes?: Attributes): Attributes | undefined { + if (!attributes) { + return undefined + } + + const entries = Object.entries(attributes) + if (entries.length <= MAX_ATTRIBUTES) { + return attributes + } + + if (LINKED) { + console.warn( + `Attribute limit exceeded: ${entries.length} attributes provided, using only the first ${MAX_ATTRIBUTES}. ` + + `Consider reducing the number of attributes to avoid high cardinality.` + ) + } + + return Object.fromEntries(entries.slice(0, MAX_ATTRIBUTES)) +} + +/** + * DiagnosticsMetrics provides a high-level API for recording metrics using + * the @vtex/diagnostics-nodejs library. It completely abstracts instrument + * management, bucket configuration, and lifecycle. + * + * Uses a single histogram for all latency measurements with attributes to differentiate. + * This follows OpenTelemetry best practices and reduces metric cardinality. + * + * @example + * ```typescript + * const diagnosticsMetrics = new DiagnosticsMetrics() + * diagnosticsMetrics.initMetricClient() + * + * // Record latency with operation type in attributes + * const start = process.hrtime() + * // ... do work ... + * diagnosticsMetrics.recordLatency(process.hrtime(start), { operation: 'api-call', status: '2xx' }) + * + * // Or from milliseconds + * diagnosticsMetrics.recordLatency(42.5, { operation: 'db-query', status: 'success' }) + * + * // Increment a counter + * diagnosticsMetrics.incrementCounter('http_requests_total', 1, { method: 'GET', status: '2xx' }) + * + * // Set a gauge value + * diagnosticsMetrics.setGauge('cache_items_current', 1024, { cache: 'pages' }) + * ``` + */ +export class DiagnosticsMetrics { + private metricsClient: Types.MetricClient | undefined + private clientInitPromise: Promise | undefined + + private latencyHistogram: Types.Histogram | undefined + // Counters and gauges keyed by name + private counters: Map + private gauges: Map + + constructor() { + this.counters = new Map() + this.gauges = new Map() + this.initMetricClient() + } + + /** + * Initialize the metrics client with timeout handling. + * Called automatically in constructor. 
+ */ + private initMetricClient(): Promise { + if (this.clientInitPromise) { + return this.clientInitPromise + } + + this.clientInitPromise = (async () => { + try { + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error('Metric client initialization timeout')), METRIC_CLIENT_INIT_TIMEOUT_MS) + }) + + this.metricsClient = await Promise.race([ + getMetricClient(), + timeoutPromise + ]) + + // Create the single latency histogram after client is ready + this.createLatencyHistogram() + + return this.metricsClient + } catch (error) { + console.error('Failed to initialize metric client:', error) + return undefined + } + })() + + return this.clientInitPromise + } + + /** + * Create the single shared histogram for all latency measurements. + * Called after metric client is initialized. + */ + private createLatencyHistogram(): void { + if (!this.metricsClient) { + return + } + + this.latencyHistogram = this.metricsClient.createHistogram('io_app_operation_duration_milliseconds', { + description: 'Duration of VTEX IO app operations in milliseconds', + unit: 'ms', + }) + } + + /** + * Record a latency measurement using the single shared histogram. + * Accepts either an hrtime tuple from process.hrtime() or milliseconds as a number. + * Use attributes to differentiate between different operations. + * + * @param value Either [seconds, nanoseconds] from process.hrtime() or milliseconds + * @param attributes Attributes including 'operation' to identify the operation type (max of MAX_ATTRIBUTES attributes) + * + * @example + * ```typescript + * const start = process.hrtime() + * // ... do work ... + * diagnosticsMetrics.recordLatency(process.hrtime(start), { operation: 'api-call', status: '2xx' }) + * + * // Or with milliseconds + * diagnosticsMetrics.recordLatency(42.5, { operation: 'db-query', status: 'success' }) + * ``` + */ + public recordLatency(value: [number, number] | number, attributes?: Attributes): void { + if (!this.latencyHistogram) { + console.warn('DiagnosticsMetrics not initialized. Call initialize() first.') + return + } + + // Convert hrtime to milliseconds if needed + const milliseconds = Array.isArray(value) ? hrtimeToMillis(value) : value + + // Limit attributes to prevent high cardinality + const limitedAttributes = limitAttributes(attributes) + + // Record to the single shared histogram with attributes + this.latencyHistogram.record(milliseconds, limitedAttributes) + } + + /** + * Increment a counter by a specific value. + * Multiple counters are stored by name since counters represent different types of events. + * + * @param name Counter name (e.g., 'http_requests_total', 'cache_hits_total') + * @param value Amount to increment by (typically 1) + * @param attributes Optional attributes for the counter (max of MAX_ATTRIBUTES attributes, e.g., { method: 'GET', status: '2xx' }) + * + * @example + * ```typescript + * diagnosticsMetrics.incrementCounter('http_requests_total', 1, { method: 'GET', status: '2xx' }) + * ``` + */ + public incrementCounter(name: string, value: number, attributes?: Attributes): void { + if (!this.metricsClient) { + console.warn('DiagnosticsMetrics not initialized. 
Call initialize() first.') + return + } + + // Get or create counter instrument + if (!this.counters.has(name)) { + const counter = this.metricsClient.createCounter(name, { + description: `Counter for ${name}`, + unit: '1', + }) + this.counters.set(name, counter) + } + + // Limit attributes to prevent high cardinality + const limitedAttributes = limitAttributes(attributes) + + // Increment the counter + this.counters.get(name)!.add(value, limitedAttributes) + } + + /** + * Set a gauge to a specific value (current state). + * Multiple gauges are stored by name since gauges represent different types of measurements. + * + * @param name Gauge name (e.g., 'cache_items_current', 'memory_usage_bytes') + * @param value Current value + * @param attributes Optional attributes for the gauge (max of MAX_ATTRIBUTES attributes, e.g., { cache: 'pages' }) + * + * @example + * ```typescript + * diagnosticsMetrics.setGauge('cache_items_current', 1024, { cache: 'pages' }) + * ``` + */ + public setGauge(name: string, value: number, attributes?: Attributes): void { + if (!this.metricsClient) { + console.warn('DiagnosticsMetrics not initialized. Call initialize() first.') + return + } + + // Get or create gauge instrument + if (!this.gauges.has(name)) { + const gauge = this.metricsClient.createGauge(name, { + description: `Gauge for ${name}`, + unit: '1', + }) + this.gauges.set(name, gauge) + } + + // Limit attributes to prevent high cardinality + const limitedAttributes = limitAttributes(attributes) + + // Set the gauge value + this.gauges.get(name)!.set(value, limitedAttributes) + } +} + diff --git a/src/service/index.ts b/src/service/index.ts index d531e1247..ab2ed5326 100644 --- a/src/service/index.ts +++ b/src/service/index.ts @@ -3,15 +3,21 @@ import cluster from 'cluster' import { HTTP_SERVER_PORT } from '../constants' import { MetricsAccumulator } from '../metrics/MetricsAccumulator' +import { DiagnosticsMetrics } from '../metrics/DiagnosticsMetrics' import { getServiceJSON } from './loaders' import { LogLevel, logOnceToDevConsole } from './logger' export const startApp = async () => { await initializeTelemetry() - // Initialize global.metrics for both master and worker processes + // Initialize both legacy and new metrics systems independently + // Legacy: MetricsAccumulator for backward compatibility + // External apps and internal code can continue using this during migration global.metrics = new MetricsAccumulator() + // New: DiagnosticsMetrics for new diagnostics-based instrumentation + global.diagnosticsMetrics = new DiagnosticsMetrics() + const serviceJSON = getServiceJSON() try { // if it is a master process then call setting up worker process @@ -36,6 +42,7 @@ declare global { namespace NodeJS { interface Global { metrics: MetricsAccumulator + diagnosticsMetrics: DiagnosticsMetrics } } } diff --git a/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.test.ts b/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.test.ts new file mode 100644 index 000000000..4243b6ec8 --- /dev/null +++ b/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.test.ts @@ -0,0 +1,261 @@ +// Mock the diagnostics-nodejs module to avoid deep import issues (must be before imports) +jest.mock('@vtex/diagnostics-nodejs', () => ({ + Types: {}, +})) + +describe('Metric Schema Directive', () => { + let mockMetricsAccumulator: any + let mockDiagnosticsMetrics: any + let mockContext: any + + beforeEach(() => { + // Reset global.metrics and global.diagnosticsMetrics + 
mockMetricsAccumulator = { + batch: jest.fn(), + } + mockDiagnosticsMetrics = { + recordLatency: jest.fn(), + incrementCounter: jest.fn(), + } + ;(global as any).metrics = mockMetricsAccumulator + ;(global as any).diagnosticsMetrics = mockDiagnosticsMetrics + ;(global as any).APP = { NAME: 'vtex.render-server@1.x' } + + // Create a mock context + mockContext = { + graphql: { + status: undefined, + }, + } + }) + + afterEach(() => { + delete (global as any).metrics + delete (global as any).diagnosticsMetrics + delete (global as any).APP + jest.clearAllMocks() + }) + + it('should record successful field resolution metrics', async () => { + const mockResolver = jest.fn().mockResolvedValue('test-result') + + // Simulate what the Metric directive does + const wrappedResolver = async (root: any, args: any, ctx: any, info: any) => { + let failedToResolve = false + let result: any = null + let ellapsed: [number, number] = [0, 0] + + try { + const start = process.hrtime() + result = await mockResolver(root, args, ctx, info) + ellapsed = process.hrtime(start) + } catch (error) { + result = error + failedToResolve = true + } + + const status = failedToResolve ? 'error' : 'success' + ctx.graphql.status = status + const name = 'vtex.render-server@1.x-testField' + + const payload = { + [status]: 1, + } + + // Legacy metrics + ;(global as any).metrics.batch(`graphql-metric-${name}`, failedToResolve ? undefined : ellapsed, payload) + + // New diagnostics metrics + if ((global as any).diagnosticsMetrics) { + const attributes = { + component: 'graphql', + field_name: name, + status, + } + + ;(global as any).diagnosticsMetrics.recordLatency(ellapsed, attributes) + ;(global as any).diagnosticsMetrics.incrementCounter('graphql_field_requests_total', 1, attributes) + } + + if (failedToResolve) { + throw result + } + + return result + } + + // Execute the wrapped resolver + const result = await wrappedResolver({}, {}, mockContext, {}) + + expect(result).toBe('test-result') + expect(mockContext.graphql.status).toBe('success') + + // Legacy metrics + const batchCall = mockMetricsAccumulator.batch.mock.calls[0] + expect(batchCall[0]).toBe('graphql-metric-vtex.render-server@1.x-testField') + expect(Array.isArray(batchCall[1])).toBe(true) // hrtime is an array + expect(batchCall[2]).toEqual({ success: 1 }) + + // Diagnostics metrics + const recordLatencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(Array.isArray(recordLatencyCall[0])).toBe(true) // hrtime is an array + expect(recordLatencyCall[1]).toEqual({ + component: 'graphql', + field_name: 'vtex.render-server@1.x-testField', + status: 'success', + }) + + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'graphql_field_requests_total', + 1, + { + component: 'graphql', + field_name: 'vtex.render-server@1.x-testField', + status: 'success', + } + ) + }) + + it('should record failed field resolution metrics', async () => { + const testError = new Error('Test error') + const mockResolver = jest.fn().mockRejectedValue(testError) + + // Simulate what the Metric directive does + const wrappedResolver = async (root: any, args: any, ctx: any, info: any) => { + let failedToResolve = false + let result: any = null + let ellapsed: [number, number] = [0, 0] + + try { + const start = process.hrtime() + result = await mockResolver(root, args, ctx, info) + ellapsed = process.hrtime(start) + } catch (error) { + result = error + failedToResolve = true + } + + const status = failedToResolve ? 
'error' : 'success' + ctx.graphql.status = status + const name = 'vtex.render-server@1.x-testField' + + const payload = { + [status]: 1, + } + + // Legacy metrics + ;(global as any).metrics.batch(`graphql-metric-${name}`, failedToResolve ? undefined : ellapsed, payload) + + // New diagnostics metrics + if ((global as any).diagnosticsMetrics) { + const attributes = { + component: 'graphql', + field_name: name, + status, + } + + ;(global as any).diagnosticsMetrics.recordLatency(ellapsed, attributes) + ;(global as any).diagnosticsMetrics.incrementCounter('graphql_field_requests_total', 1, attributes) + } + + if (failedToResolve) { + throw result + } + + return result + } + + // Execute the wrapped resolver and expect it to throw + await expect(wrappedResolver({}, {}, mockContext, {})).rejects.toThrow('Test error') + + expect(mockContext.graphql.status).toBe('error') + + // Legacy metrics (no latency on error) + expect(mockMetricsAccumulator.batch).toHaveBeenCalledWith( + 'graphql-metric-vtex.render-server@1.x-testField', + undefined, + { error: 1 } + ) + + // Diagnostics metrics (record latency even on error) + const recordLatencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(Array.isArray(recordLatencyCall[0])).toBe(true) // hrtime is an array + expect(recordLatencyCall[1]).toEqual({ + component: 'graphql', + field_name: 'vtex.render-server@1.x-testField', + status: 'error', + }) + + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'graphql_field_requests_total', + 1, + { + component: 'graphql', + field_name: 'vtex.render-server@1.x-testField', + status: 'error', + } + ) + }) + + it('should warn when DiagnosticsMetrics is not available', async () => { + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + delete (global as any).diagnosticsMetrics + + const mockResolver = jest.fn().mockResolvedValue('test-result') + + // Simulate what the Metric directive does (without diagnostics) + const wrappedResolver = async (root: any, args: any, ctx: any, info: any) => { + let failedToResolve = false + let result: any = null + let ellapsed: [number, number] = [0, 0] + + try { + const start = process.hrtime() + result = await mockResolver(root, args, ctx, info) + ellapsed = process.hrtime(start) + } catch (error) { + result = error + failedToResolve = true + } + + const status = failedToResolve ? 'error' : 'success' + ctx.graphql.status = status + const name = 'vtex.render-server@1.x-testField' + + const payload = { + [status]: 1, + } + + // Legacy metrics + ;(global as any).metrics.batch(`graphql-metric-${name}`, failedToResolve ? undefined : ellapsed, payload) + + // New diagnostics metrics + if ((global as any).diagnosticsMetrics) { + const attributes = { + component: 'graphql', + field_name: name, + status, + } + + ;(global as any).diagnosticsMetrics.recordLatency(ellapsed, attributes) + ;(global as any).diagnosticsMetrics.incrementCounter('graphql_field_requests_total', 1, attributes) + } else { + console.warn('DiagnosticsMetrics not available. GraphQL field metrics not reported.') + } + + if (failedToResolve) { + throw result + } + + return result + } + + // Execute the wrapped resolver + await wrappedResolver({}, {}, mockContext, {}) + + expect(consoleWarnSpy).toHaveBeenCalledWith('DiagnosticsMetrics not available. 
GraphQL field metrics not reported.') + expect(mockMetricsAccumulator.batch).toHaveBeenCalled() // Legacy still works + + consoleWarnSpy.mockRestore() + }) +}) diff --git a/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.ts b/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.ts index eb386e426..d4600cb90 100644 --- a/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.ts +++ b/src/service/worker/runtime/graphql/schema/schemaDirectives/Metric.ts @@ -1,5 +1,6 @@ import { defaultFieldResolver, GraphQLField } from 'graphql' import { SchemaDirectiveVisitor } from 'graphql-tools' +import { Attributes } from '@opentelemetry/api' import { APP } from '../../../../../..' import { GraphQLServiceContext } from '../../typings' @@ -26,14 +27,33 @@ export class Metric extends SchemaDirectiveVisitor { failedToResolve = true } - ctx.graphql.status = failedToResolve ? 'error' : 'success' + const status = failedToResolve ? 'error' : 'success' + ctx.graphql.status = status const payload = { - [ctx.graphql.status]: 1, + [status]: 1, } + // Legacy metrics (backward compatibility) metrics.batch(`graphql-metric-${name}`, failedToResolve ? undefined : ellapsed, payload) + // New diagnostics metrics with stable names and attributes + if (global.diagnosticsMetrics) { + const attributes: Attributes = { + component: 'graphql', + field_name: name, + status, + } + + // Record latency histogram (record all requests, not just successful ones) + global.diagnosticsMetrics.recordLatency(ellapsed, attributes) + + // Increment counter (status is an attribute, not in metric name) + global.diagnosticsMetrics.incrementCounter('graphql_field_requests_total', 1, attributes) + } else { + console.warn('DiagnosticsMetrics not available. GraphQL field metrics not reported.') + } + if (failedToResolve) { throw result } diff --git a/src/service/worker/runtime/http/middlewares/requestStats.test.ts b/src/service/worker/runtime/http/middlewares/requestStats.test.ts new file mode 100644 index 000000000..60f8a625e --- /dev/null +++ b/src/service/worker/runtime/http/middlewares/requestStats.test.ts @@ -0,0 +1,194 @@ +import { EventEmitter } from 'events' +import { DiagnosticsMetrics } from '../../../../../metrics/DiagnosticsMetrics' +import { incomingRequestStats, trackIncomingRequestStats } from './requestStats' + +describe('requestStats', () => { + let mockDiagnosticsMetrics: jest.Mocked + + beforeEach(() => { + // Create mock DiagnosticsMetrics instance + mockDiagnosticsMetrics = { + setGauge: jest.fn(), + incrementCounter: jest.fn(), + recordLatency: jest.fn(), + } as any + + // Set global.diagnosticsMetrics for the tests + global.diagnosticsMetrics = mockDiagnosticsMetrics + + // Clear stats before each test + incomingRequestStats.clear() + }) + + afterEach(() => { + jest.clearAllMocks() + delete (global as any).diagnosticsMetrics + }) + + describe('IncomingRequestStats', () => { + it('should track total requests', () => { + incomingRequestStats.total++ + incomingRequestStats.total++ + + const stats = incomingRequestStats.get() + expect(stats.total).toBe(2) + }) + + it('should track aborted requests', () => { + incomingRequestStats.aborted++ + incomingRequestStats.aborted++ + + const stats = incomingRequestStats.get() + expect(stats.aborted).toBe(2) + }) + + it('should track closed requests', () => { + incomingRequestStats.closed++ + + const stats = incomingRequestStats.get() + expect(stats.closed).toBe(1) + }) + + it('should clear all stats', () => { + incomingRequestStats.total = 5 + 
incomingRequestStats.aborted = 3 + incomingRequestStats.closed = 2 + + incomingRequestStats.clear() + + const stats = incomingRequestStats.get() + expect(stats).toEqual({ + total: 0, + aborted: 0, + closed: 0, + }) + }) + }) + + describe('trackIncomingRequestStats', () => { + let mockCtx: any + let mockRequest: EventEmitter + let mockNext: jest.Mock + + beforeEach(() => { + mockRequest = new EventEmitter() + mockNext = jest.fn().mockResolvedValue(undefined) + mockCtx = { + req: mockRequest, + status: 200, + vtex: { + route: { + id: 'test-route', + type: 'public', + }, + cancellation: { + cancelable: true, + source: { cancel: jest.fn() }, + cancelled: false, + }, + }, + } + }) + + it('should increment total requests counter and report to diagnostics metrics', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + expect(incomingRequestStats.get().total).toBe(1) + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_server_requests_total', + 1, + { + route_id: 'test-route', + route_type: 'public', + status_code: 200, + } + ) + }) + + it('should call next middleware', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + expect(mockNext).toHaveBeenCalledTimes(1) + }) + + it('should increment closed counter when request closes and report to diagnostics', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + mockRequest.emit('close') + + expect(incomingRequestStats.get().closed).toBe(1) + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_server_requests_closed_total', + 1, + { + route_id: 'test-route', + route_type: 'public', + status_code: 200, + } + ) + }) + + it('should increment aborted counter when request aborts and report to diagnostics', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + mockRequest.emit('aborted') + + expect(incomingRequestStats.get().aborted).toBe(1) + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_server_requests_aborted_total', + 1, + { + route_id: 'test-route', + route_type: 'public', + status_code: 200, + } + ) + }) + + it('should cancel request when aborted and cancellation is available', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + mockRequest.emit('aborted') + + expect(mockCtx.vtex.cancellation.source.cancel).toHaveBeenCalledWith('Request cancelled') + expect(mockCtx.vtex.cancellation.cancelled).toBe(true) + }) + + it('should handle multiple events correctly', async () => { + await trackIncomingRequestStats(mockCtx, mockNext) + + mockRequest.emit('close') + mockRequest.emit('aborted') + + expect(incomingRequestStats.get().total).toBe(1) + expect(incomingRequestStats.get().closed).toBe(1) + expect(incomingRequestStats.get().aborted).toBe(1) + }) + + it('should work without global.diagnosticsMetrics', async () => { + delete (global as any).diagnosticsMetrics + + // Should not throw + await expect(trackIncomingRequestStats(mockCtx, mockNext)).resolves.not.toThrow() + + expect(incomingRequestStats.get().total).toBe(1) + }) + + it('should handle request events without diagnostics metrics', async () => { + delete (global as any).diagnosticsMetrics + + await trackIncomingRequestStats(mockCtx, mockNext) + + // Should not throw when events are emitted + expect(() => mockRequest.emit('close')).not.toThrow() + expect(() => mockRequest.emit('aborted')).not.toThrow() + + expect(incomingRequestStats.get().closed).toBe(1) + expect(incomingRequestStats.get().aborted).toBe(1) + }) + }) +}) + + + + 
diff --git a/src/service/worker/runtime/http/middlewares/requestStats.ts b/src/service/worker/runtime/http/middlewares/requestStats.ts index 6510968c8..89153e141 100644 --- a/src/service/worker/runtime/http/middlewares/requestStats.ts +++ b/src/service/worker/runtime/http/middlewares/requestStats.ts @@ -25,8 +25,25 @@ class IncomingRequestStats { export const incomingRequestStats = new IncomingRequestStats() -const requestClosed = () => { +const requestClosed = < + T extends IOClients, + U extends RecorderState, + V extends ParamsContext +>(ctx: ServiceContext<T, U, V>) => () => { incomingRequestStats.closed++ + + // Report to diagnostics metrics (cumulative counter) + const { status: statusCode, vtex: { route: { id, type } } } = ctx + + if (global.diagnosticsMetrics) { + global.diagnosticsMetrics.incrementCounter('http_server_requests_closed_total', 1, { + route_id: id, + route_type: type, + status_code: statusCode, + }) + } else { + console.warn('DiagnosticsMetrics not available. Request closed metric not reported.') + } } const requestAborted = < T extends IOClients, @@ -35,6 +52,19 @@ >(ctx: ServiceContext<T, U, V>) => () => { incomingRequestStats.aborted++ + // Report to diagnostics metrics (cumulative counter) + const { status: statusCode, vtex: { route: { id, type } } } = ctx + + if (global.diagnosticsMetrics) { + global.diagnosticsMetrics.incrementCounter('http_server_requests_aborted_total', 1, { + route_id: id, + route_type: type, + status_code: statusCode, + }) + } else { + console.warn('DiagnosticsMetrics not available. Request aborted metric not reported.') + } + if (ctx.vtex.cancellation && ctx.vtex.cancellation.cancelable) { ctx.vtex.cancellation.source.cancel(cancelMessage) ctx.vtex.cancellation.cancelled = true @@ -46,8 +76,21 @@ export async function trackIncomingRequestStats < U extends RecorderState, V extends ParamsContext > (ctx: ServiceContext<T, U, V>, next: () => Promise<any>) { - ctx.req.on('close', requestClosed) + ctx.req.on('close', requestClosed(ctx)) ctx.req.on('aborted', requestAborted(ctx)) incomingRequestStats.total++ + + // Report total requests to diagnostics metrics (cumulative counter) + const { status: statusCode, vtex: { route: { id, type } } } = ctx + if (global.diagnosticsMetrics) { + global.diagnosticsMetrics.incrementCounter('http_server_requests_total', 1, { + route_id: id, + route_type: type, + status_code: statusCode, + }) + } else { + console.warn('DiagnosticsMetrics not available. Request total metric not reported.') + } + await next() }
diff --git a/src/service/worker/runtime/http/middlewares/timings.test.ts b/src/service/worker/runtime/http/middlewares/timings.test.ts new file mode 100644 index 000000000..1984af4c8 --- /dev/null +++ b/src/service/worker/runtime/http/middlewares/timings.test.ts @@ -0,0 +1,256 @@ +// Mock @vtex/diagnostics-nodejs before any imports +jest.mock('@vtex/diagnostics-nodejs', () => ({ + Types: {}, + getMetricClient: jest.fn(), + getLogger: jest.fn(), +})) + +jest.mock('../../../../../service/metrics/client', () => ({ + getMetricClient: jest.fn(), +})) + +import { DiagnosticsMetrics } from '../../../../../metrics/DiagnosticsMetrics' +import { timings } from './timings' + +describe('timings middleware', () => { + let mockDiagnosticsMetrics: jest.Mocked<DiagnosticsMetrics> + let mockNext: jest.Mock + let mockCtx: any + let consoleLogSpy: jest.SpyInstance + + beforeEach(() => { + // Mock DiagnosticsMetrics + mockDiagnosticsMetrics = { + recordLatency: jest.fn(), + incrementCounter: jest.fn(), + setGauge: jest.fn(), + } as any + + // Set up global + global.diagnosticsMetrics = mockDiagnosticsMetrics + + // Mock global.metrics for legacy support + ;(global as any).metrics = { + batch: jest.fn(), + } + + // Mock next function + mockNext = jest.fn().mockResolvedValue(undefined) + + // Mock console.log to avoid test output noise + consoleLogSpy = jest.spyOn(console, 'log').mockImplementation() + + // Mock context + mockCtx = { + status: 200, + method: 'GET', + path: '/test', + vtex: { + account: 'testaccount', + workspace: 'master', + production: true, + route: { + id: 'test-route', + type: 'public', + }, + }, + timings: { + total: [1, 500000000], // 1.5 seconds + }, + } + }) + + afterEach(() => { + jest.clearAllMocks() + consoleLogSpy.mockRestore() + delete (global as any).diagnosticsMetrics + delete (global as any).metrics + }) + + describe('successful requests', () => { + it('should record metrics for successful request', async () => { + await timings(mockCtx, mockNext) + + // Diagnostics metrics + expect(mockDiagnosticsMetrics.recordLatency).toHaveBeenCalledWith( + [1, 500000000], + expect.objectContaining({ + component: 'http-handler', + route_id: 'test-route', + route_type: 'public', + status_code: 200, + status: 'success', + }) + ) + + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_handler_requests_total', + 1, + expect.objectContaining({ + component: 'http-handler', + route_id: 'test-route', + route_type: 'public', + status_code: 200, + status: 'success', + }) + ) + }) + + it('should log timing and billing information', async () => { + await timings(mockCtx, mockNext) + + // Check console.log was called twice (timing log + billing log) + expect(consoleLogSpy).toHaveBeenCalledTimes(2) + + // Verify billing info structure + const billingCall = consoleLogSpy.mock.calls[1][0] + const billingInfo = JSON.parse(billingCall) + + expect(billingInfo).toMatchObject({ + __VTEX_IO_BILLING: 'true', + account: 'testaccount', + handler: 'test-route', + production: true, + routeType: 'public_route', + type: 'process-time', + workspace: 'master', + }) + expect(billingInfo.value).toBeGreaterThan(0) // millis + }) + + it('should maintain legacy metrics compatibility', async () => { + await timings(mockCtx, mockNext) + + // Verify legacy metrics.batch was called + expect((global as any).metrics.batch).toHaveBeenCalledWith( + 'http-handler-test-route', + [1, 500000000], + { success: 1 } + ) + }) + }) + + describe('error responses', () => { +
it('should record metrics for 4xx errors', async () => { + mockCtx.status = 404 + + await timings(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: '4xx', + }) + + // Counter with status as attribute + expect(mockDiagnosticsMetrics.incrementCounter).toHaveBeenCalledWith( + 'http_handler_requests_total', + 1, + expect.objectContaining({ + component: 'http-handler', + route_id: 'test-route', + route_type: 'public', + status_code: 404, + status: '4xx', + }) + ) + }) + + it('should record metrics for 5xx errors', async () => { + mockCtx.status = 500 + + await timings(mockCtx, mockNext) + + expect(mockDiagnosticsMetrics.recordLatency).toHaveBeenCalledWith( + expect.any(Array), + expect.objectContaining({ + status: 'error', + }) + ) + + // Verify legacy only batches successful responses (no hrtime for errors) + expect((global as any).metrics.batch).toHaveBeenCalledWith( + 'http-handler-test-route', + undefined, + { error: 1 } + ) + }) + }) + + describe('route types', () => { + it('should correctly identify private routes in billing info', async () => { + mockCtx.vtex.route.type = 'private' + + await timings(mockCtx, mockNext) + + const billingCall = consoleLogSpy.mock.calls[1][0] + const billingInfo = JSON.parse(billingCall) + + expect(billingInfo.routeType).toBe('private_route') + }) + }) + + describe('graceful degradation', () => { + it('should work without global.diagnosticsMetrics', async () => { + delete (global as any).diagnosticsMetrics + + const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation() + + await timings(mockCtx, mockNext) + + // Legacy metrics still work + expect((global as any).metrics.batch).toHaveBeenCalled() + + // Warning logged + expect(consoleWarnSpy).toHaveBeenCalledWith( + 'DiagnosticsMetrics not available. HTTP handler metrics not reported.' 
+ ) + + consoleWarnSpy.mockRestore() + }) + }) + + describe('different status codes', () => { + const testCases = [ + { status: 200, expected: 'success' }, + { status: 201, expected: 'success' }, + { status: 204, expected: 'success' }, + { status: 301, expected: '3xx' }, // statusLabel returns range-based labels + { status: 302, expected: '3xx' }, + { status: 400, expected: '4xx' }, + { status: 401, expected: '4xx' }, + { status: 403, expected: '4xx' }, + { status: 404, expected: '4xx' }, + { status: 500, expected: 'error' }, + { status: 502, expected: 'error' }, + { status: 503, expected: 'error' }, + ] + + testCases.forEach(({ status, expected }) => { + it(`should categorize status ${status} as ${expected}`, async () => { + mockCtx.status = status + + await timings(mockCtx, mockNext) + + const latencyCall = mockDiagnosticsMetrics.recordLatency.mock.calls[0] + expect(latencyCall[1]).toMatchObject({ + status: expected, + }) + }) + }) + }) + + describe('middleware execution', () => { + it('should call next() before recording metrics', async () => { + let nextCalled = false + mockNext.mockImplementation(async () => { + nextCalled = true + }) + + await timings(mockCtx, mockNext) + + expect(nextCalled).toBe(true) + expect(mockNext).toHaveBeenCalledTimes(1) + }) + }) +}) + diff --git a/src/service/worker/runtime/http/middlewares/timings.ts b/src/service/worker/runtime/http/middlewares/timings.ts index 6d000053d..98c1ed2f8 100644 --- a/src/service/worker/runtime/http/middlewares/timings.ts +++ b/src/service/worker/runtime/http/middlewares/timings.ts @@ -1,4 +1,5 @@ import chalk from 'chalk' +import { Attributes } from '@opentelemetry/api' import { IOClients } from '../../../../../clients/IOClients' import { APP, LINKED, PID } from '../../../../../constants' @@ -63,12 +64,33 @@ export async function timings < // Errors will be caught by the next middleware so we don't have to catch. await next() - const { status: statusCode, vtex: { route: { id } }, timings: {total}, vtex } = ctx + const { status: statusCode, vtex: { route: { id, type } }, timings: {total}, vtex } = ctx const totalMillis = hrToMillis(total) console.log(log(ctx, totalMillis)) console.log(logBillingInfo(vtex, totalMillis)) const status = statusLabel(statusCode) + + // Legacy metrics (backward compatibility) // Only batch successful responses so metrics don't consider errors metrics.batch(`http-handler-${id}`, status === 'success' ? total : undefined, { [status]: 1 }) + + // New diagnostics metrics with stable names and attributes + if (global.diagnosticsMetrics) { + const attributes: Attributes = { + component: 'http-handler', + route_id: id, + route_type: type, + status_code: statusCode, + status, + } + + // Record latency histogram (record all requests, not just successful ones) + global.diagnosticsMetrics.recordLatency(total, attributes) + + // Increment counter (status is an attribute, not in metric name) + global.diagnosticsMetrics.incrementCounter('http_handler_requests_total', 1, attributes) + } else { + console.warn('DiagnosticsMetrics not available. 
HTTP handler metrics not reported.') + } } diff --git a/src/service/worker/runtime/statusTrack.ts b/src/service/worker/runtime/statusTrack.ts index 60ea53752..34e1caf2b 100644 --- a/src/service/worker/runtime/statusTrack.ts +++ b/src/service/worker/runtime/statusTrack.ts @@ -1,6 +1,7 @@ import cluster from 'cluster' import { ACCOUNT, APP, LINKED, PRODUCTION, WORKSPACE } from '../../../constants' +import { HttpAgentSingleton } from '../../../HttpClient/middlewares/request/HttpAgentSingleton' import { ServiceContext } from './typings' export type StatusTrack = () => EnvMetric[] @@ -33,6 +34,10 @@ export const statusTrackHandler = async (ctx: ServiceContext) => { } export const trackStatus = () => { + // Update diagnostics metrics (gauges for HTTP agent stats) + HttpAgentSingleton.updateHttpAgentMetrics() + + // Legacy status tracking (console.log export) global.metrics.statusTrack().forEach(status => { logStatus(status) })
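The statusTrack.ts change only adds the call site; `HttpAgentSingleton.updateHttpAgentMetrics()` itself is not part of this diff. A rough, illustrative sketch of what such a method could do, assuming it reads socket counts off the shared Node `http.Agent` and reports them through `setGauge` — the gauge names, helper, and attribute shape below are hypothetical, not taken from the actual implementation:

```ts
import { Agent } from 'http'

// Hypothetical sketch only: the real method lives on HttpAgentSingleton in
// src/HttpClient/middlewares/request/HttpAgentSingleton and may differ.
const countSockets = (byHost: NodeJS.ReadOnlyDict<unknown[]> = {}) =>
  Object.values(byHost).reduce((sum, list) => sum + (list?.length ?? 0), 0)

export const updateHttpAgentMetrics = (agent: Agent) => {
  if (!global.diagnosticsMetrics) {
    // Mirrors the graceful degradation used by the middlewares in this diff.
    return
  }
  // Point-in-time values, so gauges (not counters) are the right instrument.
  global.diagnosticsMetrics.setGauge('http_agent_active_sockets', countSockets(agent.sockets), {})
  global.diagnosticsMetrics.setGauge('http_agent_free_sockets', countSockets(agent.freeSockets), {})
  global.diagnosticsMetrics.setGauge('http_agent_pending_requests', countSockets(agent.requests), {})
}
```

Gauges fit here because agent stats are snapshots; each `trackStatus()` tick should overwrite the previous reading rather than accumulate it.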