Skip to content

Commit 05e6e69

Browse files
authored
ci(llmobs): add evaluation metric assertion helper using node:assert (#7227)
* add evaluation metrics assertion helper * remove .only
1 parent 64b7e4a commit 05e6e69

11 files changed

Lines changed: 165 additions & 51 deletions

File tree

packages/dd-trace/test/llmobs/plugins/ai/index.spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ describe('Plugin', () => {
2626
OPENAI_API_KEY: '<not-a-real-key>'
2727
})
2828

29-
const getEvents = useLlmObs({ plugin: 'ai' })
29+
const { getEvents } = useLlmObs({ plugin: 'ai' })
3030

3131
withVersions('ai', 'ai', range, (version, _, realVersion) => {
3232
let ai

packages/dd-trace/test/llmobs/plugins/anthropic/index.spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ describe('Plugin', () => {
4242
ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY || '<not-a-real-key>'
4343
})
4444

45-
const getEvents = useLlmObs({ plugin: 'anthropic' })
45+
const { getEvents } = useLlmObs({ plugin: 'anthropic' })
4646

4747
withVersions('anthropic', '@anthropic-ai/sdk', (version, moduleName, realVersion) => {
4848
let client

packages/dd-trace/test/llmobs/plugins/aws-sdk/bedrockruntime.spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ describe('Plugin', () => {
2222
AWS_ACCESS_KEY_ID: '00000000000000000000'
2323
})
2424

25-
const getEvents = useLlmObs({ plugin: 'aws-sdk' })
25+
const { getEvents } = useLlmObs({ plugin: 'aws-sdk' })
2626

2727
withVersions('aws-sdk', ['@aws-sdk/smithy-client', 'aws-sdk'], '>=3', (version, moduleName) => {
2828
let AWS

packages/dd-trace/test/llmobs/plugins/google-cloud-vertexai/index.spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ describe('integrations', () => {
7575
}
7676

7777
describe('vertexai', () => {
78-
const getEvents = useLlmObs({ plugin: 'google-cloud-vertexai' })
78+
const { getEvents } = useLlmObs({ plugin: 'google-cloud-vertexai' })
7979

8080
withVersions('google-cloud-vertexai', '@google-cloud/vertexai', '>=1', version => {
8181
before(() => {

packages/dd-trace/test/llmobs/plugins/google-genai/index.spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ const {
1212
} = require('../../util')
1313

1414
describe('Plugin', () => {
15-
const getEvents = useLlmObs({ plugin: 'google-genai' })
15+
const { getEvents } = useLlmObs({ plugin: 'google-genai' })
1616

1717
withVersions('google-genai', '@google/genai', (version) => {
1818
let client

packages/dd-trace/test/llmobs/plugins/langchain/index.spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ describe('integrations', () => {
7474
}
7575

7676
describe('langchain', () => {
77-
const getEvents = useLlmObs({ plugin: 'langchain' })
77+
const { getEvents } = useLlmObs({ plugin: 'langchain' })
7878

7979
before(async () => {
8080
iastFilter.isDdTrace = file => {

packages/dd-trace/test/llmobs/plugins/openai/openaiv3.spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ describe('integrations', () => {
1616
let openai
1717

1818
describe('openai', () => {
19-
const getEvents = useLlmObs({ plugin: 'openai', closeOptions: { wipe: true } })
19+
const { getEvents } = useLlmObs({ plugin: 'openai', closeOptions: { wipe: true } })
2020

2121
withVersions('openai', 'openai', '<4', version => {
2222
const moduleRequirePath = `../../../../../../versions/openai@${version}`

packages/dd-trace/test/llmobs/plugins/openai/openaiv4.spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ describe('integrations', () => {
2020
let deepseekOpenai
2121

2222
describe('openai', () => {
23-
const getEvents = useLlmObs({ plugin: 'openai', closeOptions: { wipe: true } })
23+
const { getEvents } = useLlmObs({ plugin: 'openai', closeOptions: { wipe: true } })
2424

2525
withVersions('openai', 'openai', '>=4', version => {
2626
const moduleRequirePath = `../../../../../../versions/openai@${version}`

packages/dd-trace/test/llmobs/sdk/integration.spec.js

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ const { after, afterEach, before, beforeEach, describe, it } = require('mocha')
66
const sinon = require('sinon')
77

88
const agent = require('../../plugins/agent')
9-
const { useLlmObs, assertLlmObsSpanEvent } = require('../util')
9+
const { useLlmObs, assertLlmObsSpanEvent, assertLlmObsEvaluationMetric } = require('../util')
1010
function getTag (llmobsSpan, tagName) {
1111
const tag = llmobsSpan.tags.find(tag => tag.split(':')[0] === tagName)
1212
return tag?.split(':')[1]
@@ -16,7 +16,7 @@ describe('end to end sdk integration tests', () => {
1616
let tracer
1717
let llmobs
1818

19-
const getEvents = useLlmObs()
19+
const { getEvents, getEvaluationMetrics } = useLlmObs()
2020

2121
before(() => {
2222
tracer = require('../../../../dd-trace')
@@ -120,38 +120,38 @@ describe('end to end sdk integration tests', () => {
120120
Date.now.restore()
121121
})
122122

123-
// TODO(sabrenner): follow-up on re-enabling this test in a different PR
124-
it.skip('submits evaluations', () => {
123+
it('submits evaluations', async () => {
125124
llmobs.trace({ kind: 'agent', name: 'myAgent' }, () => {
126125
llmobs.annotate({ inputData: 'hello', outputData: 'world' })
127126
const spanCtx = llmobs.exportSpan()
128127
llmobs.submitEvaluation(spanCtx, {
129128
label: 'foo',
130129
metricType: 'categorical',
131-
value: 'bar'
130+
value: 'bar',
131+
tags: {
132+
foo: 'bar'
133+
}
132134
})
133135
})
134136

135-
// const { spans, llmobsSpans, evaluationMetrics } = run(payloadGenerator)
136-
// assert.strictEqual(spans.length, 1)
137-
// assert.strictEqual(llmobsSpans.length, 1)
138-
// assert.strictEqual(evaluationMetrics.length, 1)
139-
140-
// // check eval metrics content
141-
// const expected = [
142-
// {
143-
// trace_id: spans[0].context().toTraceId(true),
144-
// span_id: spans[0].context().toSpanId(),
145-
// label: 'foo',
146-
// metric_type: 'categorical',
147-
// categorical_value: 'bar',
148-
// ml_app: 'test',
149-
// timestamp_ms: 1234567890,
150-
// tags: [`ddtrace.version:${tracerVersion}`, 'ml_app:test']
151-
// }
152-
// ]
153-
154-
// check(expected, evaluationMetrics)
137+
const { apmSpans, llmobsSpans } = await getEvents()
138+
const llmobsEvaluationMetrics = await getEvaluationMetrics()
139+
140+
assert.equal(apmSpans.length, 1)
141+
assert.equal(llmobsSpans.length, 1)
142+
assert.equal(llmobsEvaluationMetrics.length, 1)
143+
144+
assertLlmObsEvaluationMetric(llmobsEvaluationMetrics[0], {
145+
traceId: llmobsSpans[0].trace_id,
146+
spanId: llmobsSpans[0].span_id,
147+
label: 'foo',
148+
metricType: 'categorical',
149+
mlApp: 'test',
150+
value: 'bar',
151+
tags: {
152+
foo: 'bar'
153+
}
154+
})
155155
})
156156
})
157157

packages/dd-trace/test/llmobs/util.js

Lines changed: 109 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,19 @@ const MOCK_NOT_NULLISH = Symbol('not-nullish')
3737
* }} ExpectedLLMObsSpanEvent
3838
*/
3939

40+
/**
41+
* @typedef {{
42+
* label: string,
43+
* traceId: string,
44+
* spanId: string,
45+
* metricType: 'categorical' | 'score',
46+
* mlApp: string,
47+
* timestamp?: number,
48+
* value: string | number,
49+
* tags?: { [key: string]: string },
50+
* }} ExpectedLLMObsEvaluationMetrics
51+
*/
52+
4053
/**
4154
*
4255
* @param {object} actual
@@ -202,6 +215,11 @@ function assertLlmObsSpanEvent (actual, expected) {
202215
const expectedTags = expectedLLMObsTags({ span, tags, error, sessionId })
203216
const sortedExpectedTags = [...expectedTags.sort()]
204217
const sortedActualTags = [...actualTags.sort()]
218+
if (sortedExpectedTags.length !== sortedActualTags.length) {
219+
assert.fail(
220+
`tags have different length than expected (${sortedExpectedTags.length} !== ${sortedActualTags.length})`
221+
)
222+
}
205223
for (let i = 0; i < sortedExpectedTags.length; i++) {
206224
assert.equal(
207225
sortedActualTags[i],
@@ -251,6 +269,66 @@ function assertLlmObsSpanEvent (actual, expected) {
251269
assert.deepStrictEqual(actual, expectedSpanEvent)
252270
}
253271

272+
/**
 * Asserts that the actual LLMObs evaluation metric matches the evaluation metric created from the expected fields.
 *
 * Dynamic fields (`timestamp_ms`, `span_id`, `trace_id`) are checked through `assertWithMockValues`, so they
 * can be asserted either with concrete values or with mock placeholders. `traceId` and `spanId` default to
 * `MOCK_STRING` and `timestamp` defaults to `MOCK_NUMBER` when omitted from `expected`.
 *
 * Tags are compared order-insensitively against `ddtrace.version:<tracerVersion>`, `ml_app:<mlApp>` and one
 * `key:value` entry per property of `expected.tags`.
 *
 * The `actual` object is only read; neither it nor its `tags` array is modified.
 *
 * @param {object} actual evaluation metric as received in the evaluation metrics payload
 * @param {ExpectedLLMObsEvaluationMetrics} expected
 * @throws {assert.AssertionError} when any field does not match
 */
function assertLlmObsEvaluationMetric (actual, expected) {
  const {
    label,
    traceId = MOCK_STRING,
    spanId = MOCK_STRING,
    metricType,
    mlApp,
    timestamp = MOCK_NUMBER,
    value,
    tags
  } = expected

  // Split the dynamic fields off into locals instead of deleting them from the caller's object.
  const {
    tags: actualTags,
    timestamp_ms: actualTimestamp,
    span_id: actualSpanId,
    trace_id: actualTraceId,
    ...actualStaticFields
  } = actual

  // These fields may be mock placeholders, so they cannot be part of the strict deep comparison below.
  assertWithMockValues(actualTimestamp, timestamp, 'timestamp_ms')
  assertWithMockValues(actualSpanId, spanId, 'span_id')
  assertWithMockValues(actualTraceId, traceId, 'trace_id')

  // Fail with an assertion (not a TypeError) when the metric carries no usable tags.
  assert.ok(Array.isArray(actualTags), `tags should be an array but got ${util.inspect(actualTags)}`)

  const expectedTags = [
    `ddtrace.version:${tracerVersion}`,
    `ml_app:${mlApp}`,
    ...Object.entries(tags ?? {}).map(([key, tagValue]) => `${key}:${tagValue}`),
  ]
  // Sort copies so the comparison is order-insensitive without reordering the caller's array.
  const sortedExpectedTags = [...expectedTags].sort()
  const sortedActualTags = [...actualTags].sort()
  if (sortedExpectedTags.length !== sortedActualTags.length) {
    assert.fail(
      `Tags have different length than expected (${sortedExpectedTags.length} !== ${sortedActualTags.length}).
      Diff: ${util.inspect(sortedExpectedTags)} !== ${util.inspect(sortedActualTags)}.`
    )
  }
  for (let i = 0; i < sortedExpectedTags.length; i++) {
    assert.equal(
      sortedActualTags[i],
      sortedExpectedTags[i],
      `tags[${i}] does not match expected (${sortedExpectedTags[i]} !== ${sortedActualTags[i]})`
    )
  }

  const expectedStaticFields = {
    label,
    metric_type: metricType,
    ml_app: mlApp,
    // The value key depends on the metric type, e.g. `categorical_value` or `score_value`.
    [`${metricType}_value`]: value,
  }

  assert.deepStrictEqual(actualStaticFields, expectedStaticFields)
}
331+
254332
function expectedLLMObsTags ({
255333
span,
256334
error,
@@ -294,7 +372,10 @@ function fromBuffer (spanProperty, isNumber = false) {
294372
* @param {string} options.plugin
295373
* @param {object} options.tracerConfigOptions
296374
* @param {object} options.closeOptions
297-
* @returns {function(): Promise<{ apmSpans: Array, llmobsSpans: Array }>}
375+
* @returns {{
376+
* getEvents: () => Promise<{ apmSpans: Array<object>, llmobsSpans: Array<object> }>,
377+
* getEvaluationMetrics: () => Array<object>
378+
* }}
298379
*/
299380
function useLlmObs ({
300381
plugin,
@@ -332,24 +413,33 @@ function useLlmObs ({
332413
return agent.close({ ritmReset: false, ...closeOptions })
333414
})
334415

335-
return async function (numLlmObsSpans = 1) {
336-
// get apm spans from the agent
337-
const apmSpans = await apmTracesPromise
338-
resetTracesPromises()
339-
340-
// get llmobs span events requests from the agent
341-
// because llmobs process spans on span finish and submits periodically,
342-
// we need to aggregate all of the span events
343-
// tests should know how many spans they expect to see, otherwise tests will timeout
344-
const llmobsSpans = []
345-
346-
while (llmobsSpans.length < numLlmObsSpans) {
347-
await new Promise(resolve => setImmediate(resolve))
348-
const llmobsSpanEventsRequests = agent.getLlmObsSpanEventsRequests(true)
349-
llmobsSpans.push(...getLlmObsSpansFromRequests(llmobsSpanEventsRequests))
416+
return {
417+
getEvents: async function (numLlmObsSpans = 1) {
418+
// get apm spans from the agent
419+
const apmSpans = await apmTracesPromise
420+
resetTracesPromises()
421+
422+
// get llmobs span events requests from the agent
423+
// because llmobs process spans on span finish and submits periodically,
424+
// we need to aggregate all of the span events
425+
// tests should know how many spans they expect to see, otherwise tests will timeout
426+
const llmobsSpans = []
427+
428+
while (llmobsSpans.length < numLlmObsSpans) {
429+
await new Promise(resolve => setImmediate(resolve))
430+
const llmobsSpanEventsRequests = agent.getLlmObsSpanEventsRequests(true)
431+
llmobsSpans.push(...getLlmObsSpansFromRequests(llmobsSpanEventsRequests))
432+
}
433+
434+
return { apmSpans, llmobsSpans: llmobsSpans.sort((a, b) => a.start_ns - b.start_ns) }
435+
},
436+
437+
getEvaluationMetrics: function () {
438+
const evaluationMetricsRequests = agent.getLlmObsEvaluationMetricsRequests(true)
439+
return evaluationMetricsRequests
440+
.flatMap(request => request.data.attributes.metrics)
441+
.sort((a, b) => a.timestamp_ms - b.timestamp_ms)
350442
}
351-
352-
return { apmSpans, llmobsSpans: llmobsSpans.sort((a, b) => a.start_ns - b.start_ns) }
353443
}
354444
}
355445

@@ -405,6 +495,7 @@ function assertPromptTracking (
405495
}
406496

407497
module.exports = {
498+
assertLlmObsEvaluationMetric,
408499
assertLlmObsSpanEvent,
409500
assertPromptTracking,
410501
useLlmObs,

0 commit comments

Comments
 (0)