45 changes: 41 additions & 4 deletions examples/frontdesk/frontdesk_agent.py
@@ -25,6 +25,17 @@
function_tool,
inference,
)
from livekit.agents.evals import (
JudgeGroup,
accuracy_judge,
coherence_judge,
conciseness_judge,
handoff_judge,
relevancy_judge,
safety_judge,
task_completion_judge,
tool_use_judge,
)
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel

@@ -34,6 +45,7 @@
@dataclass
class Userdata:
cal: Calendar
appointment_booked: bool = False


logger = logging.getLogger("front-desk")
@@ -96,6 +108,8 @@ async def schedule_appointment(
# Tell the LLM this slot isn't available anymore
raise ToolError("This slot isn't available anymore") from None

ctx.userdata.appointment_booked = True

local = slot.start_time.astimezone(self.tz)
return f"The appointment was successfully scheduled for {local.strftime('%A, %B %d, %Y at %H:%M %Z')}."

@@ -160,12 +174,35 @@ async def list_available_slots(


async def on_session_end(ctx: JobContext) -> None:
report = ctx.make_session_report()

# Skip evaluation for very short conversations
chat = report.chat_history.copy(exclude_function_call=True, exclude_instructions=True)
if len(chat.items) < 3:
return

judges = JudgeGroup(
llm="openai/gpt-4o-mini",
judges=[
task_completion_judge(),
accuracy_judge(),
tool_use_judge(),
handoff_judge(),
safety_judge(),
relevancy_judge(),
coherence_judge(),
conciseness_judge(),
],
)

await judges.evaluate(report.chat_history)

if ctx.primary_session.userdata.appointment_booked:
ctx.tagger.success()
else:
ctx.tagger.fail(reason="Appointment was not booked")

logger.info("session tags: %s", ctx.tagger.tags)


@server.rtc_session(on_session_end=on_session_end)
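The hook above discards the `EvaluationResult` returned by `judges.evaluate()` and tags the session purely on the `appointment_booked` flag. For reference, here is a minimal sketch of a variant that also folds the judgment results into the tagging decision; it relies only on the `EvaluationResult` API introduced in `evaluation.py` below, and the combination policy is illustrative, not part of this diff.

```python
import logging

from livekit.agents import JobContext
from livekit.agents.evals import JudgeGroup, accuracy_judge, task_completion_judge

logger = logging.getLogger("front-desk")


async def on_session_end(ctx: JobContext) -> None:
    report = ctx.make_session_report()

    judges = JudgeGroup(
        llm="openai/gpt-4o-mini",
        judges=[task_completion_judge(), accuracy_judge()],
    )

    # evaluate() auto-tags the session and also returns the aggregate result
    result = await judges.evaluate(report.chat_history)

    booked = ctx.primary_session.userdata.appointment_booked
    if booked and result.none_failed:
        ctx.tagger.success()
    else:
        failed = [name for name, j in result.judgments.items() if j.failed]
        reason = f"failed judges: {failed}" if failed else "appointment was not booked"
        ctx.tagger.fail(reason=reason)

    logger.info("score=%.2f tags=%s", result.score, ctx.tagger.tags)
```

Here `none_failed` is used rather than `all_passed` so that "maybe" verdicts do not flip an otherwise successful session to failure.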
4 changes: 4 additions & 0 deletions livekit-agents/livekit/agents/__init__.py
@@ -37,6 +37,8 @@
get_job_context,
)
from .llm.chat_context import (
AgentConfigUpdate,
AgentHandoff,
ChatContent,
ChatContext,
ChatItem,
@@ -149,6 +151,8 @@ def __getattr__(name: str) -> typing.Any:
"FunctionToolsExecutedEvent",
"FunctionCall",
"FunctionCallOutput",
"AgentConfigUpdate",
"AgentHandoff",
"StopResponse",
"ToolError",
"RunContext",
38 changes: 38 additions & 0 deletions livekit-agents/livekit/agents/evals/__init__.py
@@ -0,0 +1,38 @@
from .evaluation import (
EvaluationResult,
Evaluator,
JudgeGroup,
)
from .judge import (
Judge,
JudgmentResult,
Verdict,
accuracy_judge,
coherence_judge,
conciseness_judge,
handoff_judge,
relevancy_judge,
safety_judge,
task_completion_judge,
tool_use_judge,
)

__all__ = [
# Evaluation
"EvaluationResult",
"Evaluator",
"JudgeGroup",
# Core types
"Judge",
"JudgmentResult",
"Verdict",
# Built-in judges
"accuracy_judge",
"coherence_judge",
"conciseness_judge",
"handoff_judge",
"relevancy_judge",
"safety_judge",
"task_completion_judge",
"tool_use_judge",
]
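Because everything above is re-exported from `livekit.agents.evals`, the judges can also be run outside a live session, for example against a hand-built transcript in an offline script. A rough sketch follows; the `ChatContext.empty()` / `add_message()` calls are an assumption about the existing `ChatContext` API and are not part of this diff, so adjust them to your installed version.

```python
import asyncio

from livekit.agents.evals import JudgeGroup, coherence_judge, relevancy_judge
from livekit.agents.llm import ChatContext


async def main() -> None:
    # assumed ChatContext construction API; adjust to your livekit-agents version
    chat_ctx = ChatContext.empty()
    chat_ctx.add_message(role="user", content="Hi, I'd like to book a haircut tomorrow.")
    chat_ctx.add_message(
        role="assistant", content="Sure! I have 10:00 and 14:30 available tomorrow."
    )

    judges = JudgeGroup(
        llm="openai/gpt-4o-mini",  # model string routed through the LiveKit inference gateway
        judges=[relevancy_judge(), coherence_judge()],
    )

    result = await judges.evaluate(chat_ctx)
    for name, judgment in result.judgments.items():
        verdict = "passed" if judgment.passed else "failed" if judgment.failed else "uncertain"
        print(f"{name}: {verdict}")
    print(f"aggregate score: {result.score:.2f}")


if __name__ == "__main__":
    asyncio.run(main())
```

When there is no active job context, `JudgeGroup.evaluate()` simply skips the auto-tagging step (see `evaluation.py` below) and only returns the `EvaluationResult`.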
165 changes: 165 additions & 0 deletions livekit-agents/livekit/agents/evals/evaluation.py
@@ -0,0 +1,165 @@
from __future__ import annotations

import asyncio
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Protocol

from ..llm import LLM, ChatContext
from .judge import JudgmentResult

if TYPE_CHECKING:
from ..inference import LLMModels


class Evaluator(Protocol):
"""Protocol for any object that can evaluate a conversation."""

@property
def name(self) -> str:
"""Name identifying this evaluator."""
...

async def evaluate(
self,
*,
chat_ctx: ChatContext,
reference: ChatContext | None = None,
llm: LLM | None = None,
) -> JudgmentResult: ...


@dataclass
class EvaluationResult:
"""Result of evaluating a conversation with a group of judges."""

judgments: dict[str, JudgmentResult] = field(default_factory=dict)
"""Individual judgment results keyed by judge name."""

@property
def score(self) -> float:
"""Score from 0.0 to 1.0. Pass=1, maybe=0.5, fail=0."""
if not self.judgments:
return 0.0
total = 0.0
for j in self.judgments.values():
if j.passed:
total += 1.0
elif j.uncertain:
total += 0.5
return total / len(self.judgments)

@property
def all_passed(self) -> bool:
"""True if all judgments passed. Maybes count as not passed."""
return all(j.passed for j in self.judgments.values())

@property
def any_passed(self) -> bool:
"""True if at least one judgment passed."""
return any(j.passed for j in self.judgments.values())

@property
def majority_passed(self) -> bool:
"""True if more than half of the judgments passed."""
if not self.judgments:
return True
passed = sum(1 for j in self.judgments.values() if j.passed)
return passed > len(self.judgments) / 2

@property
def none_failed(self) -> bool:
"""True if no judgments explicitly failed. Maybes are allowed."""
return not any(j.failed for j in self.judgments.values())

class JudgeGroup:
"""A group of judges that evaluate conversations together.

Automatically tags the session with judgment results when called within a job context.

Example:
```python
async def on_session_end(ctx: JobContext) -> None:
judges = JudgeGroup(
llm="openai/gpt-4o-mini",
judges=[
task_completion_judge(),
accuracy_judge(),
],
)

report = ctx.make_session_report()
result = await judges.evaluate(report.chat_history)
# Results are automatically tagged to the session
```
"""

def __init__(
self,
*,
llm: LLM | LLMModels | str,
judges: list[Evaluator] | None = None,
) -> None:
"""Initialize a JudgeGroup.

Args:
llm: The LLM to use for evaluation. Can be an LLM instance or a model
string like "openai/gpt-4o-mini" (uses LiveKit inference gateway).
judges: The judges to run during evaluation.
"""
if isinstance(llm, str):
from ..inference import LLM as InferenceLLM

self._llm: LLM = InferenceLLM(llm)
else:
self._llm = llm

self._judges = judges or []

@property
def llm(self) -> LLM:
"""The LLM used for evaluation."""
return self._llm

@property
def judges(self) -> list[Evaluator]:
"""The judges to run during evaluation."""
return self._judges

async def evaluate(
self,
chat_ctx: ChatContext,
*,
reference: ChatContext | None = None,
) -> EvaluationResult:
"""Evaluate a conversation with all judges.

Automatically tags the session with results when called within a job context.

Args:
chat_ctx: The conversation to evaluate.
reference: Optional reference conversation for comparison.

Returns:
EvaluationResult containing all judgment results.
"""
from ..job import get_job_context

# Run all judges concurrently
async def run_judge(judge: Evaluator) -> tuple[str, JudgmentResult]:
result = await judge.evaluate(
chat_ctx=chat_ctx,
reference=reference,
llm=self._llm,
)
return judge.name, result

results = await asyncio.gather(*[run_judge(j) for j in self._judges])
evaluation_result = EvaluationResult(judgments=dict(results))

# Auto-tag if running within a job context
try:
ctx = get_job_context()
ctx.tagger._evaluation(evaluation_result)
except RuntimeError:
pass # Not in a job context, skip tagging

return evaluation_result
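`JudgeGroup` accepts anything satisfying the `Evaluator` protocol, so custom evaluators and wrappers compose with the built-in judges. Below is a hedged sketch of a retry wrapper around an arbitrary judge; the class name and retry policy are illustrative and not part of this PR.

```python
from __future__ import annotations

import asyncio

from livekit.agents.evals import Evaluator, JudgmentResult
from livekit.agents.llm import LLM, ChatContext


class RetryingEvaluator:
    """Wraps another Evaluator and retries transient failures (illustrative only)."""

    def __init__(self, inner: Evaluator, *, attempts: int = 3, delay: float = 1.0) -> None:
        self._inner = inner
        self._attempts = attempts
        self._delay = delay

    @property
    def name(self) -> str:
        # reuse the wrapped judge's name so results stay keyed consistently
        return self._inner.name

    async def evaluate(
        self,
        *,
        chat_ctx: ChatContext,
        reference: ChatContext | None = None,
        llm: LLM | None = None,
    ) -> JudgmentResult:
        for attempt in range(1, self._attempts + 1):
            try:
                return await self._inner.evaluate(chat_ctx=chat_ctx, reference=reference, llm=llm)
            except Exception:
                if attempt == self._attempts:
                    raise
                await asyncio.sleep(self._delay)
        raise RuntimeError("unreachable")
```

It can then be dropped into a group like any built-in judge, e.g. `JudgeGroup(llm="openai/gpt-4o-mini", judges=[RetryingEvaluator(accuracy_judge())])`.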