Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
ce60b71
add AMD interface
chenghao-mou Feb 5, 2026
26d3aaa
add amd event emitter
chenghao-mou Feb 9, 2026
5672b37
update event emitter
chenghao-mou Feb 9, 2026
7e67624
add amd hold
chenghao-mou Feb 19, 2026
1786e0c
refactor amd result and example
chenghao-mou Feb 20, 2026
1131225
Merge branch 'main' into feat/amd
chenghao-mou Feb 20, 2026
b69884e
minor fixes
chenghao-mou Feb 20, 2026
44d845f
clean up example
chenghao-mou Feb 20, 2026
a3222bd
clean up and refactoring
chenghao-mou Mar 3, 2026
404bc2b
fix example
chenghao-mou Mar 3, 2026
3be962f
disable AMD after first use
chenghao-mou Mar 3, 2026
8517423
Merge branch 'main' into feat/amd
chenghao-mou Mar 15, 2026
0ee8791
refactoring
chenghao-mou Mar 15, 2026
1b78498
use function call for prediction
chenghao-mou Mar 15, 2026
9dc196c
exclude silence from speech duration calculation
chenghao-mou Mar 15, 2026
56c0b38
minor refactoring
chenghao-mou Mar 16, 2026
9baf7df
more fixes
chenghao-mou Mar 16, 2026
865737a
Merge branch 'main' into feat/amd
chenghao-mou Apr 2, 2026
3c845c3
fix type issues
chenghao-mou Apr 2, 2026
348e1a3
move amd out of session
chenghao-mou Apr 2, 2026
25bcf65
use llm prediction when timing out
chenghao-mou Apr 2, 2026
0feb3e7
refactor based on feedback
chenghao-mou Apr 2, 2026
46eca3c
reformat
chenghao-mou Apr 2, 2026
772b731
add docstring
chenghao-mou Apr 2, 2026
0b4bd23
update to use str enum
chenghao-mou Apr 2, 2026
8790091
use string match in example instead
chenghao-mou Apr 2, 2026
a7925e2
rename amd -> machine detection/detector
chenghao-mou Apr 3, 2026
382c9b4
address comments
chenghao-mou Apr 3, 2026
efa2286
clean up autocomplete error
chenghao-mou Apr 3, 2026
265bdf1
rename to amd
chenghao-mou Apr 3, 2026
91dadc0
address comments
chenghao-mou Apr 3, 2026
4cfcae9
rename
chenghao-mou Apr 3, 2026
6751e9a
address more comments
chenghao-mou Apr 3, 2026
8bed855
fix type issues
chenghao-mou Apr 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
87 changes: 87 additions & 0 deletions examples/telephony/amd.py
Comment thread
chenghao-mou marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import logging

from dotenv import load_dotenv

from livekit.agents import (
AMD,
Agent,
AgentServer,
AgentSession,
JobContext,
JobProcess,
cli,
inference,
)
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel

logger = logging.getLogger("basic-agent")

load_dotenv("../agents/.env")


class MyAgent(Agent):
    """Outbound-calling agent for the AMD example.

    The instructions prime the LLM for an outbound call where the callee may
    turn out to be an answering machine, an IVR menu, or a real person.
    """

    def __init__(self) -> None:
        prompt = (
            "You are reaching out to a customer with a phone call. "
            "You are calling to see if they are home. "
            "You might encounter an answering machine with a DTMF menu or IVR system. "
            "If you do, you will try to leave a message to ask them to call back."
        )
        super().__init__(instructions=prompt)


server = AgentServer()


def prewarm(proc: JobProcess):
    """Load the Silero VAD model once per worker process.

    The loaded model is cached in ``proc.userdata`` so each job can reuse it
    instead of paying the load cost per session.
    """
    vad_model = silero.VAD.load()
    proc.userdata["vad"] = vad_model


server.setup_fnc = prewarm


@server.rtc_session()
async def entrypoint(ctx: JobContext):
ctx.log_context_fields = {
"room": ctx.room.name,
}
session = AgentSession(
stt=inference.STT("deepgram/nova-3", language="multi"),
llm=inference.LLM("openai/gpt-4.1-mini"),
tts=inference.TTS("cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"),
turn_detection=MultilingualModel(),
vad=ctx.proc.userdata["vad"],
preemptive_generation=True,
)

await session.start(
agent=MyAgent(),
room=ctx.room,
)

async with AMD(session, llm="openai/gpt-5-mini") as detector:
result = await detector.execute()

if result.category == "human":
logger.info("human answered the call, proceeding with normal conversation")
elif result.category == "machine-ivr":
logger.info("ivr menu detected, starting navigation")
elif result.category == "machine-vm":
logger.info("voicemail detected, leaving a message")
speech_handle = session.generate_reply(
instructions=(
"You've reached voicemail. Leave a brief message asking "
"the customer to call back."
),
)
await speech_handle.wait_for_playout()
session.shutdown()
elif result.category == "machine-unavailable":
logger.info("mailbox unavailable, ending call")
session.shutdown()


if __name__ == "__main__":
cli.run_app(server)
File renamed without changes.
File renamed without changes.
8 changes: 8 additions & 0 deletions livekit-agents/livekit/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@
room_io,
text_transforms,
)
from .voice.amd import (
AMD,
AMDCategory,
AMDResult,
)
from .voice.background_audio import AudioConfig, BackgroundAudioPlayer, BuiltinAudioClip, PlayHandle
from .voice.room_io import RoomInputOptions, RoomIO, RoomOutputOptions
from .voice.run_result import (
Expand Down Expand Up @@ -220,6 +225,9 @@ def __getattr__(name: str) -> typing.Any:
"FunctionCallEvent",
"FunctionCallOutputEvent",
"AgentHandoffEvent",
"AMD",
"AMDCategory",
"AMDResult",
"TurnHandlingOptions",
"EndpointingOptions",
"InterruptionOptions",
Expand Down
20 changes: 16 additions & 4 deletions livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:
self._speech_tasks: list[asyncio.Task[Any]] = []

self._preemptive_generation: _PreemptiveGeneration | None = None
self._authorization_allowed = asyncio.Event()
self._authorization_allowed.set()

self._drain_blocked_tasks: list[asyncio.Task[Any]] = []
self._mcp_tools: list[mcp.MCPToolset] = []
Expand Down Expand Up @@ -1075,6 +1077,12 @@ def _cancel_preemptive_generation(self) -> None:
self._preemptive_generation.speech_handle._cancel()
self._preemptive_generation = None

def _pause_authorization(self) -> None:
    """Hold back speech authorization until `_resume_authorization` is called.

    Clearing the event makes the authorization waits in the reply/TTS tasks
    block, so no new agent speech is authorized while paused (used, presumably,
    while answering-machine detection is in progress — see the AMD feature).
    """
    self._authorization_allowed.clear()

def _resume_authorization(self) -> None:
    """Lift the authorization hold set by `_pause_authorization`.

    Setting the event releases any reply/TTS tasks currently blocked on the
    authorization wait. The event starts set, so authorization is allowed
    by default.
    """
    self._authorization_allowed.set()

def _interrupt_background_speeches(self, force: bool = False) -> list[SpeechHandle]:
interrupted_speeches: list[SpeechHandle] = []
for speech in self._background_speeches:
Expand Down Expand Up @@ -1977,7 +1985,8 @@ async def _tts_task_impl(

# See discussion in https://github.com/livekit/agents/issues/4432
authorization_tasks: list[asyncio.Future[Any]] = [
asyncio.ensure_future(speech_handle._wait_for_authorization())
asyncio.ensure_future(speech_handle._wait_for_authorization()),
asyncio.ensure_future(self._authorization_allowed.wait()),
]
if speech_handle.allow_interruptions:
authorization_tasks.append(asyncio.ensure_future(self._user_silence_event.wait()))
Expand Down Expand Up @@ -2279,7 +2288,8 @@ async def _pipeline_reply_task_impl(
self._session._update_agent_state("thinking")

authorization_tasks: list[asyncio.Future[Any]] = [
asyncio.ensure_future(speech_handle._wait_for_authorization())
asyncio.ensure_future(speech_handle._wait_for_authorization()),
asyncio.ensure_future(self._authorization_allowed.wait()),
]
if speech_handle.allow_interruptions:
authorization_tasks.append(asyncio.ensure_future(self._user_silence_event.wait()))
Expand Down Expand Up @@ -2605,7 +2615,8 @@ async def _realtime_reply_task(

# realtime_reply_task is called only when there's text input, native audio input is handled by _realtime_generation_task
authorization_tasks: list[asyncio.Future[Any]] = [
asyncio.ensure_future(speech_handle._wait_for_authorization())
asyncio.ensure_future(speech_handle._wait_for_authorization()),
asyncio.ensure_future(self._authorization_allowed.wait()),
]
if speech_handle.allow_interruptions:
authorization_tasks.append(asyncio.ensure_future(self._user_silence_event.wait()))
Expand Down Expand Up @@ -2721,7 +2732,8 @@ async def _realtime_generation_task_impl(
tool_ctx = llm.ToolContext(self.tools)

authorization_tasks: list[asyncio.Future[Any]] = [
asyncio.ensure_future(speech_handle._wait_for_authorization())
asyncio.ensure_future(speech_handle._wait_for_authorization()),
asyncio.ensure_future(self._authorization_allowed.wait()),
]
if speech_handle.allow_interruptions:
authorization_tasks.append(asyncio.ensure_future(self._user_silence_event.wait()))
Expand Down
42 changes: 35 additions & 7 deletions livekit-agents/livekit/agents/voice/agent_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from ._utils import _set_participant_attributes
from .agent import Agent, AgentTask
from .agent_activity import AgentActivity
from .amd import AMD
from .events import (
AgentEvent,
AgentState,
Expand Down Expand Up @@ -450,8 +451,14 @@ def __init__(
self._started_at: float | None = None
self._usage_collector = ModelUsageCollector()

# ivr activity
# ivr and AMD
self._ivr_activity: IVRActivity | None = None
self._amd: AMD | None = None

@property
def amd(self) -> AMD | None:
    """The Answering Machine Detection (AMD) instance, or ``None`` if AMD is disabled.

    Also ``None`` after the session closes: ``_aclose_impl`` awaits
    ``self._amd.aclose()`` and resets the attribute.
    """
    return self._amd

def on(self, event: EventTypes, callback: Callable | None = None) -> Callable:
if event == "metrics_collected" and callback is not None:
Expand Down Expand Up @@ -736,13 +743,8 @@ async def start(
tasks.append(task)

if self.options.ivr_detection:
self._ivr_activity = IVRActivity(self)

# inject the IVR activity tools into the session tools
self._tools.extend(self._ivr_activity.tools)

tasks.append(
asyncio.create_task(self._ivr_activity.start(), name="_ivr_activity_start")
asyncio.create_task(self._start_ivr_detection(), name="_ivr_activity_start")
)

current_span.set_attribute(trace_types.ATTR_ROOM_NAME, job_ctx.room.name)
Expand Down Expand Up @@ -908,6 +910,10 @@ async def _aclose_impl(
self._cancel_user_away_timer()
self._on_aec_warmup_expired() # always clear aec warmup when closing the session

if self._amd is not None:
await self._amd.aclose()
self._amd = None

activity = self._activity
while activity and isinstance(agent_task := activity.agent, AgentTask):
# notify AgentTask to complete and wait it to resume the parent agent
Expand Down Expand Up @@ -1059,6 +1065,28 @@ def update_options(
turn_detection=turn_detection,
)

async def _start_ivr_detection(self, transcript: str | None = None) -> None:
    """Start IVR detection on this session.

    This method injects the DTMF tool and enables loop/silence detection,
    allowing the agent to navigate IVR phone trees. Safe to call after AMD resolves.

    Args:
        transcript (str | None, optional): The transcript to start IVR detection with.
    """
    # Idempotent: a second call is a no-op so tools aren't injected twice.
    if self._ivr_activity is not None:
        logger.warning("IVR detection already started, skipping")
        return

    self._ivr_activity = IVRActivity(self)
    # Expose the IVR activity's tools (e.g. DTMF) through the session tool list.
    self._tools.extend(self._ivr_activity.tools)
    await self._ivr_activity.start()
    if transcript is not None:
        logger.debug("IVR detection started with transcript", extra={"transcript": transcript})
        # Replay the transcript captured before IVR detection started (e.g. during
        # AMD) so the activity can react to it. NOTE(review): reaches into a
        # private IVRActivity callback — consider a public replay API.
        self._ivr_activity._on_user_input_transcribed(
            UserInputTranscribedEvent(transcript=transcript, is_final=True)
        )

def say(
self,
text: str | AsyncIterable[str],
Expand Down
4 changes: 4 additions & 0 deletions livekit-agents/livekit/agents/voice/amd/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .classifier import AMDCategory, AMDResult
from .detector import AMD

__all__ = ["AMD", "AMDCategory", "AMDResult"]
Loading
Loading