Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions examples/voice_agents/basic_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@ def __init__(self) -> None:
async def on_enter(self):
# when the agent is added to the session, it'll generate a reply
# according to its instructions
# Keep it uninterruptible so the client has time to calibrate AEC (Acoustic Echo Cancellation).
self.session.generate_reply(allow_interruptions=False)
self.session.generate_reply()

# all functions annotated with @function_tool will be passed to the LLM when this
# agent is active
Expand Down Expand Up @@ -102,6 +101,8 @@ async def entrypoint(ctx: JobContext):
# when it's detected, you may resume the agent's speech
resume_false_interruption=True,
false_interruption_timeout=1.0,
# blocks interruptions for a few seconds after the agent starts speaking to allow client to calibrate AEC
echo_guard_duration=3.0,
)

# log metrics as they are emitted, and total usage after session is over
Expand Down
9 changes: 8 additions & 1 deletion livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,10 +781,13 @@ def push_audio(self, frame: rtc.AudioFrame) -> None:
if not self._started:
return

should_discard = bool(
should_discard = (
self._current_speech
and not self._current_speech.allow_interruptions
and self._session.options.discard_audio_if_uninterruptible
) or (
self._session.agent_state == "speaking"
and self._session._echo_guard_remaining_duration > 0
)

if not should_discard:
Expand Down Expand Up @@ -1223,6 +1226,10 @@ def _on_generation_created(self, ev: llm.GenerationCreatedEvent) -> None:
self._schedule_speech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL)

def _interrupt_by_audio_activity(self) -> None:
if self._session._echo_guard_remaining_duration > 0:
# disable interruption from audio activity while echo guard is active
return
Comment on lines +1229 to +1231
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Echo guard blocks interruptions even when agent is not speaking (between turns)

The _interrupt_by_audio_activity check at agent_activity.py:1229 only checks _echo_guard_remaining_duration > 0 without verifying the agent is currently speaking. When the agent transitions from "speaking" to "thinking" (e.g., during a tool call or between LLM response and TTS), _cancel_echo_guard_timer preserves the remaining echo guard duration. This causes _interrupt_by_audio_activity to block genuine user interruptions during non-speaking phases where there is no audio output and thus no echo to guard against.

Root Cause and Impact

The push_audio method at agent_activity.py:788-791 correctly gates audio discarding on agent_state == "speaking", but _interrupt_by_audio_activity at agent_activity.py:1229 does not include this check:

# push_audio - correctly checks agent_state
should_discard = ... or (
    self._session.agent_state == "speaking"
    and self._session._echo_guard_remaining_duration > 0
)

# _interrupt_by_audio_activity - missing agent_state check
def _interrupt_by_audio_activity(self) -> None:
    if self._session._echo_guard_remaining_duration > 0:
        return  # blocks even when agent is in "thinking" or "listening" state

Scenario: With echo_guard_duration=3.0, the agent speaks for 1 second then transitions to "thinking". _cancel_echo_guard_timer (agent_session.py:1217-1227) saves 2.0s of remaining duration. During the thinking phase, the user speaks but _interrupt_by_audio_activity returns early because _echo_guard_remaining_duration is 2.0 > 0, even though there's no audio output to cause echo. This blocks the user from interrupting the agent during non-speaking phases.

Impact: Users cannot interrupt the agent during "thinking" or "listening" states while the echo guard has remaining duration, even though echo is only possible during "speaking" state.

Suggested change
if self._session._echo_guard_remaining_duration > 0:
# disable interruption from audio activity while echo guard is active
return
if self._session.agent_state == "speaking" and self._session._echo_guard_remaining_duration > 0:
# disable interruption from audio activity while echo guard is active
return
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.


opt = self._session.options
use_pause = opt.resume_false_interruption and opt.false_interruption_timeout is not None

Expand Down
48 changes: 48 additions & 0 deletions livekit-agents/livekit/agents/voice/agent_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ class AgentSessionOptions:
preemptive_generation: bool
tts_text_transforms: Sequence[TextTransforms] | None
ivr_detection: bool
echo_guard_duration: float | None


Userdata_T = TypeVar("Userdata_T")
Expand Down Expand Up @@ -158,6 +159,7 @@ def __init__(
use_tts_aligned_transcript: NotGivenOr[bool] = NOT_GIVEN,
tts_text_transforms: NotGivenOr[Sequence[TextTransforms] | None] = NOT_GIVEN,
preemptive_generation: bool = False,
echo_guard_duration: float | None = None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should do it by default.
Maybe we shouldn't even have an option for it; it seems like an issue everybody has.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How long does the AEC usually need to warm up?

ivr_detection: bool = False,
conn_options: NotGivenOr[SessionConnectOptions] = NOT_GIVEN,
loop: asyncio.AbstractEventLoop | None = None,
Expand Down Expand Up @@ -246,6 +248,10 @@ def __init__(
can reduce response latency by overlapping model inference with user audio,
but may incur extra compute if the user interrupts or revises mid-utterance.
Defaults to ``False``.
echo_guard_duration (float, optional): The duration in seconds that the agent
will ignore user's audio interruptions after the agent starts speaking.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Does this apply to both cases when the session starts:

  1. Agent speaks first
  2. Agent's first response (the user might speak first)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it only considers when the agent is speaking, no matter who speaks first.

This is useful to prevent the agent from being interrupted by echo before AEC is ready.
Default ``None``.
ivr_detection (bool): Whether to detect if the agent is interacting with an IVR system.
Default ``False``.
conn_options (SessionConnectOptions, optional): Connection options for
Expand Down Expand Up @@ -291,6 +297,7 @@ def __init__(
use_tts_aligned_transcript=use_tts_aligned_transcript
if is_given(use_tts_aligned_transcript)
else None,
echo_guard_duration=echo_guard_duration,
)
self._conn_options = conn_options or SessionConnectOptions()
self._started = False
Expand All @@ -316,6 +323,11 @@ def __init__(
self._llm_error_counts = 0
self._tts_error_counts = 0

# echo guard: disable interruptions while AEC warms up
self._echo_guard_remaining_duration = echo_guard_duration or 0.0
self._echo_guard_timer: asyncio.TimerHandle | None = None
self._echo_guard_speaking_start: float | None = None

# configurable IO
self._input = io.AgentInput(self._on_video_input_changed, self._on_audio_input_changed)
self._output = io.AgentOutput(
Expand Down Expand Up @@ -787,6 +799,8 @@ async def _aclose_impl(

self._closing = True
self._cancel_user_away_timer()
self._cancel_echo_guard_timer()
self._on_echo_guard_expired() # always clear echo guard when closing the session

if self._activity is not None:
if not drain:
Expand Down Expand Up @@ -1192,6 +1206,26 @@ def _cancel_user_away_timer(self) -> None:
self._user_away_timer.cancel()
self._user_away_timer = None

def _on_echo_guard_expired(self) -> None:
    """Clear all echo-guard state so the guard is no longer in effect.

    Resets the remaining duration to ``0.0`` and drops the timer handle and
    the recorded speaking-start timestamp. A debug message is logged only
    when some guard time was still left at the moment of the call.
    """
    guard_was_active = self._echo_guard_remaining_duration > 0
    if guard_was_active:
        logger.debug("echo guard expired, re-enabling interruptions")

    # Reset every piece of guard bookkeeping in one go.
    self._echo_guard_speaking_start = None
    self._echo_guard_timer = None
    self._echo_guard_remaining_duration = 0.0

def _cancel_echo_guard_timer(self) -> None:
    """Cancel the pending echo-guard timer and deduct the time already used.

    If a timer handle is stored, it is cancelled and cleared. If a
    speaking-start timestamp is stored, the time elapsed since then is
    subtracted from ``_echo_guard_remaining_duration`` (floored at ``0.0``)
    and the timestamp is cleared.
    """
    if self._echo_guard_timer is not None:
        self._echo_guard_timer.cancel()
        self._echo_guard_timer = None

    if self._echo_guard_speaking_start is not None:
        # time.time() is wall-clock time and may step backwards (e.g. on a
        # system clock adjustment). Clamp at zero so a negative "elapsed"
        # can never lengthen the remaining guard duration.
        elapsed = max(0.0, time.time() - self._echo_guard_speaking_start)
        self._echo_guard_remaining_duration = max(
            0.0, self._echo_guard_remaining_duration - elapsed
        )
        self._echo_guard_speaking_start = None

def _update_agent_state(
self,
state: AgentState,
Expand Down Expand Up @@ -1223,6 +1257,20 @@ def _update_agent_state(
self._agent_speaking_span.end()
self._agent_speaking_span = None

# echo guard: disable interruptions while AEC warms up
if state == "speaking" and self._echo_guard_remaining_duration > 0:
self._echo_guard_speaking_start = time.time()
self._echo_guard_timer = self._loop.call_later(
self._echo_guard_remaining_duration, self._on_echo_guard_expired
)
logger.debug(
"echo guard active, disabling interruptions for %.2fs",
self._echo_guard_remaining_duration,
)

if self._agent_state == "speaking" and state != "speaking":
self._cancel_echo_guard_timer()

if state == "listening" and self._user_state == "listening":
self._set_user_away_timer()
else:
Expand Down
Loading