feat: expressive mode (#6116)
Conversation
| class Instructions: | ||
| """Instructions with optional modality-specific additions. | ||
|
|
||
| ``str(self)`` is what providers see when treating this as a plain string. | ||
| By default it equals the ``audio`` variant; after :meth:`as_modality` it | ||
| equals the chosen variant. | ||
| Construction:: | ||
|
|
||
| ``_audio_variant`` and ``_text_variant`` are always preserved so | ||
| :meth:`as_modality` can be called again for a different modality (e.g., | ||
| when the same ``ChatContext`` is reused across tool-call turns). | ||
| """ | ||
| # Simple — same instructions for all modalities | ||
| Instructions("You are a helpful assistant.") | ||
|
|
||
| # With modality-specific additions | ||
| Instructions( | ||
| "You are a helpful assistant.", | ||
| audio="Keep responses short for voice.", | ||
| text="Use markdown formatting.", | ||
| ) | ||
|
|
||
| _audio_variant: str | ||
| _text_variant: str | None | ||
| Rendering:: | ||
|
|
||
| def __new__( | ||
| cls, audio: str, *, text: str | None = None, _represent: str | None = None | ||
| ) -> Instructions: | ||
| """Create an Instructions object. | ||
| instr.render() # → common text | ||
| instr.render(modality="audio") # → common + audio addition | ||
| instr.render(modality="text", name="Alex") # → common + text, with {name} filled | ||
| """ | ||
|
|
||
| def __init__( | ||
| self, | ||
| common: str = "", | ||
| *, | ||
| audio: str | None = None, | ||
| text: str | None = None, | ||
| ) -> None: | ||
| self.common = common | ||
| self.audio = audio | ||
| self.text = text | ||
|
|
||
| def render( | ||
| self, | ||
| *, | ||
| modality: Literal["audio", "text"] | None = None, | ||
| data: dict[str, object] | None = None, | ||
| ) -> str: | ||
| """Render instructions to a plain string. | ||
|
|
||
| Args: | ||
| audio: The audio (voice) variant. | ||
| text: The text variant. Falls back to ``audio`` when omitted. | ||
| modality: If given, appends the modality-specific addition to the common text. | ||
| data: Template variables to fill. Missing placeholders log a warning | ||
| and are replaced with empty strings. | ||
| """ | ||
| instance = super().__new__(cls, _represent if _represent is not None else audio) | ||
| instance._audio_variant = audio | ||
| instance._text_variant = text | ||
| return instance | ||
| parts = [self.common] | ||
| if modality is not None: | ||
| addition = self.audio if modality == "audio" else self.text | ||
| if addition: | ||
| parts.append(addition) | ||
|
|
||
| @property | ||
| def audio(self) -> str: | ||
| """The audio (voice) variant of the instructions.""" | ||
| return self._audio_variant | ||
| result = "\n\n".join(p for p in parts if p) | ||
|
|
||
| @property | ||
| def text(self) -> str: | ||
| """The text variant of the instructions. | ||
| if data: | ||
| result = utils.misc.safe_render(result, data) | ||
|
|
||
| Falls back to the audio variant when no text variant was provided. | ||
| """ | ||
| return self._text_variant if self._text_variant is not None else self._audio_variant | ||
| return result | ||
|
|
||
| def format(self, *args: object, **kwargs: object) -> Instructions: | ||
| """Format the instructions with the given keyword arguments.""" | ||
| @staticmethod | ||
| def resolve_template(template: str, **kwargs: object) -> Instructions: | ||
| """Fill a template string, producing an ``Instructions`` with modality variants. | ||
|
|
||
| any_instructions = any(isinstance(arg, Instructions) for arg in args) or any( | ||
| isinstance(v, Instructions) for v in kwargs.values() | ||
| ) | ||
| If any kwarg value is an ``Instructions`` object, its ``common``/``audio``/``text`` | ||
| parts are substituted into the matching variant of the result. This is used by | ||
| workflow tasks to build modality-aware instructions from a single template. | ||
| """ | ||
| any_instructions = any(isinstance(v, Instructions) for v in kwargs.values()) | ||
| if any_instructions: | ||
| audio_args = tuple(arg.audio if isinstance(arg, Instructions) else arg for arg in args) | ||
| text_args = tuple(arg.text if isinstance(arg, Instructions) else arg for arg in args) | ||
| audio_kwargs = { | ||
| k: v.audio if isinstance(v, Instructions) else v for k, v in kwargs.items() | ||
| common_kw: dict[str, object] = { | ||
| k: str(v) if isinstance(v, Instructions) else v for k, v in kwargs.items() | ||
| } | ||
| text_kwargs = { | ||
| k: v.text if isinstance(v, Instructions) else v for k, v in kwargs.items() | ||
| audio_kw: dict[str, object] = { | ||
| # an explicit "" removes the section; only None falls back to common | ||
| k: (v.audio if v.audio is not None else str(v)) if isinstance(v, Instructions) else v | ||
| for k, v in kwargs.items() | ||
| } | ||
| text_kw: dict[str, object] = { | ||
| k: (v.text if v.text is not None else str(v)) if isinstance(v, Instructions) else v | ||
| for k, v in kwargs.items() | ||
| } | ||
| return Instructions( | ||
| common=utils.misc.safe_render(template, common_kw), | ||
| audio=utils.misc.safe_render(template, audio_kw), | ||
| text=utils.misc.safe_render(template, text_kw), | ||
| ) | ||
| else: | ||
| audio_args = text_args = args | ||
| audio_kwargs = text_kwargs = kwargs | ||
|
|
||
| return Instructions( | ||
| audio=self.audio.format(*audio_args, **audio_kwargs), | ||
| text=( | ||
| self.text.format(*text_args, **text_kwargs) | ||
| if any_instructions or self._text_variant is not None | ||
| else None | ||
| ), | ||
| _represent=str(self).format(*args, **kwargs), | ||
| ) | ||
| rendered = utils.misc.safe_render(template, kwargs) | ||
| return Instructions(common=rendered) | ||
|
|
||
| def as_modality(self, modality: Literal["audio", "text"]) -> Instructions: | ||
| """Return a copy whose ``str`` value is the correct variant for *modality*. | ||
| def __str__(self) -> str: | ||
| return self.common | ||
|
|
||
| Both ``_audio_variant`` and ``_text_variant`` are preserved so this can | ||
| be called again for a different modality (e.g. across tool-call turns). | ||
| """ | ||
| return Instructions( | ||
| audio=self._audio_variant, | ||
| text=self._text_variant, | ||
| _represent=self.audio if modality == "audio" else self.text, | ||
| ) | ||
| def __repr__(self) -> str: | ||
| return f"Instructions({self.common!r})" | ||
|
|
||
| def __hash__(self) -> int: | ||
| return hash((self.common, self.audio, self.text)) | ||
|
|
||
| def __add__(self, other: object) -> Instructions: | ||
| """Concatenate, propagating both variants and the current str value.""" | ||
| def __eq__(self, other: object) -> bool: | ||
| if isinstance(other, Instructions): | ||
| has_text = self._text_variant is not None or other._text_variant is not None | ||
| return Instructions( | ||
| audio=self.audio + other.audio, | ||
| text=(self.text + other.text) if has_text else None, | ||
| _represent=str(self) + str(other), | ||
| ) | ||
| if isinstance(other, str): | ||
| return Instructions( | ||
| audio=self.audio + other, | ||
| text=(self._text_variant + other) if self._text_variant is not None else None, | ||
| _represent=str(self) + other, | ||
| return ( | ||
| self.common == other.common | ||
| and self.audio == other.audio | ||
| and self.text == other.text | ||
| ) | ||
| raise TypeError(f"Cannot add Instructions and {type(other)}") | ||
|
|
||
| def __radd__(self, other: object) -> Instructions: | ||
| """Support ``plain_str + Instructions``, propagating both variants.""" | ||
| if isinstance(other, str): | ||
| return Instructions( | ||
| audio=other + self.audio, | ||
| text=(other + self._text_variant) if self._text_variant is not None else None, | ||
| _represent=other + str(self), | ||
| ) | ||
| raise TypeError(f"Cannot add {type(other)} and Instructions") | ||
|
|
||
| def __repr__(self) -> str: | ||
| return f"Instructions({str(self)!r})" | ||
|
|
||
| @classmethod | ||
| def __get_pydantic_core_schema__(cls, source_type: Any, handler: Any) -> Any: | ||
| from pydantic_core import core_schema | ||
|
|
||
| def validate_python(v: Any) -> Instructions: | ||
| if isinstance(v, Instructions): | ||
| return v | ||
| if isinstance(v, dict) and v.get("type") == "instructions": | ||
| return cls(v["audio"], text=v.get("text")) | ||
| raise ValueError(f"Cannot convert {type(v)!r} to Instructions") | ||
|
|
||
| def validate_json(v: Any) -> Instructions: | ||
| if isinstance(v, dict) and v.get("type") == "instructions": | ||
| return cls(v["audio"], text=v.get("text")) | ||
| raise ValueError(f"Cannot convert {type(v)!r} to Instructions") | ||
|
|
||
| def serialize(v: Instructions) -> dict[str, Any]: | ||
| d: dict[str, Any] = {"type": "instructions", "audio": v.audio} | ||
| if v._text_variant is not None: | ||
| d["text"] = v._text_variant | ||
| return d | ||
|
|
||
| return core_schema.json_or_python_schema( | ||
| python_schema=core_schema.no_info_plain_validator_function(validate_python), | ||
| json_schema=core_schema.no_info_plain_validator_function(validate_json), | ||
| serialization=core_schema.plain_serializer_function_ser_schema( | ||
| serialize, info_arg=False | ||
| ), | ||
| ) | ||
| return self.common == other | ||
| return NotImplemented |
There was a problem hiding this comment.
🚩 Instructions class migration: str subclass → regular class
The Instructions class was previously a str subclass (inheriting all string behavior). It's now a regular class with __str__ returning self.common. This is a significant API break for any external code that relied on isinstance(instr, str) being True, or passing Instructions directly where a str was expected. The PR consistently updates all internal usages: ChatContent no longer includes Instructions in the union, add_message explicitly converts via str(content), and update_instructions in generation.py calls .render(modality=...). The provider format files (openai, anthropic, google, aws, mistralai) add defensive isinstance(c, (str, Instructions)) checks in the realtime/traces paths. However, any downstream plugin or user code that stored Instructions objects directly in ChatMessage.content (relying on Instructions being a str) would silently break.
Was this helpful? React with 👍 or 👎 to provide feedback.
8623c30 to
a31d53f
Compare
| def __init__( | ||
| self, | ||
| audio: str = "", | ||
| *, | ||
| text: str | None = None, | ||
| persona: NotGivenOr[Instructions | str] = NOT_GIVEN, | ||
| extra: Instructions | str = "", | ||
| ) -> None: | ||
| super().__init__(audio, text=text) |
There was a problem hiding this comment.
🟡 WorkflowInstructions passes audio to Instructions.audio instead of Instructions.common, breaking str() on the base class
WorkflowInstructions.__init__ calls super().__init__(audio=audio, text=text) at livekit-agents/livekit/agents/beta/workflows/utils.py:67, but the parent Instructions.__init__ signature is (self, common: str = "", *, audio: str | None = None, text: str | None = None). This means the value passed to audio lands in Instructions.audio (the modality-specific addition) while Instructions.common stays "". Since Instructions.__str__ returns self.common, any call to str() on a WorkflowInstructions will always return "". Current call sites happen to use keyword-only args (persona=, extra=), so the first positional stays default — but the design is misleading: a user constructing WorkflowInstructions("my text") would have "my text" routed to Instructions.audio with no indication.
Was this helpful? React with 👍 or 👎 to provide feedback.
| task_group = TaskGroup() | ||
| task_group.add( | ||
| lambda: GetCardNumberTask( | ||
| chat_ctx=ctx, | ||
| lambda: GetNameTask( | ||
| last_name=True, | ||
| extra_instructions="This is in the context of credit card information collection, ask specifically for the full name listed on it.", | ||
| require_confirmation=self._require_confirmation, | ||
| extra_instructions=self._extra_instructions, | ||
| ), | ||
| id="card_number_task", | ||
| description="Collects the user's card number", | ||
| id="cardholder_name_task", | ||
| description="Collects the cardholder's full name", | ||
| ) | ||
| task_group.add( | ||
| lambda: GetExpirationDateTask( | ||
| chat_ctx=ctx, | ||
| require_confirmation=self._require_confirmation, | ||
| extra_instructions=self._extra_instructions, | ||
| ), | ||
| id="expiration_date_task", | ||
| description="Collects the card's expiration date", | ||
| lambda: GetCardNumberTask(require_confirmation=self._require_confirmation), | ||
| id="card_number_task", | ||
| description="Collects the user's card number", | ||
| ) | ||
| task_group.add( | ||
| lambda: GetSecurityCodeTask( | ||
| chat_ctx=ctx, | ||
| require_confirmation=self._require_confirmation, | ||
| extra_instructions=self._extra_instructions, | ||
| ), | ||
| lambda: GetSecurityCodeTask(require_confirmation=self._require_confirmation), | ||
| id="security_code_task", | ||
| description="Collects the card's security code", | ||
| ) | ||
| task_group.add( | ||
| lambda: GetNameTask( | ||
| last_name=True, | ||
| chat_ctx=ctx, | ||
| extra_instructions=cardholder_extra, | ||
| require_confirmation=self._require_confirmation, | ||
| # The cardholder may differ from the caller or any guest | ||
| # mentioned earlier in chat_ctx. Apply IGNORE_ON_ENTER on | ||
| # update_name so the model must produce an asking turn | ||
| # rather than silently filling from chat_ctx. | ||
| require_explicit_ask=True, | ||
| ), | ||
| id="cardholder_name_task", | ||
| description="Collects the cardholder's full name", | ||
| lambda: GetExpirationDateTask(require_confirmation=self._require_confirmation), | ||
| id="expiration_date_task", | ||
| description="Collects the card's expiration date", | ||
| ) |
There was a problem hiding this comment.
🚩 Removal of require_explicit_ask and chat_ctx from workflow sub-tasks is intentional simplification
The credit card, name, email, address, phone number, and DOB workflow tasks had require_explicit_ask and explicit chat_ctx parameters removed. The old pattern used ToolFlag.IGNORE_ON_ENTER to prevent the LLM from pre-filling from chat context, and passed chat_ctx explicitly through TaskGroup to sub-tasks. The new pattern relies on the session's inherited chat context and removes the dynamic tool-building pattern in favor of @function_tool() decorators directly on the class. This means the LLM can now potentially pre-fill answers from context on task entry (since IGNORE_ON_ENTER is gone from tools like update_name, record_card_number, etc.), which changes behavior for the cardholder name collection during credit card capture. The old code had specific logic to force the LLM to ask explicitly for the cardholder name since it might differ from a previously mentioned name.
Was this helpful? React with 👍 or 👎 to provide feedback.
| audio=_CARD_NUMBER_BASE_INSTRUCTIONS.format( | ||
| modality_specific=_CARD_NUMBER_AUDIO_SPECIFIC, | ||
| confirmation_instructions=( | ||
| confirmation_instructions if require_confirmation is not False else "" |
There was a problem hiding this comment.
🟡 Workflow task Instructions created with audio=/text= but no common cause str() to return empty string, breaking AgentConfigUpdate recording
In credit_card.py:175, dob.py:97, name.py:123, and phone_number.py:88, the migration from the old Instructions(str) to the new Instructions class changed Instructions(audio_text, text=text_text) to Instructions(audio=audio_text, text=text_text). Since the new Instructions.__init__ has common as the first positional param (defaulting to ""), these instructions have common="". str(instructions) now returns "" instead of the audio-variant text.
This matters because agent_activity.py:830 records instructions=str(self._agent.instructions) into an AgentConfigUpdate. For these workflow tasks, the config update now has empty instructions. Downstream consumers like evals/judge.py:83 (_get_latest_instructions) skip empty-string instructions (if item.instructions: is falsy for ""), so the eval system silently fails to find the active instructions during these workflow tasks.
(Refers to lines 175-181)
Prompt for agents
The issue is that Instructions(audio=..., text=...) leaves common empty, so str() returns empty string. This affects AgentConfigUpdate recording at agent_activity.py:830 and eval judges at evals/judge.py:83.
The fix is to pass the audio text as the common parameter as well, so str() returns meaningful text:
In credit_card.py, dob.py, name.py, and phone_number.py, change the pattern from:
Instructions(audio=AUDIO_TEXT, text=TEXT_TEXT)
to:
Instructions(AUDIO_TEXT, audio=AUDIO_TEXT, text=TEXT_TEXT)
Or alternatively, change the AgentConfigUpdate recording in agent_activity.py to use .render(modality='audio') instead of str() when the instructions are an Instructions object. Similarly update evals/judge.py to handle Instructions objects.
Was this helpful? React with 👍 or 👎 to provide feedback.
|
|
||
|
|
||
| ChatContent: TypeAlias = ImageContent | AudioContent | Instructions | str | ||
| ChatContent: TypeAlias = ImageContent | AudioContent | str |
There was a problem hiding this comment.
🚩 Instructions no longer valid as ChatContent — Pydantic serialization implications
The ChatContent type alias changed from ImageContent | AudioContent | Instructions | str to ImageContent | AudioContent | str. This means Instructions objects can no longer be stored in ChatMessage.content. The add_message method was updated to resolve Instructions to str(content) before storage, and all provider format files were updated to handle Instructions via str(). However, Instructions.__get_pydantic_core_schema__ was removed, so any serialized chat context containing the old {"type": "instructions", "audio": ..., "text": ...} format in content arrays would fail to deserialize with ChatContext.from_dict(). This is a breaking change for persisted chat histories that contained Instructions objects.
Was this helpful? React with 👍 or 👎 to provide feedback.
| audio=_CARD_NUMBER_BASE_INSTRUCTIONS.format( | ||
| modality_specific=_CARD_NUMBER_AUDIO_SPECIFIC, | ||
| confirmation_instructions=( | ||
| confirmation_instructions if require_confirmation is not False else "" |
There was a problem hiding this comment.
🚩 Instructions str() returns empty string for workflow task instructions
The migration of workflow tasks (credit_card.py, dob.py, name.py, phone_number.py) from Instructions(audio_text, text=text_text) to Instructions(audio=audio_text, text=text_text) changes str() behavior: it returns self.common which is "" instead of the audio variant text.
This matters at agent_activity.py:830 where str(self._agent.instructions) is recorded in AgentConfigUpdate. For workflow subtasks, the recorded instructions will be empty strings. The evals judge at judge.py:84 (_get_latest_instructions) skips falsy values, so it would return None instead of the actual instructions for these tasks.
The functional LLM path through update_instructions() → render(modality=...) at agent_activity.py:2721 is correct — only the metadata recording is affected. Similarly, the realtime model path at agent_activity.py:803 (str(self._agent.instructions)) would pass empty instructions, but workflow tasks don't use realtime models.
(Refers to lines 175-181)
Was this helpful? React with 👍 or 👎 to provide feedback.
|
|
||
| class Instructions(str): | ||
| """Instructions that adapt based on the user's input modality (audio vs. text). | ||
| class Instructions: |
There was a problem hiding this comment.
I suggest keeping Instructions a str subclass:
| class Instructions: | |
| class Instructions(str): |
Reasons articulated well in Devin's comment:
The Instructions class was previously a str subclass (inheriting all string behavior). It's now a regular class with __str__ returning self.common. This is a significant API break for any external code that relied on isinstance(instr, str) being True, or passing Instructions directly where a str was expected. [...], any downstream plugin or user code that stored Instructions objects directly in ChatMessage.content (relying on Instructions being a str) would silently break.
There was a problem hiding this comment.
Hmm I think this change was introduced in the original branch theo/expressiveness-mode, the commit message associated with it was:
Rework Instructions from str subclass to stateless class with common/audio/text fields. No Pydantic dependency, no runtime state.
I will double check on the intention there
| instr.render(modality="audio") # → common + audio addition | ||
| instr.render(modality="text", name="Alex") # → common + text, with {name} filled | ||
| """ | ||
|
|
There was a problem hiding this comment.
Goes together with previous suggestion:
| def __new__(cls, common: str = "", **kwargs: Any) -> Instructions: | |
| return super().__new__(cls, common) | |
__init__ is then supplementary for setting instance attributes.
| buf = "" | ||
| async for chunk in text_stream: | ||
| buf += chunk | ||
| if buf.rfind("<") > buf.rfind(">"): |
There was a problem hiding this comment.
Not a huge issue right now, but not all vendors use XML syntax; e.g. https://docs.fish.audio/developer-guide/core-features/emotions uses [] instead.
There was a problem hiding this comment.
do you think this further justifies LK having our own syntax that can be converted to XML and other syntax? thinking out loud
There was a problem hiding this comment.
It is tricky. Even if we have our own syntax, different plugins support different values, so we would still need to translate something. If we only allow the same set of values, the LLM will be generating unused/wasteful tokens. But it should be fine here, as a plugin can override this function.
| # the active TTS. They do NOT use the {tts.markup.llm_instructions} placeholder — the | ||
| # Inworld tag reference is inlined directly, so the prompt is self-contained. | ||
|
|
||
| _INWORLD_CUSTOMER_SERVICE: ExpressiveOptions = { |
There was a problem hiding this comment.
Q: Do we expect users to use these instructions for other languages too?
There was a problem hiding this comment.
i believe we discussed that the LLM should be smart enough to translate, we can also test it via the demo
|
|
||
|
|
||
| def convert_break_to_ellipsis(text: str) -> str: | ||
| """Replace ``<break time="..."/>`` tags with an ellipsis (``...``). |
There was a problem hiding this comment.
This might break a niche use case:
User: Can you just stop for 10 seconds before responding?
LLM: <break time="10s">
TTS: sees only "…"
Definitely nitpicking here. Maybe we should instruct the LLM to use an ellipsis for natural short pauses and only use break when needed.
LLM-driven TTS prosody via provider-specific markup tags, end to end: