feature #861 [Demo] Converting audio demo to speech-to-text-to-speech (chr-hertel)

chr-hertel · chr-hertel · commit c840dadb6aa6 · 2025-11-13T01:04:53.000+01:00
This PR was merged into the main branch.

Discussion
----------

[Demo] Converting audio demo to speech-to-text-to-speech

| Q             | A
| ------------- | ---
| Bug fix?      | no
| New feature?  | yes
| Docs?         | no
| Issues        |
| License       | MIT

Converting the audio bot demo to speech-to-text-to-speech with a subagent for RAG on the Symfony Blog.

<img width="1479" height="1002" alt="image" src="https://github.com/user-attachments/assets/47cd9bdf-9038-416c-8ecb-16cff25fadc7" />

[Last Response](https://github.com/user-attachments/files/23511827/download.mp3)

Commits
-------

52c13d0 Converting audio demo to speech-to-text-to-speech
diff --git a/demo/config/packages/ai.yaml b/demo/config/packages/ai.yaml
@@ -39,12 +39,19 @@ ai:
         audio:
             platform: 'ai.platform.openai'
             model: 'gpt-4o-mini?temperature=1.0'
-            prompt: 'You are a friendly chatbot that likes to have a conversation with users and asks them some questions.'
+            prompt: |
+                You are a friendly, positive and energetic voice assistant. You can engage in light discussions, ask
+                questions, and do general small-talk. If asked about the Symfony Framework or their community events,
+                you delegate to your subagent "symfony_blog" and use their answer for answering user's questions.
+                If you don't know the answer, say so. Keep in mind that you are in a spoken conversation, so keep your
+                answers concise and to the point. They will be read out loud to the user.
             tools:
                 # Agent in agent 🤯
                 - agent: 'blog'
                   name: 'symfony_blog'
-                  description: 'Can answer questions based on the Symfony blog.'
+                  description: |
+                      Subagent, that can answer questions about latest news around the Symfony Framework, like latest
+                      features, events or community news.
         orchestrator:
             platform: 'ai.platform.openai'
             model: 'gpt-4o-mini'
diff --git a/demo/src/Audio/Chat.php b/demo/src/Audio/Chat.php
@@ -12,6 +12,7 @@
 namespace App\Audio;
 
 use Symfony\AI\Agent\AgentInterface;
+use Symfony\AI\Platform\Bridge\OpenAi\TextToSpeech\Voice;
 use Symfony\AI\Platform\Message\Content\Audio;
 use Symfony\AI\Platform\Message\Message;
 use Symfony\AI\Platform\Message\MessageBag;
@@ -58,7 +59,14 @@ public function submitMessage(string $message): void
 
         \assert($result instanceof TextResult);
 
-        $messages->add(Message::ofAssistant($result->getContent()));
+        $assistantMessage = Message::ofAssistant($result->getContent());
+        $messages->add($assistantMessage);
+
+        $result = $this->platform->invoke('tts-1', $result->getContent(), [
+            'voice' => Voice::CORAL,
+            'instructions' => 'Speak in a cheerful and positive tone.',
+        ]);
+        $assistantMessage->getMetadata()->add('audio', $result->asDataUri('audio/mpeg'));
 
         $this->saveMessages($messages);
     }
diff --git a/demo/templates/components/audio.html.twig b/demo/templates/components/audio.html.twig
@@ -1,4 +1,4 @@
-{% import "_message.html.twig" as message %}
+{% import "_message.html.twig" as msg %}
 
 <div class="card mx-auto shadow-lg" {{ attributes.defaults(stimulus_controller('audio')) }}>
     <div class="card-header p-2">
@@ -8,7 +8,18 @@
     </div>
     <div id="chat-body" class="card-body p-4 overflow-auto">
         {% for message in this.messages %}
-            {% include '_message.html.twig' with { message, latest: loop.last } %}
+            {% if 'user' == message.role.value %}
+                {% include '_message.html.twig' with { message, latest: loop.last } %}
+            {% else %}
+                <div class="d-flex align-items-baseline mb-4">
+                    <div class="bot avatar rounded-3 shadow-sm">
+                        {{ ux_icon('fluent:bot-24-filled', { height: '45px', width: '45px' }) }}
+                    </div>
+                    <div class="ps-2">
+                        <audio class="pt-3" controls {{ loop.last ? 'autoplay' }} src="{{ message.metadata.get('audio') }}"></audio>
+                    </div>
+                </div>
+            {% endif %}
         {% else %}
             <div id="welcome" class="text-center mt-5 py-5 bg-white rounded-5 shadow-sm w-75 mx-auto">
                 {{ ux_icon('iconoir:microphone-solid', { height: '200px', width: '200px' }) }}
@@ -17,8 +28,8 @@
             </div>
         {% endfor %}
         <div id="loading-message" class="d-none">
-            {{ message.user([{text:'Converting your speech to text ...'}], true) }}
-            {{ message.bot('The Bot is looking for an answer ...', true) }}
+            {{ msg.user([{text:'Converting your speech to text ...'}], true) }}
+            {{ msg.bot('The Bot is looking for an answer ...', true) }}
         </div>
     </div>
     <div class="card-footer p-2 text-center">