# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START googlegenaisdk_live_conversation_audio_with_audio]

import asyncio
import base64

from google import genai
from google.genai.types import (
    AudioTranscriptionConfig,
    Blob,
    HttpOptions,
    LiveConnectConfig,
    Modality,
)
import numpy as np

from scipy.io import wavfile

# The number of audio frames to send in each chunk.
CHUNK = 4200
CHANNELS = 1
MODEL = "gemini-live-2.5-flash-preview-native-audio-09-2025"

# The audio sample rate expected by the model for input.
INPUT_RATE = 16000
# The audio sample rate of the audio generated by the model.
OUTPUT_RATE = 24000

# The sample width in bytes: 2 bytes per sample for 16-bit PCM audio.
SAMPLE_WIDTH = 2
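
# Note: CHUNK and CHANNELS are reference values for chunked microphone
# streaming; this sample sends the whole file in one call. At INPUT_RATE,
# one CHUNK is 4200 / 16000 = 0.2625 seconds of audio.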

# NOTE: assumes Vertex AI routing is configured in the environment (for
# example, GOOGLE_GENAI_USE_VERTEXAI=true and GOOGLE_CLOUD_PROJECT set);
# otherwise the `location` argument has no effect.
client = genai.Client(http_options=HttpOptions(api_version="v1beta1"), location="us-central1")


def read_wavefile(filepath: str) -> tuple[str, str]:
    """Reads a WAV file and returns its base64-encoded audio data and MIME type."""
    # Read the .wav file using scipy.io.wavfile.read
    rate, data = wavfile.read(filepath)
    # Convert the NumPy array of audio samples to raw bytes
    raw_audio_bytes = data.tobytes()
    # Encode the raw bytes to a base64 string; the result is decoded
    # from bytes to an ASCII string for transport.
    base64_encoded_data = base64.b64encode(raw_audio_bytes).decode("ascii")
    mime_type = f"audio/pcm;rate={rate}"
    return base64_encoded_data, mime_type
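
# The model expects 16-bit PCM at 16 kHz (INPUT_RATE) as input, so the MIME
# type returned for a matching file is "audio/pcm;rate=16000". Resample the
# source audio first if its rate differs.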


def write_wavefile(filepath: str, audio_frames: list[bytes], rate: int) -> None:
    """Writes a list of audio byte frames to a WAV file using scipy."""
    # Combine the list of byte frames into a single byte string
    raw_audio_bytes = b"".join(audio_frames)

    # Convert the raw bytes to a NumPy array.
    # The sample width is 2 bytes (16-bit), so we use np.int16
    audio_data = np.frombuffer(raw_audio_bytes, dtype=np.int16)

    # Write the NumPy array to a .wav file
    wavfile.write(filepath, rate, audio_data)
    print(f"Model response saved to {filepath}")
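

# A minimal stdlib-only alternative sketch, assuming 16-bit mono PCM frames,
# for environments without scipy. The `wave` module writes the same WAV container.
def write_wavefile_stdlib(filepath: str, audio_frames: list[bytes], rate: int) -> None:
    import wave  # stdlib; imported locally since only this optional helper needs it

    with wave.open(filepath, "wb") as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(SAMPLE_WIDTH)  # 2 bytes per sample = 16-bit audio
        wf.setframerate(rate)
        wf.writeframes(b"".join(audio_frames))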


async def main() -> bool:
    print("Starting the live audio conversation sample")

    async with client.aio.live.connect(
        model=MODEL,
        config=LiveConnectConfig(
            # Have the model respond with audio
            response_modalities=[Modality.AUDIO],
            # Generate a transcript of the input audio
            input_audio_transcription=AudioTranscriptionConfig(),
            # Generate a transcript of the output audio
            output_audio_transcription=AudioTranscriptionConfig(),
        ),
    ) as session:

        async def send() -> None:
            # Use a local file as an example source of live audio input.
            wav_file_path = "hello_gemini_are_you_there.wav"
            base64_data, mime_type = read_wavefile(wav_file_path)
            audio_bytes = base64.b64decode(base64_data)
            await session.send_realtime_input(media=Blob(data=audio_bytes, mime_type=mime_type))
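
        # Note: the base64 encode/decode round trip above mirrors how a web
        # client might ship audio over the wire; for a local file you could
        # pass the raw PCM bytes straight to Blob and skip it.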

        async def receive() -> None:
            audio_frames = []

            async for message in session.receive():
                # Some messages (e.g., setup acknowledgements) carry no content.
                if message.server_content is None:
                    continue
                if message.server_content.input_transcription:
                    print(message.server_content.model_dump(mode="json", exclude_none=True))
                if message.server_content.output_transcription:
                    print(message.server_content.model_dump(mode="json", exclude_none=True))
                if message.server_content.model_turn:
                    for part in message.server_content.model_turn.parts:
                        # Only audio parts carry inline_data.
                        if part.inline_data and part.inline_data.data:
                            audio_frames.append(part.inline_data.data)

            if audio_frames:
                write_wavefile(
                    "example_model_response.wav",
                    audio_frames,
                    OUTPUT_RATE,
                )
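
        # Run upload and download concurrently; receive() completes once the
        # model finishes its turn and the message stream ends.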

        send_task = asyncio.create_task(send())
        receive_task = asyncio.create_task(receive())
        await asyncio.gather(send_task, receive_task)
        # Example response:
        # {'input_transcription': {'text': 'Hello.'}}
        # {'output_transcription': {}}
        # {'output_transcription': {'text': 'Hi'}}
        # {'output_transcription': {'text': ' there. What can I do for you today?'}}
        # {'output_transcription': {'finished': True}}
        # Model response saved to example_model_response.wav

# [END googlegenaisdk_live_conversation_audio_with_audio]
    return True


if __name__ == "__main__":
    asyncio.run(main())
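
# To run this sample (assumptions: a 16 kHz, mono, 16-bit WAV named
# "hello_gemini_are_you_there.wav" in the working directory, plus the Vertex AI
# environment configuration noted above):
#   pip install google-genai numpy scipy
#   python live_conversation_audio_with_audio.py  # or whatever this file is named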