Skip to content

Commit 4cd0939

Browse files
GuinersGuiners
andauthored
feat(genai): Adding live samples (#13598)
* Adding samples * Live samples updates * adding mocking * changing region in test_live_conversation_audio_with_audio * removing rag memory corpus * mocking live_conversation_audio_with_audio * fixing mock live_conversation_audio_with_audio --------- Co-authored-by: Guiners <rkoza@softserveinc.com>
1 parent c4187f6 commit 4cd0939

10 files changed

+602
-12
lines changed

genai/live/live_audio_with_txt.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
16+
# Install helpers for writing and playing audio: pip install numpy soundfile simpleaudio
17+
18+
import asyncio
19+
20+
21+
async def generate_content() -> list:
    """Send a text prompt to the Live API and play the spoken answer.

    Opens a live session configured for audio responses with the "Aoede"
    prebuilt voice, sends a single user turn, gathers the audio chunks of the
    model turn as 16-bit samples and plays them back at 24 kHz.

    Returns:
        An empty list; the audio is written to ``output.wav`` and played
        rather than returned.
    """
    # [START googlegenaisdk_live_audio_with_txt]
    from google import genai
    from google.genai.types import (
        Content, LiveConnectConfig, Modality, Part,
        PrebuiltVoiceConfig, SpeechConfig, VoiceConfig
    )
    import numpy as np
    import soundfile as sf
    import simpleaudio as sa

    def play_audio(audio_array: np.ndarray, sample_rate: int = 24000) -> None:
        """Write the samples to ``output.wav`` and wait for playback to end."""
        sf.write("output.wav", audio_array, sample_rate)
        wave_obj = sa.WaveObject.from_wave_file("output.wav")
        play_obj = wave_obj.play()
        play_obj.wait_done()

    client = genai.Client()
    voice_name = "Aoede"
    model = "gemini-2.0-flash-live-preview-04-09"

    config = LiveConnectConfig(
        response_modalities=[Modality.AUDIO],
        speech_config=SpeechConfig(
            voice_config=VoiceConfig(
                prebuilt_voice_config=PrebuiltVoiceConfig(
                    voice_name=voice_name,
                )
            ),
        ),
    )

    async with client.aio.live.connect(
        model=model,
        config=config,
    ) as session:
        text_input = "Hello? Gemini are you there?"
        print("> ", text_input, "\n")

        await session.send_client_content(
            turns=Content(role="user", parts=[Part(text=text_input)])
        )

        audio_data = []
        async for message in session.receive():
            # `server_content` is an optional field of a live server message;
            # skip messages that carry none instead of raising AttributeError.
            server_content = message.server_content
            if not server_content or not server_content.model_turn:
                continue
            for part in server_content.model_turn.parts or []:
                # Only parts that actually hold inline bytes contribute audio.
                if part.inline_data and part.inline_data.data:
                    audio_data.append(
                        np.frombuffer(part.inline_data.data, dtype=np.int16)
                    )

        if audio_data:
            print("Received audio answer: ")
            play_audio(np.concatenate(audio_data), sample_rate=24000)

    # [END googlegenaisdk_live_audio_with_txt]
    return []


if __name__ == "__main__":
    asyncio.run(generate_content())
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START googlegenaisdk_live_conversation_audio_with_audio]
16+
17+
import asyncio
18+
import base64
19+
20+
from google import genai
21+
from google.genai.types import (
22+
AudioTranscriptionConfig,
23+
Blob,
24+
HttpOptions,
25+
LiveConnectConfig,
26+
Modality,
27+
)
28+
import numpy as np
29+
30+
from scipy.io import wavfile
31+
32+
# The number of audio frames to send in each chunk.
# NOTE(review): CHUNK, CHANNELS, INPUT_RATE and SAMPLE_WIDTH are not referenced
# by the functions shown in this sample — confirm whether they are still needed.
CHUNK = 4200
# Number of audio channels (presumably 1 means mono — confirm).
CHANNELS = 1
# Model identifier passed to `client.aio.live.connect` in main().
MODEL = "gemini-live-2.5-flash-preview-native-audio-09-2025"

# The audio sample rate expected by the model.
INPUT_RATE = 16000
# The audio sample rate of the audio generated by the model.
# main() passes this rate to write_wavefile when saving the model response.
OUTPUT_RATE = 24000

# The sample width for 16-bit audio, which is standard for this type of audio data.
SAMPLE_WIDTH = 2

# Shared client, created at import time, using API version "v1beta1" and
# location "us-central1".
client = genai.Client(http_options=HttpOptions(api_version="v1beta1"), location="us-central1")
46+
47+
48+
def read_wavefile(filepath: str) -> tuple[str, str]:
    """Load a WAV file and return its samples as base64 text with a MIME type.

    Args:
        filepath: Path of the ``.wav`` file to read.

    Returns:
        A ``(base64_text, mime_type)`` pair. ``base64_text`` is the base64
        encoding (as an ASCII string) of the raw sample bytes, and
        ``mime_type`` is ``audio/pcm;rate=<rate>`` using the sample rate
        stored in the file.
    """
    sample_rate, samples = wavfile.read(filepath)
    # Base64 output is pure ASCII, so decoding with "ascii" yields a plain str.
    encoded = base64.b64encode(samples.tobytes())
    return encoded.decode("ascii"), f"audio/pcm;rate={sample_rate}"
58+
59+
60+
def write_wavefile(filepath: str, audio_frames: list[bytes], rate: int) -> None:
    """Join raw audio byte frames and save them as a WAV file using scipy.

    Args:
        filepath: Destination path of the ``.wav`` file.
        audio_frames: Byte chunks that are concatenated in order and
            interpreted as 16-bit integer samples.
        rate: Sample rate written to the WAV file.
    """
    # Two bytes per sample, hence the int16 view over the joined frames.
    pcm_samples = np.frombuffer(b"".join(audio_frames), dtype=np.int16)
    wavfile.write(filepath, rate, pcm_samples)
    print(f"Model response saved to {filepath}")
72+
73+
74+
async def main() -> bool:
    """Stream a local WAV file to the Live API and save the spoken reply.

    Opens a live session that answers in audio and transcribes both input and
    output audio. One task sends the contents of
    ``hello_gemini_are_you_there.wav``; a second task prints the
    transcriptions it receives and collects the audio of the model turn,
    which is written to ``example_model_response.wav``.

    Returns:
        True once both the send and the receive task have finished.
    """
    print("Starting the code")

    async with client.aio.live.connect(
        model=MODEL,
        config=LiveConnectConfig(
            # Set Model responses to be in Audio
            response_modalities=[Modality.AUDIO],
            # To generate transcript for input audio
            input_audio_transcription=AudioTranscriptionConfig(),
            # To generate transcript for output audio
            output_audio_transcription=AudioTranscriptionConfig(),
        ),
    ) as session:

        async def send() -> None:
            # using local file as an example for live audio input
            wav_file_path = "hello_gemini_are_you_there.wav"
            base64_data, mime_type = read_wavefile(wav_file_path)
            # read_wavefile returns base64 text; the Blob takes the raw bytes.
            audio_bytes = base64.b64decode(base64_data)
            await session.send_realtime_input(media=Blob(data=audio_bytes, mime_type=mime_type))

        async def receive() -> None:
            audio_frames = []

            async for message in session.receive():
                # `server_content` is an optional field of a live server
                # message; skip messages without it instead of raising
                # AttributeError.
                server_content = message.server_content
                if server_content is None:
                    continue
                if server_content.input_transcription:
                    print(server_content.model_dump(mode="json", exclude_none=True))
                if server_content.output_transcription:
                    print(server_content.model_dump(mode="json", exclude_none=True))
                if server_content.model_turn:
                    for part in server_content.model_turn.parts or []:
                        # Parts without inline data carry no audio bytes.
                        if part.inline_data and part.inline_data.data:
                            audio_frames.append(part.inline_data.data)

            if audio_frames:
                write_wavefile(
                    "example_model_response.wav",
                    audio_frames,
                    OUTPUT_RATE,
                )

        send_task = asyncio.create_task(send())
        receive_task = asyncio.create_task(receive())
        await asyncio.gather(send_task, receive_task)
        # Example response:
        # Starting the code
        # {'input_transcription': {'text': 'Hello.'}}
        # {'output_transcription': {}}
        # {'output_transcription': {'text': 'Hi'}}
        # {'output_transcription': {'text': ' there. What can I do for you today?'}}
        # {'output_transcription': {'finished': True}}
        # Model response saved to example_model_response.wav

    # [END googlegenaisdk_live_conversation_audio_with_audio]
    return True


if __name__ == "__main__":
    asyncio.run(main())
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import asyncio
15+
16+
17+
async def generate_content(memory_corpus: str) -> list[str]:
    """Ask the Live API a text question grounded on a Vertex RAG corpus.

    Args:
        memory_corpus: RAG corpus resource used as the retrieval store.

    Returns:
        The non-empty text chunks received from the session, in arrival order.
    """
    # [START googlegenaisdk_live_ground_ragengine_with_txt]
    from google import genai
    from google.genai.types import (
        Content,
        LiveConnectConfig,
        Modality,
        Part,
        Retrieval,
        Tool,
        VertexRagStore,
        VertexRagStoreRagResource,
    )

    client = genai.Client()
    model_id = "gemini-2.0-flash-live-preview-04-09"

    # Use memory corpus if you want to store context.
    corpus_resource = VertexRagStoreRagResource(rag_corpus=memory_corpus)
    # `store_context=True` lets the Live API store context in the memory corpus.
    retrieval_tool = Tool(
        retrieval=Retrieval(
            vertex_rag_store=VertexRagStore(
                rag_resources=[corpus_resource],
                store_context=True,
            )
        )
    )
    config = LiveConnectConfig(
        response_modalities=[Modality.TEXT],
        tools=[retrieval_tool],
    )

    async with client.aio.live.connect(model=model_id, config=config) as session:
        text_input = "What are newest gemini models?"
        print("> ", text_input, "\n")

        user_turn = Content(role="user", parts=[Part(text=text_input)])
        await session.send_client_content(turns=user_turn)

        # Keep only the messages that carry non-empty text.
        response = [
            message.text async for message in session.receive() if message.text
        ]

        print("".join(response))
    # Example output:
    # > What are newest gemini models?
    # In December 2023, Google launched Gemini, their "most capable and general model". It's multimodal, meaning it understands and combines different types of information like text, code, audio, images, and video.
    # [END googlegenaisdk_live_ground_ragengine_with_txt]
    return response


if __name__ == "__main__":
    asyncio.run(generate_content("test_memory_corpus"))

genai/live/live_structured_ouput_with_txt.py renamed to genai/live/live_structured_output_with_txt.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class CalendarEvent(BaseModel):
2424

2525

2626
def generate_content() -> CalendarEvent:
27-
# [START googlegenaisdk_live_structured_ouput_with_txt]
27+
# [START googlegenaisdk_live_structured_output_with_txt]
2828
import os
2929

3030
import google.auth.transport.requests
@@ -78,8 +78,8 @@ def generate_content() -> CalendarEvent:
7878
# System message: Extract the event information.
7979
# User message: Alice and Bob are going to a science fair on Friday.
8080
# Output message: name='science fair' date='Friday' participants=['Alice', 'Bob']
81-
# [END googlegenaisdk_live_structured_ouput_with_txt]
82-
return True
81+
# [END googlegenaisdk_live_structured_output_with_txt]
82+
return response
8383

8484

8585
if __name__ == "__main__":

genai/live/live_txt_with_audio.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
17+
# Install helpers for converting files: pip install librosa soundfile
18+
19+
import asyncio
20+
21+
22+
async def generate_content() -> list[str]:
    """Send an audio clip to the Live API and collect the text answer.

    Downloads a sample WAV file, resamples it to 16 kHz, converts it to raw
    16-bit PCM, sends it as realtime input to a session configured for text
    responses and gathers the text of the messages received.

    Returns:
        The text chunks received from the session, in arrival order.

    Raises:
        requests.HTTPError: If downloading the sample audio fails.
    """
    # [START googlegenaisdk_live_txt_with_audio]
    import io

    import librosa
    import requests
    import soundfile as sf
    from google import genai
    from google.genai.types import Blob, LiveConnectConfig, Modality

    client = genai.Client()
    model = "gemini-2.0-flash-live-preview-04-09"
    config = LiveConnectConfig(response_modalities=[Modality.TEXT])

    async with client.aio.live.connect(model=model, config=config) as session:
        audio_url = (
            "https://storage.googleapis.com/generativeai-downloads/data/16000.wav"
        )
        # A timeout keeps a stalled download from blocking the sample forever.
        http_response = requests.get(audio_url, timeout=60)
        http_response.raise_for_status()
        y, sr = librosa.load(io.BytesIO(http_response.content), sr=16000)

        # Write the converted PCM into its own buffer. Reusing the buffer that
        # holds the downloaded WAV would leave bytes of the original container
        # in it, and those would be sent to the model as if they were samples.
        pcm_buffer = io.BytesIO()
        sf.write(pcm_buffer, y, sr, format="RAW", subtype="PCM_16")
        audio_bytes = pcm_buffer.getvalue()

        # If you've pre-converted to sample.pcm using ffmpeg, use this instead:
        # audio_bytes = Path("sample.pcm").read_bytes()

        print("> Answer to this audio url", audio_url, "\n")

        await session.send_realtime_input(
            media=Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
        )

        response = []

        async for message in session.receive():
            if message.text is not None:
                response.append(message.text)

        print("".join(response))
    # Example output:
    # > Answer to this audio url https://storage.googleapis.com/generativeai-downloads/data/16000.wav
    # Yes, I can hear you. How can I help you today?
    # [END googlegenaisdk_live_txt_with_audio]
    return response


if __name__ == "__main__":
    asyncio.run(generate_content())

genai/live/requirements-test.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ backoff==2.2.1
22
google-api-core==2.25.1
33
pytest==8.4.1
44
pytest-asyncio==1.1.0
5+
pytest-mock==3.14.0

genai/live/requirements.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,7 @@ websockets==15.0.1
44
numpy==1.26.4
55
soundfile==0.12.1
66
openai==1.99.1
7-
setuptools==80.9.0
7+
setuptools==80.9.0
8+
pyaudio==0.2.14
9+
librosa==0.11.0
10+
simpleaudio==1.0.0

0 commit comments

Comments
 (0)