From 22d4737b77f0a89e7a90e538679cc143e28b5494 Mon Sep 17 00:00:00 2001 From: Jack Herrington Date: Tue, 20 Jan 2026 07:06:00 -0800 Subject: [PATCH 1/5] Realtime chat basically working --- examples/ts-react-chat/.env.example | 10 +- examples/ts-react-chat/package.json | 1 + .../ts-react-chat/src/components/Header.tsx | 15 +- examples/ts-react-chat/src/routeTree.gen.ts | 48 +- .../src/routes/api.realtime-token.ts | 86 +++ .../ts-react-chat/src/routes/realtime.tsx | 501 +++++++++++++++ packages/typescript/ai-client/src/index.ts | 8 + .../ai-client/src/realtime-client.ts | 451 ++++++++++++++ .../ai-client/src/realtime-types.ts | 142 +++++ packages/typescript/ai-elevenlabs/README.md | 76 +++ .../typescript/ai-elevenlabs/package.json | 55 ++ .../typescript/ai-elevenlabs/src/index.ts | 16 + .../ai-elevenlabs/src/realtime/adapter.ts | 251 ++++++++ .../ai-elevenlabs/src/realtime/index.ts | 14 + .../ai-elevenlabs/src/realtime/token.ts | 103 +++ .../ai-elevenlabs/src/realtime/types.ts | 62 ++ .../typescript/ai-elevenlabs/tsconfig.json | 8 + .../typescript/ai-elevenlabs/vite.config.ts | 37 ++ packages/typescript/ai-openai/package.json | 2 + packages/typescript/ai-openai/src/index.ts | 19 + .../ai-openai/src/realtime/adapter.ts | 585 ++++++++++++++++++ .../ai-openai/src/realtime/index.ts | 16 + .../ai-openai/src/realtime/token.ts | 153 +++++ .../ai-openai/src/realtime/types.ts | 127 ++++ packages/typescript/ai-react/src/index.ts | 5 + .../typescript/ai-react/src/realtime-types.ts | 109 ++++ .../ai-react/src/use-realtime-chat.ts | 241 ++++++++ packages/typescript/ai/src/index.ts | 24 + packages/typescript/ai/src/realtime/index.ts | 38 ++ packages/typescript/ai/src/realtime/types.ts | 266 ++++++++ pnpm-lock.yaml | 115 ++++ 31 files changed, 3581 insertions(+), 3 deletions(-) create mode 100644 examples/ts-react-chat/src/routes/api.realtime-token.ts create mode 100644 examples/ts-react-chat/src/routes/realtime.tsx create mode 100644 
packages/typescript/ai-client/src/realtime-client.ts create mode 100644 packages/typescript/ai-client/src/realtime-types.ts create mode 100644 packages/typescript/ai-elevenlabs/README.md create mode 100644 packages/typescript/ai-elevenlabs/package.json create mode 100644 packages/typescript/ai-elevenlabs/src/index.ts create mode 100644 packages/typescript/ai-elevenlabs/src/realtime/adapter.ts create mode 100644 packages/typescript/ai-elevenlabs/src/realtime/index.ts create mode 100644 packages/typescript/ai-elevenlabs/src/realtime/token.ts create mode 100644 packages/typescript/ai-elevenlabs/src/realtime/types.ts create mode 100644 packages/typescript/ai-elevenlabs/tsconfig.json create mode 100644 packages/typescript/ai-elevenlabs/vite.config.ts create mode 100644 packages/typescript/ai-openai/src/realtime/adapter.ts create mode 100644 packages/typescript/ai-openai/src/realtime/index.ts create mode 100644 packages/typescript/ai-openai/src/realtime/token.ts create mode 100644 packages/typescript/ai-openai/src/realtime/types.ts create mode 100644 packages/typescript/ai-react/src/realtime-types.ts create mode 100644 packages/typescript/ai-react/src/use-realtime-chat.ts create mode 100644 packages/typescript/ai/src/realtime/index.ts create mode 100644 packages/typescript/ai/src/realtime/types.ts diff --git a/examples/ts-react-chat/.env.example b/examples/ts-react-chat/.env.example index 613cb664b..2bdb43f49 100644 --- a/examples/ts-react-chat/.env.example +++ b/examples/ts-react-chat/.env.example @@ -1,3 +1,11 @@ # OpenAI API Key # Get yours at: https://platform.openai.com/api-keys -OPENAI_API_KEY=sk-... \ No newline at end of file +OPENAI_API_KEY=sk-... + +# ElevenLabs API Key (for realtime voice) +# Get yours at: https://elevenlabs.io/app/settings/api-keys +ELEVENLABS_API_KEY=xi-... + +# ElevenLabs Agent ID (for realtime voice) +# Create an agent at: https://elevenlabs.io/app/conversational-ai +ELEVENLABS_AGENT_ID=... 
diff --git a/examples/ts-react-chat/package.json b/examples/ts-react-chat/package.json index f58f54bda..42935c510 100644 --- a/examples/ts-react-chat/package.json +++ b/examples/ts-react-chat/package.json @@ -17,6 +17,7 @@ "@tanstack/ai-grok": "workspace:*", "@tanstack/ai-ollama": "workspace:*", "@tanstack/ai-openai": "workspace:*", + "@tanstack/ai-elevenlabs": "workspace:*", "@tanstack/ai-openrouter": "workspace:*", "@tanstack/ai-react": "workspace:*", "@tanstack/ai-react-ui": "workspace:*", diff --git a/examples/ts-react-chat/src/components/Header.tsx b/examples/ts-react-chat/src/components/Header.tsx index 57745b7b0..6af310b71 100644 --- a/examples/ts-react-chat/src/components/Header.tsx +++ b/examples/ts-react-chat/src/components/Header.tsx @@ -1,7 +1,7 @@ import { Link } from '@tanstack/react-router' import { useState } from 'react' -import { Guitar, Home, Menu, X } from 'lucide-react' +import { Guitar, Home, Menu, Mic, X } from 'lucide-react' export default function Header() { const [isOpen, setIsOpen] = useState(false) @@ -71,6 +71,19 @@ export default function Header() { Guitar Demo + + setIsOpen(false)} + className="flex items-center gap-3 p-3 rounded-lg hover:bg-gray-800 transition-colors mb-2" + activeProps={{ + className: + 'flex items-center gap-3 p-3 rounded-lg bg-cyan-600 hover:bg-cyan-700 transition-colors mb-2', + }} + > + + Voice Chat (Realtime) + diff --git a/examples/ts-react-chat/src/routeTree.gen.ts b/examples/ts-react-chat/src/routeTree.gen.ts index 627a240cf..a8525c1c4 100644 --- a/examples/ts-react-chat/src/routeTree.gen.ts +++ b/examples/ts-react-chat/src/routeTree.gen.ts @@ -9,11 +9,18 @@ // Additionally, you should also exclude this file from your linter and/or formatter to prevent it from being checked or modified. 
import { Route as rootRouteImport } from './routes/__root' +import { Route as RealtimeRouteImport } from './routes/realtime' import { Route as IndexRouteImport } from './routes/index' import { Route as ApiTanchatRouteImport } from './routes/api.tanchat' +import { Route as ApiRealtimeTokenRouteImport } from './routes/api.realtime-token' import { Route as ExampleGuitarsIndexRouteImport } from './routes/example.guitars/index' import { Route as ExampleGuitarsGuitarIdRouteImport } from './routes/example.guitars/$guitarId' +const RealtimeRoute = RealtimeRouteImport.update({ + id: '/realtime', + path: '/realtime', + getParentRoute: () => rootRouteImport, +} as any) const IndexRoute = IndexRouteImport.update({ id: '/', path: '/', @@ -24,6 +31,11 @@ const ApiTanchatRoute = ApiTanchatRouteImport.update({ path: '/api/tanchat', getParentRoute: () => rootRouteImport, } as any) +const ApiRealtimeTokenRoute = ApiRealtimeTokenRouteImport.update({ + id: '/api/realtime-token', + path: '/api/realtime-token', + getParentRoute: () => rootRouteImport, +} as any) const ExampleGuitarsIndexRoute = ExampleGuitarsIndexRouteImport.update({ id: '/example/guitars/', path: '/example/guitars/', @@ -37,12 +49,16 @@ const ExampleGuitarsGuitarIdRoute = ExampleGuitarsGuitarIdRouteImport.update({ export interface FileRoutesByFullPath { '/': typeof IndexRoute + '/realtime': typeof RealtimeRoute + '/api/realtime-token': typeof ApiRealtimeTokenRoute '/api/tanchat': typeof ApiTanchatRoute '/example/guitars/$guitarId': typeof ExampleGuitarsGuitarIdRoute '/example/guitars': typeof ExampleGuitarsIndexRoute } export interface FileRoutesByTo { '/': typeof IndexRoute + '/realtime': typeof RealtimeRoute + '/api/realtime-token': typeof ApiRealtimeTokenRoute '/api/tanchat': typeof ApiTanchatRoute '/example/guitars/$guitarId': typeof ExampleGuitarsGuitarIdRoute '/example/guitars': typeof ExampleGuitarsIndexRoute @@ -50,6 +66,8 @@ export interface FileRoutesByTo { export interface FileRoutesById { __root__: typeof 
rootRouteImport '/': typeof IndexRoute + '/realtime': typeof RealtimeRoute + '/api/realtime-token': typeof ApiRealtimeTokenRoute '/api/tanchat': typeof ApiTanchatRoute '/example/guitars/$guitarId': typeof ExampleGuitarsGuitarIdRoute '/example/guitars/': typeof ExampleGuitarsIndexRoute @@ -58,14 +76,24 @@ export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath fullPaths: | '/' + | '/realtime' + | '/api/realtime-token' | '/api/tanchat' | '/example/guitars/$guitarId' | '/example/guitars' fileRoutesByTo: FileRoutesByTo - to: '/' | '/api/tanchat' | '/example/guitars/$guitarId' | '/example/guitars' + to: + | '/' + | '/realtime' + | '/api/realtime-token' + | '/api/tanchat' + | '/example/guitars/$guitarId' + | '/example/guitars' id: | '__root__' | '/' + | '/realtime' + | '/api/realtime-token' | '/api/tanchat' | '/example/guitars/$guitarId' | '/example/guitars/' @@ -73,6 +101,8 @@ export interface FileRouteTypes { } export interface RootRouteChildren { IndexRoute: typeof IndexRoute + RealtimeRoute: typeof RealtimeRoute + ApiRealtimeTokenRoute: typeof ApiRealtimeTokenRoute ApiTanchatRoute: typeof ApiTanchatRoute ExampleGuitarsGuitarIdRoute: typeof ExampleGuitarsGuitarIdRoute ExampleGuitarsIndexRoute: typeof ExampleGuitarsIndexRoute @@ -80,6 +110,13 @@ export interface RootRouteChildren { declare module '@tanstack/react-router' { interface FileRoutesByPath { + '/realtime': { + id: '/realtime' + path: '/realtime' + fullPath: '/realtime' + preLoaderRoute: typeof RealtimeRouteImport + parentRoute: typeof rootRouteImport + } '/': { id: '/' path: '/' @@ -94,6 +131,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof ApiTanchatRouteImport parentRoute: typeof rootRouteImport } + '/api/realtime-token': { + id: '/api/realtime-token' + path: '/api/realtime-token' + fullPath: '/api/realtime-token' + preLoaderRoute: typeof ApiRealtimeTokenRouteImport + parentRoute: typeof rootRouteImport + } '/example/guitars/': { id: '/example/guitars/' path: 
'/example/guitars' @@ -113,6 +157,8 @@ declare module '@tanstack/react-router' { const rootRouteChildren: RootRouteChildren = { IndexRoute: IndexRoute, + RealtimeRoute: RealtimeRoute, + ApiRealtimeTokenRoute: ApiRealtimeTokenRoute, ApiTanchatRoute: ApiTanchatRoute, ExampleGuitarsGuitarIdRoute: ExampleGuitarsGuitarIdRoute, ExampleGuitarsIndexRoute: ExampleGuitarsIndexRoute, diff --git a/examples/ts-react-chat/src/routes/api.realtime-token.ts b/examples/ts-react-chat/src/routes/api.realtime-token.ts new file mode 100644 index 000000000..0f7a46bda --- /dev/null +++ b/examples/ts-react-chat/src/routes/api.realtime-token.ts @@ -0,0 +1,86 @@ +import { createFileRoute } from '@tanstack/react-router' +import { realtimeToken } from '@tanstack/ai' +import { openaiRealtimeToken } from '@tanstack/ai-openai' +import { elevenlabsRealtimeToken } from '@tanstack/ai-elevenlabs' + +type Provider = 'openai' | 'elevenlabs' + +export const Route = createFileRoute('/api/realtime-token')({ + server: { + handlers: { + POST: async ({ request }) => { + try { + const body = await request.json() + const provider: Provider = body.provider || 'openai' + + let token + + if (provider === 'openai') { + token = await realtimeToken({ + adapter: openaiRealtimeToken({ + model: 'gpt-4o-realtime-preview', + voice: 'alloy', + instructions: `You are a helpful, friendly assistant. + +Keep your responses concise and conversational since this is a voice interface. +Be natural and engaging in your responses.`, + turnDetection: { + type: 'server_vad', + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 500, + }, + inputAudioTranscription: { + model: 'whisper-1', + }, + }), + }) + } else if (provider === 'elevenlabs') { + const agentId = body.agentId || process.env.ELEVENLABS_AGENT_ID + + if (!agentId) { + return new Response( + JSON.stringify({ + error: 'ElevenLabs agent ID is required. 
Set ELEVENLABS_AGENT_ID or pass agentId in request body.', + }), + { + status: 400, + headers: { 'Content-Type': 'application/json' }, + }, + ) + } + + token = await realtimeToken({ + adapter: elevenlabsRealtimeToken({ + agentId, + }), + }) + } else { + return new Response( + JSON.stringify({ error: `Unknown provider: ${provider}` }), + { + status: 400, + headers: { 'Content-Type': 'application/json' }, + }, + ) + } + + return new Response(JSON.stringify(token), { + headers: { 'Content-Type': 'application/json' }, + }) + } catch (error: any) { + console.error('[Realtime Token API] Error:', error) + return new Response( + JSON.stringify({ + error: error.message || 'Failed to generate realtime token', + }), + { + status: 500, + headers: { 'Content-Type': 'application/json' }, + }, + ) + } + }, + }, + }, +}) diff --git a/examples/ts-react-chat/src/routes/realtime.tsx b/examples/ts-react-chat/src/routes/realtime.tsx new file mode 100644 index 000000000..fcc954cb1 --- /dev/null +++ b/examples/ts-react-chat/src/routes/realtime.tsx @@ -0,0 +1,501 @@ +import { useEffect, useRef, useState } from 'react' +import { createFileRoute } from '@tanstack/react-router' +import { useRealtimeChat } from '@tanstack/ai-react' +import { openaiRealtime } from '@tanstack/ai-openai' +import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' +import { Mic, MicOff, Phone, PhoneOff, Volume2 } from 'lucide-react' + +type Provider = 'openai' | 'elevenlabs' + +const PROVIDER_OPTIONS: Array<{ value: Provider; label: string }> = [ + { value: 'openai', label: 'OpenAI Realtime' }, + { value: 'elevenlabs', label: 'ElevenLabs' }, +] + +// Sparkline component to visualize audio waveform +function AudioSparkline({ + getData, + color, + label, +}: { + getData: () => Uint8Array + color: string + label: string +}) { + const canvasRef = useRef(null) + const animationRef = useRef(null) + + useEffect(() => { + const canvas = canvasRef.current + if (!canvas) return + + const ctx = canvas.getContext('2d') + 
if (!ctx) return + + function draw() { + const data = getData() + const width = canvas!.width + const height = canvas!.height + + // Clear canvas + ctx!.fillStyle = '#1f2937' // gray-800 + ctx!.fillRect(0, 0, width, height) + + // Draw waveform + ctx!.strokeStyle = color + ctx!.lineWidth = 1 + ctx!.beginPath() + + // Sample the data to fit the canvas width + const step = Math.max(1, Math.floor(data.length / width)) + + for (let i = 0; i < width; i++) { + const dataIndex = Math.min(i * step, data.length - 1) + const value = data[dataIndex] ?? 128 + // Convert 0-255 to canvas height (128 is center/silence) + const y = height - ((value / 255) * height) + + if (i === 0) { + ctx!.moveTo(i, y) + } else { + ctx!.lineTo(i, y) + } + } + + ctx!.stroke() + + // Draw center line (silence level) + ctx!.strokeStyle = '#4b5563' // gray-600 + ctx!.setLineDash([2, 2]) + ctx!.beginPath() + ctx!.moveTo(0, height / 2) + ctx!.lineTo(width, height / 2) + ctx!.stroke() + ctx!.setLineDash([]) + + animationRef.current = requestAnimationFrame(draw) + } + + draw() + + return () => { + if (animationRef.current) { + cancelAnimationFrame(animationRef.current) + } + } + }, [getData, color]) + + return ( +
+ {label} + +
+ ) +} + +// Debug component to show raw audio data stats +function AudioDebug({ + getData, + label +}: { + getData: () => Uint8Array + label: string +}) { + const [stats, setStats] = useState({ length: 0, min: 0, max: 0, allSame: true, sample: '[]' }) + + useEffect(() => { + function update() { + const data = getData() + const min = Math.min(...data) + const max = Math.max(...data) + const allSame = data.every(v => v === data[0]) + // Get a few samples from different parts of the array + const samples = data.length > 0 ? [ + data[0], + data[Math.floor(data.length / 4)], + data[Math.floor(data.length / 2)], + data[Math.floor(data.length * 3 / 4)], + data[data.length - 1] + ] : [] + setStats({ + length: data.length, + min, + max, + allSame, + sample: `[${samples.join(', ')}]` + }) + requestAnimationFrame(update) + } + const id = requestAnimationFrame(update) + return () => cancelAnimationFrame(id) + }, [getData]) + + return ( +
+ {label}: len={stats.length}, min={stats.min}, max={stats.max}, + {stats.allSame ? ALL SAME! : varying} + {stats.sample} +
+ ) +} + +function RealtimePage() { + const [provider, setProvider] = useState('openai') + const [agentId, setAgentId] = useState('') + const [showDebug, setShowDebug] = useState(true) + const messagesEndRef = useRef(null) + + // Get the appropriate adapter based on provider + const adapter = provider === 'openai' ? openaiRealtime() : elevenlabsRealtime() + + const { + status, + mode, + messages, + pendingUserTranscript, + pendingAssistantTranscript, + error, + connect, + disconnect, + interrupt, + inputLevel, + outputLevel, + sendText, + getInputTimeDomainData, + getOutputTimeDomainData, + } = useRealtimeChat({ + getToken: async () => { + const body: Record = { provider } + if (provider === 'elevenlabs' && agentId) { + body.agentId = agentId + } + const response = await fetch('/api/realtime-token', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }) + if (!response.ok) { + const error = await response.json() + throw new Error(error.error || 'Failed to get token') + } + return response.json() + }, + adapter, + onError: (err) => { + console.error('Realtime error:', err) + }, + }) + + // Auto-scroll to bottom when messages change + useEffect(() => { + messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' }) + }, [messages, pendingUserTranscript, pendingAssistantTranscript]) + + // Get status color + const getStatusColor = () => { + switch (status) { + case 'connected': + return 'bg-green-500' + case 'connecting': + case 'reconnecting': + return 'bg-yellow-500' + case 'error': + return 'bg-red-500' + default: + return 'bg-gray-500' + } + } + + // Get mode icon + const getModeIndicator = () => { + switch (mode) { + case 'listening': + return ( +
+ + Listening... +
+ ) + case 'thinking': + return ( +
+
+ Thinking... +
+ ) + case 'speaking': + return ( +
+ + Speaking... +
+ ) + default: + return ( +
+ + Idle +
+ ) + } + } + + return ( +
+
+ {/* Header */} +
+
+
+ {/* Provider selector */} +
+ + +
+ + {/* ElevenLabs Agent ID (conditional) */} + {provider === 'elevenlabs' && ( +
+ + setAgentId(e.target.value)} + placeholder="Your ElevenLabs Agent ID" + disabled={status !== 'idle'} + className="rounded-lg border border-orange-500/20 bg-gray-900 px-3 py-2 text-sm text-white focus:outline-none focus:ring-2 focus:ring-orange-500/50 disabled:opacity-50 w-64" + /> +
+ )} +
+ + {/* Status */} +
+
+
+ + {status} + +
+ {getModeIndicator()} +
+
+
+ + {/* Messages area */} +
+ {messages.length === 0 && status === 'idle' && ( +
+ +

Voice Chat

+

+ Click "Start Conversation" to begin talking with the AI +

+
+ )} + + {messages.map((message) => ( +
+
+ {message.role === 'assistant' ? ( +
+ AI +
+ ) : ( +
+ U +
+ )} +
+ {message.parts.map((part, idx) => { + if (part.type === 'audio') { + return ( +

+ {part.transcript} +

+ ) + } + if (part.type === 'text') { + return ( +

+ {part.content} +

+ ) + } + return null + })} + {message.interrupted && ( + + (interrupted) + + )} +
+
+
+ ))} + + {/* Pending transcripts */} + {pendingUserTranscript && ( +
+
+
+ U +
+

{pendingUserTranscript}...

+
+
+ )} + + {pendingAssistantTranscript && ( +
+
+
+ AI +
+

+ {pendingAssistantTranscript}... +

+
+
+ )} + +
+
+ + {/* Error display */} + {error && ( +
+ Error: {error.message} +
+ )} + + {/* Audio visualization & controls */} +
+ {/* Volume meters and waveforms */} + {status === 'connected' && ( +
+ {/* Input (Microphone) */} +
+ +
+
+
+ + {Math.round(inputLevel * 100)}% + + +
+ {/* Output (Speaker) */} +
+ +
+
+
+ + {Math.round(outputLevel * 100)}% + + +
+ + {/* Debug info */} + {showDebug && ( +
+
+ Audio Debug + +
+ + +
+ inputLevel: {inputLevel.toFixed(4)}, outputLevel: {outputLevel.toFixed(4)} +
+
+ )} +
+ )} + + {/* Controls */} +
+ {status === 'idle' ? ( + + ) : ( + <> + {mode === 'speaking' && ( + + )} + + + )} +
+
+
+
+ ) +} + +export const Route = createFileRoute('/realtime')({ + component: RealtimePage, +}) diff --git a/packages/typescript/ai-client/src/index.ts b/packages/typescript/ai-client/src/index.ts index 5bc664c0a..02a9ce3ff 100644 --- a/packages/typescript/ai-client/src/index.ts +++ b/packages/typescript/ai-client/src/index.ts @@ -1,4 +1,5 @@ export { ChatClient } from './chat-client' +export { RealtimeClient } from './realtime-client' export type { // Core message types (re-exported from @tanstack/ai via types.ts) UIMessage, @@ -19,6 +20,13 @@ export type { ExtractToolOutput, } from './tool-types' export type { AnyClientTool } from '@tanstack/ai' +export type { + RealtimeAdapter, + RealtimeConnection, + RealtimeClientOptions, + RealtimeClientState, + RealtimeStateChangeCallback, +} from './realtime-types' export { fetchServerSentEvents, fetchHttpStream, diff --git a/packages/typescript/ai-client/src/realtime-client.ts b/packages/typescript/ai-client/src/realtime-client.ts new file mode 100644 index 000000000..0672fe23e --- /dev/null +++ b/packages/typescript/ai-client/src/realtime-client.ts @@ -0,0 +1,451 @@ +import type { + AudioVisualization, + RealtimeMessage, + RealtimeMode, + RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' +import type { AnyClientTool } from '@tanstack/ai' +import type { + RealtimeClientOptions, + RealtimeClientState, + RealtimeConnection, + RealtimeStateChangeCallback, +} from './realtime-types' + +// Token refresh buffer - refresh 1 minute before expiry +const TOKEN_REFRESH_BUFFER_MS = 60_000 + +/** + * Client for managing realtime voice conversations. + * + * Handles connection lifecycle, audio I/O, message state, + * and tool execution for realtime voice-to-voice AI interactions. 
+ * + * @example + * ```typescript + * import { RealtimeClient } from '@tanstack/ai-client' + * import { openaiRealtime } from '@tanstack/ai-openai' + * + * const client = new RealtimeClient({ + * getToken: () => fetch('/api/realtime-token').then(r => r.json()), + * adapter: openaiRealtime(), + * tools: [myTool.client(handler)], + * onMessage: (msg) => console.log('Message:', msg), + * }) + * + * await client.connect() + * ``` + */ +let clientIdCounter = 0 + +export class RealtimeClient { + private instanceId: number + private options: RealtimeClientOptions + private connection: RealtimeConnection | null = null + private token: RealtimeToken | null = null + private tokenRefreshTimeout: ReturnType | null = null + private clientTools: Map + private stateChangeCallbacks: Set = new Set() + private unsubscribers: Array<() => void> = [] + + private state: RealtimeClientState = { + status: 'idle', + mode: 'idle', + messages: [], + pendingUserTranscript: null, + pendingAssistantTranscript: null, + error: null, + } + + constructor(options: RealtimeClientOptions) { + this.instanceId = ++clientIdCounter + console.log(`[RealtimeClient #${this.instanceId}] Created`) + + this.options = { + autoPlayback: true, + autoCapture: true, + vadMode: 'server', + ...options, + } + + // Build client tools map + this.clientTools = new Map() + if (options.tools) { + for (const tool of options.tools) { + this.clientTools.set(tool.name, tool) + } + } + } + + // ============================================================================ + // Connection Lifecycle + // ============================================================================ + + /** + * Connect to the realtime session. + * Fetches a token and establishes the connection. 
+ */ + async connect(): Promise { + if (this.state.status === 'connected') { + return + } + + this.updateState({ status: 'connecting', error: null }) + + try { + // Fetch token from server + this.token = await this.options.getToken() + + // Schedule token refresh + this.scheduleTokenRefresh() + + // Connect via adapter + this.connection = await this.options.adapter.connect(this.token) + console.log(`[RealtimeClient #${this.instanceId}] Connection established:`, !!this.connection) + + // Subscribe to connection events + this.subscribeToConnectionEvents() + + // Start audio capture if configured + if (this.options.autoCapture) { + await this.connection.startAudioCapture() + } + + this.updateState({ status: 'connected', mode: 'listening' }) + this.options.onConnect?.() + } catch (error) { + const err = error instanceof Error ? error : new Error(String(error)) + this.updateState({ status: 'error', error: err }) + this.options.onError?.(err) + throw err + } + } + + /** + * Disconnect from the realtime session. + */ + async disconnect(): Promise { + if (this.tokenRefreshTimeout) { + clearTimeout(this.tokenRefreshTimeout) + this.tokenRefreshTimeout = null + } + + // Unsubscribe from all events + for (const unsub of this.unsubscribers) { + unsub() + } + this.unsubscribers = [] + + if (this.connection) { + await this.connection.disconnect() + this.connection = null + } + + this.token = null + this.updateState({ + status: 'idle', + mode: 'idle', + pendingUserTranscript: null, + pendingAssistantTranscript: null, + }) + this.options.onDisconnect?.() + } + + // ============================================================================ + // Voice Control + // ============================================================================ + + /** + * Start listening for voice input. + * Only needed when vadMode is 'manual'. 
+ */ + startListening(): void { + if (!this.connection || this.state.status !== 'connected') { + return + } + this.connection.startAudioCapture() + this.updateState({ mode: 'listening' }) + } + + /** + * Stop listening for voice input. + * Only needed when vadMode is 'manual'. + */ + stopListening(): void { + if (!this.connection) { + return + } + this.connection.stopAudioCapture() + this.updateState({ mode: 'idle' }) + } + + /** + * Interrupt the current assistant response. + */ + interrupt(): void { + if (!this.connection) { + return + } + this.connection.interrupt() + } + + // ============================================================================ + // Text Input + // ============================================================================ + + /** + * Send a text message instead of voice. + */ + sendText(text: string): void { + if (!this.connection || this.state.status !== 'connected') { + return + } + + // Add user message + const userMessage: RealtimeMessage = { + id: this.generateId(), + role: 'user', + timestamp: Date.now(), + parts: [{ type: 'text', content: text }], + } + this.addMessage(userMessage) + + // Send to provider + this.connection.sendText(text) + } + + // ============================================================================ + // State Access + // ============================================================================ + + /** Get current connection status */ + get status(): RealtimeStatus { + return this.state.status + } + + /** Get current mode */ + get mode(): RealtimeMode { + return this.state.mode + } + + /** Get conversation messages */ + get messages(): Array { + return this.state.messages + } + + /** Get current error, if any */ + get error(): Error | null { + return this.state.error + } + + /** Get pending user transcript (while user is speaking) */ + get pendingUserTranscript(): string | null { + return this.state.pendingUserTranscript + } + + /** Get pending assistant transcript (while assistant is speaking) */ + 
get pendingAssistantTranscript(): string | null { + return this.state.pendingAssistantTranscript + } + + /** Get audio visualization data */ + get audio(): AudioVisualization | null { + console.log(`[RealtimeClient #${this.instanceId}] audio getter, connection:`, !!this.connection) + return this.connection?.getAudioVisualization() ?? null + } + + // ============================================================================ + // State Subscription + // ============================================================================ + + /** + * Subscribe to state changes. + * @returns Unsubscribe function + */ + onStateChange(callback: RealtimeStateChangeCallback): () => void { + this.stateChangeCallbacks.add(callback) + return () => { + this.stateChangeCallbacks.delete(callback) + } + } + + // ============================================================================ + // Cleanup + // ============================================================================ + + /** + * Clean up resources. + * Call this when disposing of the client. 
+ */ + destroy(): void { + this.disconnect() + this.stateChangeCallbacks.clear() + } + + // ============================================================================ + // Private Methods + // ============================================================================ + + private updateState(updates: Partial): void { + this.state = { ...this.state, ...updates } + + // Notify callbacks + for (const callback of this.stateChangeCallbacks) { + callback(this.state) + } + + // Notify specific callbacks + if ('status' in updates && updates.status !== undefined) { + this.options.onStatusChange?.(updates.status) + } + if ('mode' in updates && updates.mode !== undefined) { + this.options.onModeChange?.(updates.mode) + } + } + + private addMessage(message: RealtimeMessage): void { + this.updateState({ + messages: [...this.state.messages, message], + }) + this.options.onMessage?.(message) + } + + private scheduleTokenRefresh(): void { + if (!this.token) return + + const timeUntilExpiry = this.token.expiresAt - Date.now() + const refreshIn = Math.max(0, timeUntilExpiry - TOKEN_REFRESH_BUFFER_MS) + + this.tokenRefreshTimeout = setTimeout(() => { + this.refreshToken() + }, refreshIn) + } + + private async refreshToken(): Promise { + try { + this.token = await this.options.getToken() + this.scheduleTokenRefresh() + // Note: Some providers may require reconnection with new token + // This is handled by the adapter implementation + } catch (error) { + const err = error instanceof Error ? 
error : new Error(String(error)) + this.updateState({ error: err }) + this.options.onError?.(err) + } + } + + private subscribeToConnectionEvents(): void { + if (!this.connection) return + + // Status changes + this.unsubscribers.push( + this.connection.on('status_change', ({ status }) => { + this.updateState({ status }) + }), + ) + + // Mode changes + this.unsubscribers.push( + this.connection.on('mode_change', ({ mode }) => { + this.updateState({ mode }) + }), + ) + + // Transcripts (streaming) + // User transcripts are added as messages when final (no separate message_complete for user input) + // Assistant transcripts are streamed, final message comes via message_complete + this.unsubscribers.push( + this.connection.on('transcript', ({ role, transcript, isFinal }) => { + if (role === 'user') { + this.updateState({ + pendingUserTranscript: isFinal ? null : transcript, + }) + // Add user message when transcript is finalized + if (isFinal && transcript) { + this.addMessage({ + id: this.generateId(), + role: 'user', + timestamp: Date.now(), + parts: [{ type: 'audio', transcript, durationMs: 0 }], + }) + } + } else { + // Assistant transcripts - just update pending, message_complete handles final + this.updateState({ + pendingAssistantTranscript: isFinal ? null : transcript, + }) + } + }), + ) + + // Tool calls + this.unsubscribers.push( + this.connection.on('tool_call', async ({ toolCallId, toolName, input }) => { + const tool = this.clientTools.get(toolName) + if (tool?.execute) { + try { + const output = await tool.execute(input) + this.connection?.sendToolResult( + toolCallId, + typeof output === 'string' ? output : JSON.stringify(output), + ) + } catch (error) { + const errMsg = + error instanceof Error ? 
error.message : String(error) + this.connection?.sendToolResult( + toolCallId, + JSON.stringify({ error: errMsg }), + ) + } + } + }), + ) + + // Message complete + this.unsubscribers.push( + this.connection.on('message_complete', ({ message }) => { + // Replace pending message with final version if needed + const existingIndex = this.state.messages.findIndex( + (m) => m.id === message.id, + ) + if (existingIndex >= 0) { + const newMessages = [...this.state.messages] + newMessages[existingIndex] = message + this.updateState({ messages: newMessages }) + } else { + this.addMessage(message) + } + }), + ) + + // Interruption + this.unsubscribers.push( + this.connection.on('interrupted', ({ messageId }) => { + if (messageId) { + const newMessages = this.state.messages.map((m) => + m.id === messageId ? { ...m, interrupted: true } : m, + ) + this.updateState({ messages: newMessages }) + } + this.updateState({ + mode: 'listening', + pendingAssistantTranscript: null, + }) + this.options.onInterrupted?.() + }), + ) + + // Errors + this.unsubscribers.push( + this.connection.on('error', ({ error }) => { + this.updateState({ error }) + this.options.onError?.(error) + }), + ) + } + + private generateId(): string { + return `msg-${Date.now()}-${Math.random().toString(36).substring(7)}` + } +} diff --git a/packages/typescript/ai-client/src/realtime-types.ts b/packages/typescript/ai-client/src/realtime-types.ts new file mode 100644 index 000000000..3393ee92c --- /dev/null +++ b/packages/typescript/ai-client/src/realtime-types.ts @@ -0,0 +1,142 @@ +import type { + AudioVisualization, + RealtimeEvent, + RealtimeEventHandler, + RealtimeMessage, + RealtimeMode, + RealtimeSessionConfig, + RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' +import type { AnyClientTool } from '@tanstack/ai' + +// ============================================================================ +// Adapter Interface +// ============================================================================ + +/** + * 
Adapter interface for connecting to realtime providers. + * Each provider (OpenAI, ElevenLabs, etc.) implements this interface. + */ +export interface RealtimeAdapter { + /** Provider identifier */ + provider: string + + /** + * Create a connection using the provided token + * @param token - The ephemeral token from the server + * @returns A connection instance + */ + connect(token: RealtimeToken): Promise +} + +/** + * Connection interface representing an active realtime session. + * Handles audio I/O, events, and session management. + */ +export interface RealtimeConnection { + // Lifecycle + /** Disconnect from the realtime session */ + disconnect(): Promise + + // Audio I/O + /** Start capturing audio from the microphone */ + startAudioCapture(): Promise + /** Stop capturing audio */ + stopAudioCapture(): void + + // Text input + /** Send a text message (fallback for when voice isn't available) */ + sendText(text: string): void + + // Tool results + /** Send a tool execution result back to the provider */ + sendToolResult(callId: string, result: string): void + + // Session management + /** Update session configuration */ + updateSession(config: Partial): void + /** Interrupt the current response */ + interrupt(): void + + // Events + /** Subscribe to connection events */ + on( + event: E, + handler: RealtimeEventHandler, + ): () => void + + // Audio visualization + /** Get audio visualization data */ + getAudioVisualization(): AudioVisualization +} + +// ============================================================================ +// Client Options +// ============================================================================ + +/** + * Options for the RealtimeClient + */ +export interface RealtimeClientOptions { + /** + * Function to fetch a realtime token from the server. + * Called on connect and when token needs refresh. 
+ */ + getToken: () => Promise + + /** + * The realtime adapter to use (e.g., openaiRealtime()) + */ + adapter: RealtimeAdapter + + /** + * Client-side tools with execution logic + */ + tools?: ReadonlyArray + + /** + * Auto-play assistant audio (default: true) + */ + autoPlayback?: boolean + + /** + * Request microphone access on connect (default: true) + */ + autoCapture?: boolean + + /** + * Voice activity detection mode (default: 'server') + */ + vadMode?: 'server' | 'semantic' | 'manual' + + // Callbacks + onStatusChange?: (status: RealtimeStatus) => void + onModeChange?: (mode: RealtimeMode) => void + onMessage?: (message: RealtimeMessage) => void + onError?: (error: Error) => void + onConnect?: () => void + onDisconnect?: () => void + onInterrupted?: () => void +} + +// ============================================================================ +// Client State +// ============================================================================ + +/** + * Internal state of the RealtimeClient + */ +export interface RealtimeClientState { + status: RealtimeStatus + mode: RealtimeMode + messages: Array + pendingUserTranscript: string | null + pendingAssistantTranscript: string | null + error: Error | null +} + +/** + * Callback type for state changes + */ +export type RealtimeStateChangeCallback = (state: RealtimeClientState) => void diff --git a/packages/typescript/ai-elevenlabs/README.md b/packages/typescript/ai-elevenlabs/README.md new file mode 100644 index 000000000..6f85bb8ed --- /dev/null +++ b/packages/typescript/ai-elevenlabs/README.md @@ -0,0 +1,76 @@ +# @tanstack/ai-elevenlabs + +ElevenLabs adapter for TanStack AI realtime voice conversations. 
+ +## Installation + +```bash +npm install @tanstack/ai-elevenlabs @tanstack/ai @tanstack/ai-client +``` + +## Usage + +### Server-Side Token Generation + +```typescript +import { realtimeToken } from '@tanstack/ai' +import { elevenlabsRealtimeToken } from '@tanstack/ai-elevenlabs' + +// Generate a signed URL for client use +const token = await realtimeToken({ + adapter: elevenlabsRealtimeToken({ + agentId: 'your-agent-id', + }), +}) +``` + +### Client-Side Usage + +```typescript +import { RealtimeClient } from '@tanstack/ai-client' +import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' + +const client = new RealtimeClient({ + getToken: () => fetch('/api/realtime-token').then(r => r.json()), + adapter: elevenlabsRealtime(), +}) + +await client.connect() +``` + +### With React + +```typescript +import { useRealtimeChat } from '@tanstack/ai-react' +import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' + +function VoiceChat() { + const { status, mode, messages, connect, disconnect } = useRealtimeChat({ + getToken: () => fetch('/api/realtime-token').then(r => r.json()), + adapter: elevenlabsRealtime(), + }) + + return ( +
+

Status: {status}

+

Mode: {mode}

+ +
+ ) +} +``` + +## Environment Variables + +Set `ELEVENLABS_API_KEY` in your environment for server-side token generation. + +## Requirements + +- ElevenLabs account with Conversational AI agent configured +- Agent ID from ElevenLabs dashboard + +## License + +MIT diff --git a/packages/typescript/ai-elevenlabs/package.json b/packages/typescript/ai-elevenlabs/package.json new file mode 100644 index 000000000..0edafe92a --- /dev/null +++ b/packages/typescript/ai-elevenlabs/package.json @@ -0,0 +1,55 @@ +{ + "name": "@tanstack/ai-elevenlabs", + "version": "0.0.1", + "description": "ElevenLabs adapter for TanStack AI realtime voice", + "author": "", + "license": "MIT", + "repository": { + "type": "git", + "url": "git+https://github.com/TanStack/ai.git", + "directory": "packages/typescript/ai-elevenlabs" + }, + "keywords": [ + "ai", + "elevenlabs", + "voice", + "realtime", + "tanstack", + "adapter" + ], + "type": "module", + "module": "./dist/esm/index.js", + "types": "./dist/esm/index.d.ts", + "exports": { + ".": { + "types": "./dist/esm/index.d.ts", + "import": "./dist/esm/index.js" + } + }, + "files": [ + "dist", + "src" + ], + "scripts": { + "build": "vite build", + "clean": "premove ./build ./dist", + "lint:fix": "eslint ./src --fix", + "test:build": "publint --strict", + "test:eslint": "eslint ./src", + "test:lib": "vitest", + "test:lib:dev": "pnpm test:lib --watch", + "test:types": "tsc" + }, + "dependencies": { + "@11labs/client": "^0.2.0" + }, + "peerDependencies": { + "@tanstack/ai": "workspace:^", + "@tanstack/ai-client": "workspace:^" + }, + "devDependencies": { + "@tanstack/ai": "workspace:*", + "@tanstack/ai-client": "workspace:*", + "@vitest/coverage-v8": "4.0.14" + } +} diff --git a/packages/typescript/ai-elevenlabs/src/index.ts b/packages/typescript/ai-elevenlabs/src/index.ts new file mode 100644 index 000000000..14702a1da --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/index.ts @@ -0,0 +1,16 @@ +// 
============================================================================ +// ElevenLabs Realtime (Voice) Adapters +// ============================================================================ + +export { + elevenlabsRealtimeToken, + elevenlabsRealtime, +} from './realtime/index' + +export type { + ElevenLabsRealtimeTokenOptions, + ElevenLabsRealtimeOptions, + ElevenLabsConversationMode, + ElevenLabsVADConfig, + ElevenLabsClientTool, +} from './realtime/index' diff --git a/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts new file mode 100644 index 000000000..cb193485c --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts @@ -0,0 +1,251 @@ +import { Conversation } from '@11labs/client' +import type { + AudioVisualization, + RealtimeEvent, + RealtimeEventHandler, + RealtimeMessage, + RealtimeMode, + RealtimeSessionConfig, + RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' +import type { RealtimeAdapter, RealtimeConnection } from '@tanstack/ai-client' +import type { ElevenLabsRealtimeOptions } from './types' + +/** + * Creates an ElevenLabs realtime adapter for client-side use. + * + * Wraps the @11labs/client SDK for voice conversations. 
+ * + * @param options - Optional configuration + * @returns A RealtimeAdapter for use with RealtimeClient + * + * @example + * ```typescript + * import { RealtimeClient } from '@tanstack/ai-client' + * import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' + * + * const client = new RealtimeClient({ + * getToken: () => fetch('/api/realtime-token').then(r => r.json()), + * adapter: elevenlabsRealtime(), + * }) + * ``` + */ +export function elevenlabsRealtime( + options: ElevenLabsRealtimeOptions = {}, +): RealtimeAdapter { + return { + provider: 'elevenlabs', + + async connect(token: RealtimeToken): Promise { + return createElevenLabsConnection(token, options) + }, + } +} + +/** + * Creates a connection to ElevenLabs conversational AI + */ +async function createElevenLabsConnection( + token: RealtimeToken, + _options: ElevenLabsRealtimeOptions, +): Promise { + const eventHandlers = new Map>>() + let conversation: Awaited> | null = null + let messageIdCounter = 0 + + // Empty arrays for when visualization isn't available + const emptyFrequencyData = new Uint8Array(128) + const emptyTimeDomainData = new Uint8Array(128).fill(128) + + // Helper to emit events + function emit( + event: E, + payload: Parameters>[0], + ) { + const handlers = eventHandlers.get(event) + if (handlers) { + for (const handler of handlers) { + handler(payload) + } + } + } + + function generateMessageId(): string { + return `el-msg-${Date.now()}-${++messageIdCounter}` + } + + // Start the conversation session + conversation = await Conversation.startSession({ + signedUrl: token.token, + + onConnect: () => { + emit('status_change', { status: 'connected' as RealtimeStatus }) + emit('mode_change', { mode: 'listening' }) + }, + + onDisconnect: () => { + emit('status_change', { status: 'idle' as RealtimeStatus }) + emit('mode_change', { mode: 'idle' }) + }, + + onModeChange: ({ mode }) => { + const mappedMode: RealtimeMode = + mode === 'speaking' ? 
'speaking' : 'listening' + emit('mode_change', { mode: mappedMode }) + }, + + onMessage: ({ message, source }) => { + const role = source === 'user' ? 'user' : 'assistant' + + // Emit transcript update + emit('transcript', { + role, + transcript: message, + isFinal: true, + }) + + // Create and emit message + const realtimeMessage: RealtimeMessage = { + id: generateMessageId(), + role, + timestamp: Date.now(), + parts: [{ type: 'audio', transcript: message }], + } + emit('message_complete', { message: realtimeMessage }) + }, + + onError: (error: string | Error) => { + emit('error', { + error: new Error( + typeof error === 'string' ? error : error.message || 'Unknown error', + ), + }) + }, + }) + + // Connection implementation + const connection: RealtimeConnection = { + async disconnect() { + if (conversation) { + await conversation.endSession() + conversation = null + } + emit('status_change', { status: 'idle' as RealtimeStatus }) + }, + + async startAudioCapture() { + // ElevenLabs SDK handles audio capture automatically + // This is called when the session starts + emit('mode_change', { mode: 'listening' }) + }, + + stopAudioCapture() { + // ElevenLabs SDK handles this + emit('mode_change', { mode: 'idle' }) + }, + + sendText(text: string) { + // ElevenLabs doesn't support direct text input in the same way + // The SDK is voice-first. Log a warning. + console.warn( + 'ElevenLabs realtime adapter does not support sendText. Use voice input.', + ) + }, + + sendToolResult(callId: string, result: string) { + // ElevenLabs handles client tools differently - they're registered at session start + console.warn( + 'ElevenLabs tool results are handled via clientTools option during session creation.', + ) + }, + + updateSession(_config: Partial) { + // ElevenLabs session config is set at creation time + console.warn( + 'ElevenLabs does not support runtime session updates. 
Configure at connection time.', + ) + }, + + interrupt() { + // ElevenLabs handles interruption automatically via barge-in + // No explicit API to call + emit('mode_change', { mode: 'listening' }) + emit('interrupted', {}) + }, + + on( + event: E, + handler: RealtimeEventHandler, + ): () => void { + if (!eventHandlers.has(event)) { + eventHandlers.set(event, new Set()) + } + eventHandlers.get(event)!.add(handler) + + return () => { + eventHandlers.get(event)?.delete(handler) + } + }, + + getAudioVisualization(): AudioVisualization { + return { + get inputLevel() { + if (!conversation) return 0 + try { + return conversation.getInputVolume() + } catch { + return 0 + } + }, + + get outputLevel() { + if (!conversation) return 0 + try { + return conversation.getOutputVolume() + } catch { + return 0 + } + }, + + getInputFrequencyData() { + if (!conversation) return emptyFrequencyData + try { + return conversation.getInputByteFrequencyData() + } catch { + return emptyFrequencyData + } + }, + + getOutputFrequencyData() { + if (!conversation) return emptyFrequencyData + try { + return conversation.getOutputByteFrequencyData() + } catch { + return emptyFrequencyData + } + }, + + getInputTimeDomainData() { + // ElevenLabs SDK doesn't expose time domain data + return emptyTimeDomainData + }, + + getOutputTimeDomainData() { + // ElevenLabs SDK doesn't expose time domain data + return emptyTimeDomainData + }, + + get inputSampleRate() { + return 16000 + }, + + get outputSampleRate() { + return 16000 + }, + } + }, + } + + return connection +} diff --git a/packages/typescript/ai-elevenlabs/src/realtime/index.ts b/packages/typescript/ai-elevenlabs/src/realtime/index.ts new file mode 100644 index 000000000..db176897e --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/realtime/index.ts @@ -0,0 +1,14 @@ +// Token adapter for server-side use +export { elevenlabsRealtimeToken } from './token' + +// Client adapter for browser use +export { elevenlabsRealtime } from './adapter' + 
+// Types +export type { + ElevenLabsRealtimeTokenOptions, + ElevenLabsRealtimeOptions, + ElevenLabsConversationMode, + ElevenLabsVADConfig, + ElevenLabsClientTool, +} from './types' diff --git a/packages/typescript/ai-elevenlabs/src/realtime/token.ts b/packages/typescript/ai-elevenlabs/src/realtime/token.ts new file mode 100644 index 000000000..99989b06b --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/realtime/token.ts @@ -0,0 +1,103 @@ +import type { RealtimeToken, RealtimeTokenAdapter } from '@tanstack/ai' +import type { ElevenLabsRealtimeTokenOptions } from './types' + +const ELEVENLABS_API_URL = 'https://api.elevenlabs.io/v1' + +/** + * Get ElevenLabs API key from environment + */ +function getElevenLabsApiKey(): string { + // Check process.env (Node.js) + if (typeof process !== 'undefined' && process.env?.ELEVENLABS_API_KEY) { + return process.env.ELEVENLABS_API_KEY + } + + // Check window.env (Browser with injected env) + if ( + typeof window !== 'undefined' && + (window as unknown as { env?: { ELEVENLABS_API_KEY?: string } }).env + ?.ELEVENLABS_API_KEY + ) { + return (window as unknown as { env: { ELEVENLABS_API_KEY: string } }).env + .ELEVENLABS_API_KEY + } + + throw new Error( + 'ELEVENLABS_API_KEY not found in environment variables. ' + + 'Please set ELEVENLABS_API_KEY in your environment.', + ) +} + +/** + * Creates an ElevenLabs realtime token adapter. + * + * This adapter generates signed URLs for client-side connections. + * The signed URL is valid for 30 minutes. 
+ * + * @param options - Configuration options including agentId + * @returns A RealtimeTokenAdapter for use with realtimeToken() + * + * @example + * ```typescript + * import { realtimeToken } from '@tanstack/ai' + * import { elevenlabsRealtimeToken } from '@tanstack/ai-elevenlabs' + * + * const token = await realtimeToken({ + * adapter: elevenlabsRealtimeToken({ + * agentId: 'your-agent-id', + * }), + * }) + * ``` + */ +export function elevenlabsRealtimeToken( + options: ElevenLabsRealtimeTokenOptions, +): RealtimeTokenAdapter { + const apiKey = getElevenLabsApiKey() + + return { + provider: 'elevenlabs', + + async generateToken(): Promise { + const { agentId, overrides } = options + + // Get signed URL from ElevenLabs + const response = await fetch( + `${ELEVENLABS_API_URL}/convai/conversation/get_signed_url?agent_id=${agentId}`, + { + method: 'GET', + headers: { + 'xi-api-key': apiKey, + }, + }, + ) + + if (!response.ok) { + const errorText = await response.text() + throw new Error( + `ElevenLabs signed URL request failed: ${response.status} ${errorText}`, + ) + } + + const data = await response.json() + const signedUrl = data.signed_url as string + + // Signed URLs are valid for 30 minutes + const expiresAt = Date.now() + 30 * 60 * 1000 + + return { + provider: 'elevenlabs', + token: signedUrl, + expiresAt, + config: { + voice: overrides?.voiceId, + instructions: overrides?.systemPrompt, + providerOptions: { + agentId, + firstMessage: overrides?.firstMessage, + language: overrides?.language, + }, + }, + } + }, + } +} diff --git a/packages/typescript/ai-elevenlabs/src/realtime/types.ts b/packages/typescript/ai-elevenlabs/src/realtime/types.ts new file mode 100644 index 000000000..ff2166f08 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/realtime/types.ts @@ -0,0 +1,62 @@ +/** + * Options for the ElevenLabs realtime token adapter + */ +export interface ElevenLabsRealtimeTokenOptions { + /** Agent ID configured in ElevenLabs dashboard */ + agentId: 
string + /** Optional override values for the agent */ + overrides?: { + /** Custom voice ID to use */ + voiceId?: string + /** Custom system prompt */ + systemPrompt?: string + /** First message the agent should speak */ + firstMessage?: string + /** Language code (e.g., 'en') */ + language?: string + } +} + +/** + * Options for the ElevenLabs realtime client adapter + */ +export interface ElevenLabsRealtimeOptions { + /** Connection mode (default: auto-detect) */ + connectionMode?: 'websocket' | 'webrtc' + /** Enable debug logging */ + debug?: boolean +} + +/** + * ElevenLabs conversation mode + */ +export type ElevenLabsConversationMode = 'speaking' | 'listening' + +/** + * ElevenLabs voice activity detection configuration + */ +export interface ElevenLabsVADConfig { + /** VAD threshold (0.1-0.9) */ + vadThreshold?: number + /** Silence threshold in seconds (0.3-3.0) */ + vadSilenceThresholdSecs?: number + /** Minimum speech duration in ms */ + minSpeechDurationMs?: number + /** Minimum silence duration in ms */ + minSilenceDurationMs?: number +} + +/** + * Client tool definition for ElevenLabs + */ +export interface ElevenLabsClientTool { + /** Tool handler function */ + handler: (params: TParams) => Promise | TResult +} + +/** + * ElevenLabs signed URL response + */ +export interface ElevenLabsSignedUrlResponse { + signed_url: string +} diff --git a/packages/typescript/ai-elevenlabs/tsconfig.json b/packages/typescript/ai-elevenlabs/tsconfig.json new file mode 100644 index 000000000..e5e872741 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/tsconfig.json @@ -0,0 +1,8 @@ +{ + "extends": "../../../tsconfig.json", + "compilerOptions": { + "outDir": "dist" + }, + "include": ["vite.config.ts", "./src"], + "exclude": ["node_modules", "dist", "**/*.config.ts"] +} diff --git a/packages/typescript/ai-elevenlabs/vite.config.ts b/packages/typescript/ai-elevenlabs/vite.config.ts new file mode 100644 index 000000000..11f5b20b7 --- /dev/null +++ 
b/packages/typescript/ai-elevenlabs/vite.config.ts @@ -0,0 +1,37 @@ +import { defineConfig, mergeConfig } from 'vitest/config' +import { tanstackViteConfig } from '@tanstack/vite-config' +import packageJson from './package.json' + +const config = defineConfig({ + test: { + name: packageJson.name, + dir: './', + watch: false, + + globals: true, + environment: 'node', + include: ['tests/**/*.test.ts'], + coverage: { + provider: 'v8', + reporter: ['text', 'json', 'html', 'lcov'], + exclude: [ + 'node_modules/', + 'dist/', + 'tests/', + '**/*.test.ts', + '**/*.config.ts', + '**/types.ts', + ], + include: ['src/**/*.ts'], + }, + }, +}) + +export default mergeConfig( + config, + tanstackViteConfig({ + entry: ['./src/index.ts'], + srcDir: './src', + cjs: false, + }), +) diff --git a/packages/typescript/ai-openai/package.json b/packages/typescript/ai-openai/package.json index 60f3d1abe..2483cb13d 100644 --- a/packages/typescript/ai-openai/package.json +++ b/packages/typescript/ai-openai/package.json @@ -44,10 +44,12 @@ }, "peerDependencies": { "@tanstack/ai": "workspace:^", + "@tanstack/ai-client": "workspace:^", "zod": "^4.0.0" }, "devDependencies": { "@tanstack/ai": "workspace:*", + "@tanstack/ai-client": "workspace:*", "@vitest/coverage-v8": "4.0.14", "vite": "^7.2.7", "zod": "^4.2.0" diff --git a/packages/typescript/ai-openai/src/index.ts b/packages/typescript/ai-openai/src/index.ts index cf2759f22..adfc0254e 100644 --- a/packages/typescript/ai-openai/src/index.ts +++ b/packages/typescript/ai-openai/src/index.ts @@ -94,3 +94,22 @@ export type { OpenAIDocumentMetadata, OpenAIMessageMetadataByModality, } from './message-types' + +// ============================================================================ +// Realtime (Voice) Adapters +// ============================================================================ + +export { + openaiRealtimeToken, + openaiRealtime, +} from './realtime/index' + +export type { + OpenAIRealtimeVoice, + OpenAIRealtimeModel, + 
OpenAIRealtimeTokenOptions, + OpenAIRealtimeOptions, + OpenAITurnDetection, + OpenAISemanticVADConfig, + OpenAIServerVADConfig, +} from './realtime/index' diff --git a/packages/typescript/ai-openai/src/realtime/adapter.ts b/packages/typescript/ai-openai/src/realtime/adapter.ts new file mode 100644 index 000000000..51745a26c --- /dev/null +++ b/packages/typescript/ai-openai/src/realtime/adapter.ts @@ -0,0 +1,585 @@ +import type { + AudioVisualization, + RealtimeEvent, + RealtimeEventHandler, + RealtimeMessage, + RealtimeMode, + RealtimeSessionConfig, + RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' +import type { + RealtimeAdapter, + RealtimeConnection, +} from '@tanstack/ai-client' +import type { OpenAIRealtimeOptions } from './types' + +const OPENAI_REALTIME_URL = 'https://api.openai.com/v1/realtime' + +/** + * Creates an OpenAI realtime adapter for client-side use. + * + * Uses WebRTC for browser connections (default) or WebSocket for Node.js. + * + * @param options - Optional configuration + * @returns A RealtimeAdapter for use with RealtimeClient + * + * @example + * ```typescript + * import { RealtimeClient } from '@tanstack/ai-client' + * import { openaiRealtime } from '@tanstack/ai-openai' + * + * const client = new RealtimeClient({ + * getToken: () => fetch('/api/realtime-token').then(r => r.json()), + * adapter: openaiRealtime(), + * }) + * ``` + */ +export function openaiRealtime( + options: OpenAIRealtimeOptions = {}, +): RealtimeAdapter { + const connectionMode = options.connectionMode ?? 'webrtc' + + return { + provider: 'openai', + + async connect(token: RealtimeToken): Promise { + if (connectionMode === 'webrtc') { + return createWebRTCConnection(token) + } + throw new Error('WebSocket connection mode not yet implemented') + }, + } +} + +/** + * Creates a WebRTC connection to OpenAI's realtime API + */ +async function createWebRTCConnection( + token: RealtimeToken, +): Promise { + const model = token.config.model ?? 
'gpt-4o-realtime-preview' + const eventHandlers = new Map>>() + + // WebRTC peer connection + const pc = new RTCPeerConnection() + + // Audio context for visualization + let audioContext: AudioContext | null = null + let inputAnalyser: AnalyserNode | null = null + let outputAnalyser: AnalyserNode | null = null + let inputSource: MediaStreamAudioSourceNode | null = null + let outputSource: MediaStreamAudioSourceNode | null = null + let localStream: MediaStream | null = null + + // Audio element for playback (more reliable than AudioContext.destination) + let audioElement: HTMLAudioElement | null = null + + // Data channel for events + let dataChannel: RTCDataChannel | null = null + + // Current state + let currentMode: RealtimeMode = 'idle' + let currentMessageId: string | null = null + + // Empty arrays for when visualization isn't available + // frequencyBinCount = fftSize / 2 = 1024 + const emptyFrequencyData = new Uint8Array(1024) + const emptyTimeDomainData = new Uint8Array(2048).fill(128) // 128 is silence + + // Helper to emit events (defined early so it can be used during setup) + function emit( + event: E, + payload: Parameters>[0], + ) { + const handlers = eventHandlers.get(event) + if (handlers) { + for (const handler of handlers) { + handler(payload) + } + } + } + + // Set up data channel for bidirectional communication + dataChannel = pc.createDataChannel('oai-events') + + dataChannel.onopen = () => { + emit('status_change', { status: 'connected' as RealtimeStatus }) + } + + dataChannel.onmessage = (event) => { + try { + const message = JSON.parse(event.data) + handleServerEvent(message) + } catch (e) { + console.error('Failed to parse realtime event:', e) + } + } + + dataChannel.onerror = (error) => { + emit('error', { error: new Error(`Data channel error: ${error}`) }) + } + + // Handle incoming audio track + pc.ontrack = (event) => { + console.log('[Realtime] ontrack event:', event.track.kind, event.streams[0]) + if (event.track.kind === 'audio' && 
event.streams[0]) { + setupOutputAudioAnalysis(event.streams[0]) + console.log('[Realtime] Output analyser created:', outputAnalyser) + } + } + + // IMPORTANT: Request microphone access and add audio track BEFORE creating offer + // OpenAI's Realtime API requires an audio track in the SDP offer + try { + localStream = await navigator.mediaDevices.getUserMedia({ + audio: { + echoCancellation: true, + noiseSuppression: true, + sampleRate: 24000, + }, + }) + + // Add audio track to peer connection + for (const track of localStream.getAudioTracks()) { + pc.addTrack(track, localStream) + } + } catch (error) { + throw new Error( + `Microphone access required for realtime voice: ${error instanceof Error ? error.message : error}`, + ) + } + + // Create and set local description (now includes audio track) + const offer = await pc.createOffer() + await pc.setLocalDescription(offer) + + // Send SDP to OpenAI and get answer + const sdpResponse = await fetch(`${OPENAI_REALTIME_URL}?model=${model}`, { + method: 'POST', + headers: { + Authorization: `Bearer ${token.token}`, + 'Content-Type': 'application/sdp', + }, + body: offer.sdp, + }) + + if (!sdpResponse.ok) { + const errorText = await sdpResponse.text() + throw new Error( + `Failed to establish WebRTC connection: ${sdpResponse.status} - ${errorText}`, + ) + } + + const answerSdp = await sdpResponse.text() + await pc.setRemoteDescription({ type: 'answer', sdp: answerSdp }) + + // Set up input audio analysis now that we have the stream + console.log('[Realtime] Setting up input audio analysis, localStream:', localStream) + setupInputAudioAnalysis(localStream) + console.log('[Realtime] Input analyser created:', inputAnalyser) + + // Handle server events + function handleServerEvent(event: Record) { + const type = event.type as string + + switch (type) { + case 'session.created': + case 'session.updated': + // Session ready + break + + case 'input_audio_buffer.speech_started': + currentMode = 'listening' + emit('mode_change', { 
mode: 'listening' }) + break + + case 'input_audio_buffer.speech_stopped': + currentMode = 'thinking' + emit('mode_change', { mode: 'thinking' }) + break + + case 'input_audio_buffer.committed': + // Audio buffer committed for processing + break + + case 'conversation.item.input_audio_transcription.completed': { + const transcript = event.transcript as string + emit('transcript', { role: 'user', transcript, isFinal: true }) + break + } + + case 'response.created': + currentMode = 'thinking' + emit('mode_change', { mode: 'thinking' }) + break + + case 'response.output_item.added': { + const item = event.item as Record + if (item.type === 'message') { + currentMessageId = item.id as string + } + break + } + + case 'response.audio_transcript.delta': { + const delta = event.delta as string + emit('transcript', { role: 'assistant', transcript: delta, isFinal: false }) + break + } + + case 'response.audio_transcript.done': { + const transcript = event.transcript as string + emit('transcript', { role: 'assistant', transcript, isFinal: true }) + break + } + + case 'response.audio.delta': + if (currentMode !== 'speaking') { + currentMode = 'speaking' + emit('mode_change', { mode: 'speaking' }) + } + break + + case 'response.audio.done': + break + + case 'response.function_call_arguments.done': { + const callId = event.call_id as string + const name = event.name as string + const args = event.arguments as string + try { + const input = JSON.parse(args) + emit('tool_call', { toolCallId: callId, toolName: name, input }) + } catch { + emit('tool_call', { toolCallId: callId, toolName: name, input: args }) + } + break + } + + case 'response.done': { + currentMode = 'listening' + emit('mode_change', { mode: 'listening' }) + + // Emit message complete if we have a current message + if (currentMessageId) { + const response = event.response as Record + const output = response.output as Array> + + const message: RealtimeMessage = { + id: currentMessageId, + role: 'assistant', + 
timestamp: Date.now(), + parts: [], + } + + // Extract content from output items + for (const item of output || []) { + if (item.type === 'message' && item.content) { + const content = item.content as Array> + for (const part of content) { + if (part.type === 'audio' && part.transcript) { + message.parts.push({ + type: 'audio', + transcript: part.transcript as string, + }) + } else if (part.type === 'text' && part.text) { + message.parts.push({ + type: 'text', + content: part.text as string, + }) + } + } + } + } + + emit('message_complete', { message }) + currentMessageId = null + } + break + } + + case 'conversation.item.truncated': + emit('interrupted', { messageId: currentMessageId ?? undefined }) + break + + case 'error': { + const error = event.error as Record + emit('error', { + error: new Error((error.message as string) || 'Unknown error'), + }) + break + } + } + } + + // Set up audio analysis for output + function setupOutputAudioAnalysis(stream: MediaStream) { + // Create audio element for playback - this is the standard way to play WebRTC audio + audioElement = new Audio() + audioElement.srcObject = stream + audioElement.autoplay = true + // Some browsers require this for autoplay + audioElement.play().catch((e) => { + console.warn('Audio autoplay failed:', e) + }) + + // Set up AudioContext for visualization only (not playback) + if (!audioContext) { + audioContext = new AudioContext() + } + + // Resume AudioContext if suspended (browsers require user interaction) + if (audioContext.state === 'suspended') { + audioContext.resume().catch(() => { + // Ignore - visualization just won't work + }) + } + + outputAnalyser = audioContext.createAnalyser() + outputAnalyser.fftSize = 2048 // Larger size for more accurate level detection + outputAnalyser.smoothingTimeConstant = 0.3 + + outputSource = audioContext.createMediaStreamSource(stream) + outputSource.connect(outputAnalyser) + // Don't connect to destination - the Audio element handles playback + } + + // 
Set up audio analysis for input + function setupInputAudioAnalysis(stream: MediaStream) { + if (!audioContext) { + audioContext = new AudioContext() + } + + // Resume AudioContext if suspended (browsers require user interaction) + if (audioContext.state === 'suspended') { + audioContext.resume().catch(() => { + // Ignore - visualization just won't work + }) + } + + inputAnalyser = audioContext.createAnalyser() + inputAnalyser.fftSize = 2048 // Larger size for more accurate level detection + inputAnalyser.smoothingTimeConstant = 0.3 + + inputSource = audioContext.createMediaStreamSource(stream) + inputSource.connect(inputAnalyser) + } + + // Send event to server + function sendEvent(event: Record) { + if (dataChannel?.readyState === 'open') { + dataChannel.send(JSON.stringify(event)) + } + } + + // Connection implementation + const connection: RealtimeConnection = { + async disconnect() { + if (localStream) { + for (const track of localStream.getTracks()) { + track.stop() + } + localStream = null + } + + if (audioElement) { + audioElement.pause() + audioElement.srcObject = null + audioElement = null + } + + if (dataChannel) { + dataChannel.close() + dataChannel = null + } + + pc.close() + + if (audioContext) { + await audioContext.close() + audioContext = null + } + + emit('status_change', { status: 'idle' as RealtimeStatus }) + }, + + async startAudioCapture() { + // Audio capture is established during connection setup + // This method enables the tracks and signals listening mode + if (localStream) { + for (const track of localStream.getAudioTracks()) { + track.enabled = true + } + } + currentMode = 'listening' + emit('mode_change', { mode: 'listening' }) + }, + + stopAudioCapture() { + // Disable tracks rather than stopping them to allow re-enabling + if (localStream) { + for (const track of localStream.getAudioTracks()) { + track.enabled = false + } + } + currentMode = 'idle' + emit('mode_change', { mode: 'idle' }) + }, + + sendText(text: string) { + sendEvent({ 
+ type: 'conversation.item.create', + item: { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text }], + }, + }) + sendEvent({ type: 'response.create' }) + }, + + sendToolResult(callId: string, result: string) { + sendEvent({ + type: 'conversation.item.create', + item: { + type: 'function_call_output', + call_id: callId, + output: result, + }, + }) + sendEvent({ type: 'response.create' }) + }, + + updateSession(config: Partial) { + const sessionUpdate: Record = {} + + if (config.instructions) { + sessionUpdate.instructions = config.instructions + } + + if (config.voice) { + sessionUpdate.voice = config.voice + } + + if (config.vadMode) { + if (config.vadMode === 'semantic') { + sessionUpdate.turn_detection = { + type: 'semantic_vad', + eagerness: 'medium', + } + } else if (config.vadMode === 'server') { + sessionUpdate.turn_detection = { + type: 'server_vad', + threshold: config.vadConfig?.threshold ?? 0.5, + prefix_padding_ms: config.vadConfig?.prefixPaddingMs ?? 300, + silence_duration_ms: config.vadConfig?.silenceDurationMs ?? 500, + } + } else { + sessionUpdate.turn_detection = null + } + } + + if (Object.keys(sessionUpdate).length > 0) { + sendEvent({ + type: 'session.update', + session: sessionUpdate, + }) + } + }, + + interrupt() { + sendEvent({ type: 'response.cancel' }) + currentMode = 'listening' + emit('mode_change', { mode: 'listening' }) + emit('interrupted', { messageId: currentMessageId ?? 
undefined }) + }, + + on( + event: E, + handler: RealtimeEventHandler, + ): () => void { + if (!eventHandlers.has(event)) { + eventHandlers.set(event, new Set()) + } + eventHandlers.get(event)!.add(handler) + + return () => { + eventHandlers.get(event)?.delete(handler) + } + }, + + getAudioVisualization(): AudioVisualization { + // NOTE(review): removed leftover debug console.log here; + // this getter is polled on every animation frame, so logging is too noisy + + // Helper to calculate RMS (Root Mean Square) from time domain data + // This gives a better measure of perceived loudness than frequency data + function calculateRMS(analyser: AnalyserNode): number { + const data = new Uint8Array(analyser.fftSize) + analyser.getByteTimeDomainData(data) + + // Calculate RMS - values are 0-255 with 128 being silence + let sumSquares = 0 + for (const sample of data) { + const normalized = (sample - 128) / 128 // Convert to -1 to 1 range + sumSquares += normalized * normalized + } + const rms = Math.sqrt(sumSquares / data.length) + + // Scale and clamp to 0-1 range (RMS of full-scale sine is ~0.707) + return Math.min(1, rms * 1.5) + } + + return { + get inputLevel() { + if (!inputAnalyser) return 0 + return calculateRMS(inputAnalyser) + }, + + get outputLevel() { + if (!outputAnalyser) return 0 + return calculateRMS(outputAnalyser) + }, + + getInputFrequencyData() { + if (!inputAnalyser) return emptyFrequencyData + const data = new Uint8Array(inputAnalyser.frequencyBinCount) + inputAnalyser.getByteFrequencyData(data) + return data + }, + + getOutputFrequencyData() { + if (!outputAnalyser) return emptyFrequencyData + const data = new Uint8Array(outputAnalyser.frequencyBinCount) + outputAnalyser.getByteFrequencyData(data) + return data + }, + + getInputTimeDomainData() { + if (!inputAnalyser) return emptyTimeDomainData + const data = new Uint8Array(inputAnalyser.fftSize) + inputAnalyser.getByteTimeDomainData(data) + return data + }, + + 
getOutputTimeDomainData() { + if (!outputAnalyser) return emptyTimeDomainData + const data = new Uint8Array(outputAnalyser.fftSize) + outputAnalyser.getByteTimeDomainData(data) + return data + }, + + get inputSampleRate() { + return 24000 + }, + + get outputSampleRate() { + return 24000 + }, + } + }, + } + + return connection +} diff --git a/packages/typescript/ai-openai/src/realtime/index.ts b/packages/typescript/ai-openai/src/realtime/index.ts new file mode 100644 index 000000000..d5ea156e6 --- /dev/null +++ b/packages/typescript/ai-openai/src/realtime/index.ts @@ -0,0 +1,16 @@ +// Token adapter for server-side use +export { openaiRealtimeToken } from './token' + +// Client adapter for browser use +export { openaiRealtime } from './adapter' + +// Types +export type { + OpenAIRealtimeVoice, + OpenAIRealtimeModel, + OpenAIRealtimeTokenOptions, + OpenAIRealtimeOptions, + OpenAITurnDetection, + OpenAISemanticVADConfig, + OpenAIServerVADConfig, +} from './types' diff --git a/packages/typescript/ai-openai/src/realtime/token.ts b/packages/typescript/ai-openai/src/realtime/token.ts new file mode 100644 index 000000000..d226cacbb --- /dev/null +++ b/packages/typescript/ai-openai/src/realtime/token.ts @@ -0,0 +1,153 @@ +import type { RealtimeToken, RealtimeTokenAdapter, Tool } from '@tanstack/ai' +import { getOpenAIApiKeyFromEnv } from '../utils' +import type { + OpenAIRealtimeModel, + OpenAIRealtimeSessionResponse, + OpenAIRealtimeTokenOptions, +} from './types' + +const OPENAI_REALTIME_SESSIONS_URL = + 'https://api.openai.com/v1/realtime/sessions' + +/** + * Creates an OpenAI realtime token adapter. + * + * This adapter generates ephemeral tokens for client-side WebRTC connections. + * The token is valid for 10 minutes. 
+ * + * @param options - Configuration options for the realtime session + * @returns A RealtimeTokenAdapter for use with realtimeToken() + * + * @example + * ```typescript + * import { realtimeToken } from '@tanstack/ai' + * import { openaiRealtimeToken } from '@tanstack/ai-openai' + * + * const token = await realtimeToken({ + * adapter: openaiRealtimeToken({ + * model: 'gpt-4o-realtime-preview', + * voice: 'alloy', + * instructions: 'You are a helpful assistant.', + * turnDetection: { + * type: 'semantic_vad', + * eagerness: 'medium', + * }, + * }), + * }) + * ``` + */ +export function openaiRealtimeToken( + options: OpenAIRealtimeTokenOptions = {}, +): RealtimeTokenAdapter { + const apiKey = getOpenAIApiKeyFromEnv() + + return { + provider: 'openai', + + async generateToken(): Promise { + const model: OpenAIRealtimeModel = + options.model ?? 'gpt-4o-realtime-preview' + const voice = options.voice ?? 'alloy' + + // Build request body + const body: Record = { + model, + voice, + } + + if (options.instructions) { + body.instructions = options.instructions + } + + if (options.turnDetection !== undefined) { + body.turn_detection = options.turnDetection + } + + if (options.inputAudioFormat) { + body.input_audio_format = options.inputAudioFormat + } + + if (options.outputAudioFormat) { + body.output_audio_format = options.outputAudioFormat + } + + if (options.inputAudioTranscription) { + body.input_audio_transcription = options.inputAudioTranscription + } + + if (options.tools) { + body.tools = options.tools + } + + if (options.toolChoice) { + body.tool_choice = options.toolChoice + } + + if (options.temperature !== undefined) { + body.temperature = options.temperature + } + + if (options.maxResponseOutputTokens !== undefined) { + body.max_response_output_tokens = options.maxResponseOutputTokens + } + + // Call OpenAI API to create session and get ephemeral token + const response = await fetch(OPENAI_REALTIME_SESSIONS_URL, { + method: 'POST', + headers: { + 
Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(body), + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error( + `OpenAI realtime session creation failed: ${response.status} ${errorText}`, + ) + } + + const sessionData: OpenAIRealtimeSessionResponse = await response.json() + + // Convert tools to our format + const tools: Array = (sessionData.tools || []).map((t) => ({ + name: t.name, + description: t.description, + inputSchema: t.parameters, + })) + + return { + provider: 'openai', + token: sessionData.client_secret.value, + expiresAt: sessionData.client_secret.expires_at * 1000, // Convert to ms + config: { + model: sessionData.model, + voice: sessionData.voice, + instructions: sessionData.instructions, + tools, + vadMode: sessionData.turn_detection?.type === 'semantic_vad' + ? 'semantic' + : sessionData.turn_detection?.type === 'server_vad' + ? 'server' + : 'manual', + vadConfig: sessionData.turn_detection + ? 
{ + threshold: sessionData.turn_detection.threshold, + prefixPaddingMs: sessionData.turn_detection.prefix_padding_ms, + silenceDurationMs: + sessionData.turn_detection.silence_duration_ms, + } + : undefined, + providerOptions: { + inputAudioFormat: sessionData.input_audio_format, + outputAudioFormat: sessionData.output_audio_format, + inputAudioTranscription: sessionData.input_audio_transcription, + temperature: sessionData.temperature, + maxResponseOutputTokens: sessionData.max_response_output_tokens, + }, + }, + } + }, + } +} diff --git a/packages/typescript/ai-openai/src/realtime/types.ts b/packages/typescript/ai-openai/src/realtime/types.ts new file mode 100644 index 000000000..713bd2b4c --- /dev/null +++ b/packages/typescript/ai-openai/src/realtime/types.ts @@ -0,0 +1,127 @@ +import type { VADConfig } from '@tanstack/ai' + +/** + * OpenAI realtime voice options + */ +export type OpenAIRealtimeVoice = + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + +/** + * OpenAI realtime model options + */ +export type OpenAIRealtimeModel = + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17' + +/** + * OpenAI semantic VAD configuration + */ +export interface OpenAISemanticVADConfig { + type: 'semantic_vad' + /** Eagerness level for turn detection */ + eagerness?: 'low' | 'medium' | 'high' +} + +/** + * OpenAI server VAD configuration + */ +export interface OpenAIServerVADConfig extends VADConfig { + type: 'server_vad' +} + +/** + * OpenAI turn detection configuration + */ +export type OpenAITurnDetection = + | OpenAISemanticVADConfig + | OpenAIServerVADConfig + | null + +/** + * Options for the OpenAI realtime token adapter + */ +export interface OpenAIRealtimeTokenOptions { + /** Model to use (default: 'gpt-4o-realtime-preview') */ + model?: OpenAIRealtimeModel + /** Voice to use (default: 'alloy') */ + voice?: 
OpenAIRealtimeVoice + /** System instructions */ + instructions?: string + /** Turn detection configuration */ + turnDetection?: OpenAITurnDetection + /** Input audio format (default: 'pcm16') */ + inputAudioFormat?: 'pcm16' | 'g711_ulaw' | 'g711_alaw' + /** Output audio format (default: 'pcm16') */ + outputAudioFormat?: 'pcm16' | 'g711_ulaw' | 'g711_alaw' + /** Input audio transcription model */ + inputAudioTranscription?: { + model: 'whisper-1' + } + /** Tools available in the session */ + tools?: Array<{ + type: 'function' + name: string + description: string + parameters: Record + }> + /** Tool choice strategy */ + toolChoice?: 'auto' | 'none' | 'required' | { type: 'function'; name: string } + /** Temperature for response generation */ + temperature?: number + /** Maximum response output tokens */ + maxResponseOutputTokens?: number | 'inf' +} + +/** + * Options for the OpenAI realtime client adapter + */ +export interface OpenAIRealtimeOptions { + /** Connection mode (default: 'webrtc' in browser) */ + connectionMode?: 'webrtc' | 'websocket' +} + +/** + * OpenAI realtime session response from the API + */ +export interface OpenAIRealtimeSessionResponse { + id: string + object: 'realtime.session' + model: string + modalities: Array + instructions: string + voice: string + input_audio_format: string + output_audio_format: string + input_audio_transcription: { + model: string + } | null + turn_detection: { + type: string + threshold?: number + prefix_padding_ms?: number + silence_duration_ms?: number + eagerness?: string + } | null + tools: Array<{ + type: string + name: string + description: string + parameters: Record + }> + tool_choice: string + temperature: number + max_response_output_tokens: number | string + client_secret: { + value: string + expires_at: number + } +} diff --git a/packages/typescript/ai-react/src/index.ts b/packages/typescript/ai-react/src/index.ts index b08f90e36..175db7bb1 100644 --- a/packages/typescript/ai-react/src/index.ts +++ 
b/packages/typescript/ai-react/src/index.ts @@ -1,10 +1,15 @@ export { useChat } from './use-chat' +export { useRealtimeChat } from './use-realtime-chat' export type { UseChatOptions, UseChatReturn, UIMessage, ChatRequestBody, } from './types' +export type { + UseRealtimeChatOptions, + UseRealtimeChatReturn, +} from './realtime-types' // Re-export from ai-client for convenience export { diff --git a/packages/typescript/ai-react/src/realtime-types.ts b/packages/typescript/ai-react/src/realtime-types.ts new file mode 100644 index 000000000..f3a403464 --- /dev/null +++ b/packages/typescript/ai-react/src/realtime-types.ts @@ -0,0 +1,109 @@ +import type { + RealtimeMessage, + RealtimeMode, + RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' +import type { RealtimeAdapter } from '@tanstack/ai-client' +import type { AnyClientTool } from '@tanstack/ai' + +/** + * Options for the useRealtimeChat hook. + */ +export interface UseRealtimeChatOptions { + /** + * Function to fetch a realtime token from the server. + * Called on connect and when token needs refresh. + */ + getToken: () => Promise + + /** + * The realtime adapter to use (e.g., openaiRealtime()) + */ + adapter: RealtimeAdapter + + /** + * Client-side tools with execution logic + */ + tools?: ReadonlyArray + + /** + * Auto-play assistant audio (default: true) + */ + autoPlayback?: boolean + + /** + * Request microphone access on connect (default: true) + */ + autoCapture?: boolean + + /** + * Voice activity detection mode (default: 'server') + */ + vadMode?: 'server' | 'semantic' | 'manual' + + // Callbacks + onConnect?: () => void + onDisconnect?: () => void + onError?: (error: Error) => void + onMessage?: (message: RealtimeMessage) => void + onModeChange?: (mode: RealtimeMode) => void + onInterrupted?: () => void +} + +/** + * Return type for the useRealtimeChat hook. 
+ */ +export interface UseRealtimeChatReturn { + // Connection state + /** Current connection status */ + status: RealtimeStatus + /** Current error, if any */ + error: Error | null + /** Connect to the realtime session */ + connect: () => Promise + /** Disconnect from the realtime session */ + disconnect: () => Promise + + // Conversation state + /** Current mode (idle, listening, thinking, speaking) */ + mode: RealtimeMode + /** Conversation messages */ + messages: Array + /** User transcript while speaking (before finalized) */ + pendingUserTranscript: string | null + /** Assistant transcript while speaking (before finalized) */ + pendingAssistantTranscript: string | null + + // Voice control + /** Start listening for voice input (manual VAD mode) */ + startListening: () => void + /** Stop listening for voice input (manual VAD mode) */ + stopListening: () => void + /** Interrupt the current assistant response */ + interrupt: () => void + + // Text input + /** Send a text message instead of voice */ + sendText: (text: string) => void + + // Audio visualization (0-1 normalized) + /** Current input (microphone) volume level */ + inputLevel: number + /** Current output (speaker) volume level */ + outputLevel: number + /** Get frequency data for input audio visualization */ + getInputFrequencyData: () => Uint8Array + /** Get frequency data for output audio visualization */ + getOutputFrequencyData: () => Uint8Array + /** Get time domain data for input waveform */ + getInputTimeDomainData: () => Uint8Array + /** Get time domain data for output waveform */ + getOutputTimeDomainData: () => Uint8Array + + // VAD control + /** Current VAD mode */ + vadMode: 'server' | 'semantic' | 'manual' + /** Change VAD mode at runtime */ + setVADMode: (mode: 'server' | 'semantic' | 'manual') => void +} diff --git a/packages/typescript/ai-react/src/use-realtime-chat.ts b/packages/typescript/ai-react/src/use-realtime-chat.ts new file mode 100644 index 000000000..3434db607 --- /dev/null 
+++ b/packages/typescript/ai-react/src/use-realtime-chat.ts @@ -0,0 +1,241 @@ +import { useCallback, useEffect, useRef, useState } from 'react' +import { RealtimeClient } from '@tanstack/ai-client' +import type { RealtimeMessage, RealtimeMode, RealtimeStatus } from '@tanstack/ai' +import type { UseRealtimeChatOptions, UseRealtimeChatReturn } from './realtime-types' + +// Empty frequency data for when client is not connected +const emptyFrequencyData = new Uint8Array(128) +const emptyTimeDomainData = new Uint8Array(128).fill(128) + +/** + * React hook for realtime voice conversations. + * + * Provides a simple interface for voice-to-voice AI interactions + * with support for multiple providers (OpenAI, ElevenLabs, etc.). + * + * @param options - Configuration options including adapter and callbacks + * @returns Hook return value with state and control methods + * + * @example + * ```typescript + * import { useRealtimeChat } from '@tanstack/ai-react' + * import { openaiRealtime } from '@tanstack/ai-openai' + * + * function VoiceChat() { + * const { + * status, + * mode, + * messages, + * connect, + * disconnect, + * inputLevel, + * outputLevel, + * } = useRealtimeChat({ + * getToken: () => fetch('/api/realtime-token').then(r => r.json()), + * adapter: openaiRealtime(), + * }) + * + * return ( + *
+ *     <div>
+ *       <div>Status: {status}</div>
+ *       <div>Mode: {mode}</div>
+ *       <button onClick={status === 'connected' ? disconnect : connect}>
+ *         {status === 'connected' ? 'Disconnect' : 'Connect'}
+ *       </button>
+ *     </div>
+ * ) + * } + * ``` + */ +export function useRealtimeChat( + options: UseRealtimeChatOptions, +): UseRealtimeChatReturn { + // State + const [status, setStatus] = useState('idle') + const [mode, setMode] = useState('idle') + const [messages, setMessages] = useState>([]) + const [pendingUserTranscript, setPendingUserTranscript] = useState(null) + const [pendingAssistantTranscript, setPendingAssistantTranscript] = useState(null) + const [error, setError] = useState(null) + const [inputLevel, setInputLevel] = useState(0) + const [outputLevel, setOutputLevel] = useState(0) + const [vadMode, setVADModeState] = useState<'server' | 'semantic' | 'manual'>( + options.vadMode ?? 'server', + ) + + // Refs + const clientRef = useRef(null) + const optionsRef = useRef(options) + optionsRef.current = options + const animationFrameRef = useRef(null) + + // Create client instance - use ref to ensure we reuse the same instance + // This handles React StrictMode double-rendering + if (!clientRef.current) { + clientRef.current = new RealtimeClient({ + getToken: optionsRef.current.getToken, + adapter: optionsRef.current.adapter, + tools: optionsRef.current.tools, + autoPlayback: optionsRef.current.autoPlayback, + autoCapture: optionsRef.current.autoCapture, + vadMode: optionsRef.current.vadMode, + onStatusChange: (newStatus) => { + setStatus(newStatus) + }, + onModeChange: (newMode) => { + setMode(newMode) + optionsRef.current.onModeChange?.(newMode) + }, + onMessage: (message) => { + setMessages((prev) => [...prev, message]) + optionsRef.current.onMessage?.(message) + }, + onError: (err) => { + setError(err) + optionsRef.current.onError?.(err) + }, + onConnect: () => { + setError(null) + optionsRef.current.onConnect?.() + }, + onDisconnect: () => { + optionsRef.current.onDisconnect?.() + }, + onInterrupted: () => { + setPendingAssistantTranscript(null) + optionsRef.current.onInterrupted?.() + }, + }) + + // Subscribe to state changes for transcripts + 
clientRef.current.onStateChange((state) => { + setPendingUserTranscript(state.pendingUserTranscript) + setPendingAssistantTranscript(state.pendingAssistantTranscript) + }) + } + + const client = clientRef.current + + // Audio level animation loop + useEffect(() => { + function updateLevels() { + if (clientRef.current?.audio) { + setInputLevel(clientRef.current.audio.inputLevel) + setOutputLevel(clientRef.current.audio.outputLevel) + } + animationFrameRef.current = requestAnimationFrame(updateLevels) + } + + if (status === 'connected') { + updateLevels() + } + + return () => { + if (animationFrameRef.current) { + cancelAnimationFrame(animationFrameRef.current) + animationFrameRef.current = null + } + } + }, [status]) + + // Cleanup on unmount + useEffect(() => { + return () => { + clientRef.current?.destroy() + } + }, []) + + // Connection methods + const connect = useCallback(async () => { + setError(null) + setMessages([]) + setPendingUserTranscript(null) + setPendingAssistantTranscript(null) + await client.connect() + }, [client]) + + const disconnect = useCallback(async () => { + await client.disconnect() + }, [client]) + + // Voice control methods + const startListening = useCallback(() => { + client.startListening() + }, [client]) + + const stopListening = useCallback(() => { + client.stopListening() + }, [client]) + + const interrupt = useCallback(() => { + client.interrupt() + }, [client]) + + // Text input + const sendText = useCallback( + (text: string) => { + client.sendText(text) + }, + [client], + ) + + // Audio visualization + const getInputFrequencyData = useCallback(() => { + return clientRef.current?.audio?.getInputFrequencyData() ?? emptyFrequencyData + }, []) + + const getOutputFrequencyData = useCallback(() => { + return clientRef.current?.audio?.getOutputFrequencyData() ?? emptyFrequencyData + }, []) + + const getInputTimeDomainData = useCallback(() => { + return clientRef.current?.audio?.getInputTimeDomainData() ?? 
emptyTimeDomainData + }, []) + + const getOutputTimeDomainData = useCallback(() => { + return clientRef.current?.audio?.getOutputTimeDomainData() ?? emptyTimeDomainData + }, []) + + // VAD mode control + const setVADMode = useCallback( + (newMode: 'server' | 'semantic' | 'manual') => { + setVADModeState(newMode) + // TODO: Update session config if connected + }, + [], + ) + + return { + // Connection state + status, + error, + connect, + disconnect, + + // Conversation state + mode, + messages, + pendingUserTranscript, + pendingAssistantTranscript, + + // Voice control + startListening, + stopListening, + interrupt, + + // Text input + sendText, + + // Audio visualization + inputLevel, + outputLevel, + getInputFrequencyData, + getOutputFrequencyData, + getInputTimeDomainData, + getOutputTimeDomainData, + + // VAD control + vadMode, + setVADMode, + } +} diff --git a/packages/typescript/ai/src/index.ts b/packages/typescript/ai/src/index.ts index 0476457d5..34ec893a6 100644 --- a/packages/typescript/ai/src/index.ts +++ b/packages/typescript/ai/src/index.ts @@ -76,6 +76,30 @@ export * from './types' // Event client export { aiEventClient } from './event-client' +// Realtime +export { realtimeToken } from './realtime/index' +export type { + RealtimeToken, + RealtimeTokenAdapter, + RealtimeTokenOptions, + RealtimeSessionConfig, + VADConfig, + RealtimeMessage, + RealtimeMessagePart, + RealtimeTextPart, + RealtimeAudioPart, + RealtimeToolCallPart, + RealtimeToolResultPart, + RealtimeStatus, + RealtimeMode, + AudioVisualization, + RealtimeEvent, + RealtimeEventPayloads, + RealtimeEventHandler, + RealtimeErrorCode, + RealtimeError, +} from './realtime/index' + // Message converters export { convertMessagesToModelMessages, diff --git a/packages/typescript/ai/src/realtime/index.ts b/packages/typescript/ai/src/realtime/index.ts new file mode 100644 index 000000000..74c450c1d --- /dev/null +++ b/packages/typescript/ai/src/realtime/index.ts @@ -0,0 +1,38 @@ +import type { 
RealtimeToken, RealtimeTokenOptions } from './types' + +// Re-export all types +export * from './types' + +/** + * Generate a realtime token using the provided adapter. + * + * This function is used on the server to generate ephemeral tokens + * that clients can use to establish realtime connections. + * + * @param options - Token generation options including the adapter + * @returns Promise resolving to a RealtimeToken + * + * @example + * ```typescript + * import { realtimeToken } from '@tanstack/ai' + * import { openaiRealtimeToken } from '@tanstack/ai-openai' + * + * // Server function (TanStack Start example) + * export const getRealtimeToken = createServerFn() + * .handler(async () => { + * return realtimeToken({ + * adapter: openaiRealtimeToken({ + * model: 'gpt-4o-realtime-preview', + * voice: 'alloy', + * instructions: 'You are a helpful assistant...', + * }), + * }) + * }) + * ``` + */ +export async function realtimeToken( + options: RealtimeTokenOptions, +): Promise { + const { adapter } = options + return adapter.generateToken() +} diff --git a/packages/typescript/ai/src/realtime/types.ts b/packages/typescript/ai/src/realtime/types.ts new file mode 100644 index 000000000..e563a2ae7 --- /dev/null +++ b/packages/typescript/ai/src/realtime/types.ts @@ -0,0 +1,266 @@ +import type { Tool } from '../types' + +// ============================================================================ +// Token Types +// ============================================================================ + +/** + * Voice activity detection configuration + */ +export interface VADConfig { + /** Sensitivity threshold (0.0-1.0) */ + threshold?: number + /** Audio to include before speech detection (ms) */ + prefixPaddingMs?: number + /** Silence duration to end turn (ms) */ + silenceDurationMs?: number +} + +/** + * Configuration for a realtime session + */ +export interface RealtimeSessionConfig { + /** Model to use for the session */ + model?: string + /** Voice to use for audio 
output */ + voice?: string + /** System instructions for the assistant */ + instructions?: string + /** Tools available in the session */ + tools?: Array + /** VAD mode */ + vadMode?: 'server' | 'semantic' | 'manual' + /** VAD configuration */ + vadConfig?: VADConfig + /** Provider-specific options */ + providerOptions?: Record +} + +/** + * Token returned by the server for client authentication + */ +export interface RealtimeToken { + /** Provider identifier */ + provider: string + /** The ephemeral token value */ + token: string + /** Token expiration timestamp (ms since epoch) */ + expiresAt: number + /** Session configuration embedded in the token */ + config: RealtimeSessionConfig +} + +/** + * Adapter interface for generating provider-specific tokens + */ +export interface RealtimeTokenAdapter { + /** Provider identifier */ + provider: string + /** Generate an ephemeral token for client use */ + generateToken(): Promise +} + +/** + * Options for the realtimeToken function + */ +export interface RealtimeTokenOptions { + /** The token adapter to use */ + adapter: RealtimeTokenAdapter +} + +// ============================================================================ +// Message Types +// ============================================================================ + +/** + * Text content part in a realtime message + */ +export interface RealtimeTextPart { + type: 'text' + content: string +} + +/** + * Audio content part in a realtime message + */ +export interface RealtimeAudioPart { + type: 'audio' + /** Transcription of the audio */ + transcript: string + /** Raw audio data (optional, if stored) */ + audioData?: ArrayBuffer + /** Duration of the audio in milliseconds */ + durationMs?: number +} + +/** + * Tool call part in a realtime message + */ +export interface RealtimeToolCallPart { + type: 'tool-call' + id: string + name: string + arguments: string + input?: unknown + output?: unknown +} + +/** + * Tool result part in a realtime message + */ +export 
interface RealtimeToolResultPart { + type: 'tool-result' + toolCallId: string + content: string +} + +/** + * Union of all realtime message parts + */ +export type RealtimeMessagePart = + | RealtimeTextPart + | RealtimeAudioPart + | RealtimeToolCallPart + | RealtimeToolResultPart + +/** + * A message in a realtime conversation + */ +export interface RealtimeMessage { + /** Unique message identifier */ + id: string + /** Message role */ + role: 'user' | 'assistant' + /** Timestamp when the message was created */ + timestamp: number + /** Content parts of the message */ + parts: Array + /** Whether this message was interrupted */ + interrupted?: boolean + /** Reference to audio buffer if stored */ + audioId?: string + /** Duration of the audio in milliseconds */ + durationMs?: number +} + +// ============================================================================ +// Status Types +// ============================================================================ + +/** + * Connection status of the realtime client + */ +export type RealtimeStatus = + | 'idle' + | 'connecting' + | 'connected' + | 'reconnecting' + | 'error' + +/** + * Current mode of the realtime session + */ +export type RealtimeMode = 'idle' | 'listening' | 'thinking' | 'speaking' + +// ============================================================================ +// Audio Visualization Types +// ============================================================================ + +/** + * Interface for accessing audio visualization data + */ +export interface AudioVisualization { + /** Input volume level (0-1 normalized) */ + readonly inputLevel: number + /** Output volume level (0-1 normalized) */ + readonly outputLevel: number + + /** Get frequency data for input audio visualization */ + getInputFrequencyData(): Uint8Array + /** Get frequency data for output audio visualization */ + getOutputFrequencyData(): Uint8Array + + /** Get time domain data for input waveform */ + getInputTimeDomainData(): 
Uint8Array + /** Get time domain data for output waveform */ + getOutputTimeDomainData(): Uint8Array + + /** Input sample rate */ + readonly inputSampleRate: number + /** Output sample rate */ + readonly outputSampleRate: number + + /** Subscribe to raw input audio samples */ + onInputAudio?( + callback: (samples: Float32Array, sampleRate: number) => void, + ): () => void + /** Subscribe to raw output audio samples */ + onOutputAudio?( + callback: (samples: Float32Array, sampleRate: number) => void, + ): () => void +} + +// ============================================================================ +// Event Types +// ============================================================================ + +/** + * Events emitted by the realtime connection + */ +export type RealtimeEvent = + | 'status_change' + | 'mode_change' + | 'transcript' + | 'audio_chunk' + | 'tool_call' + | 'message_complete' + | 'interrupted' + | 'error' + +/** + * Event payloads for realtime events + */ +export interface RealtimeEventPayloads { + status_change: { status: RealtimeStatus } + mode_change: { mode: RealtimeMode } + transcript: { + role: 'user' | 'assistant' + transcript: string + isFinal: boolean + } + audio_chunk: { data: ArrayBuffer; sampleRate: number } + tool_call: { toolCallId: string; toolName: string; input: unknown } + message_complete: { message: RealtimeMessage } + interrupted: { messageId?: string } + error: { error: Error } +} + +/** + * Handler type for realtime events + */ +export type RealtimeEventHandler = ( + payload: RealtimeEventPayloads[E], +) => void + +// ============================================================================ +// Error Types +// ============================================================================ + +/** + * Error codes for realtime errors + */ +export type RealtimeErrorCode = + | 'TOKEN_EXPIRED' + | 'CONNECTION_FAILED' + | 'PERMISSION_DENIED' + | 'PROVIDER_ERROR' + | 'UNKNOWN' + +/** + * Extended error with realtime-specific information 
+ */ +export interface RealtimeError extends Error { + code: RealtimeErrorCode + provider?: string + details?: unknown +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index bac957175..f53f865f5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -201,6 +201,9 @@ importers: '@tanstack/ai-client': specifier: workspace:* version: link:../../packages/typescript/ai-client + '@tanstack/ai-elevenlabs': + specifier: workspace:* + version: link:../../packages/typescript/ai-elevenlabs '@tanstack/ai-gemini': specifier: workspace:* version: link:../../packages/typescript/ai-gemini @@ -685,6 +688,22 @@ importers: specifier: ^2.11.10 version: 2.11.10(solid-js@1.9.10)(vite@7.2.7(@types/node@25.0.1)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) + packages/typescript/ai-elevenlabs: + dependencies: + '@11labs/client': + specifier: ^0.2.0 + version: 0.2.0(@types/dom-mediacapture-record@1.0.22) + devDependencies: + '@tanstack/ai': + specifier: workspace:* + version: link:../ai + '@tanstack/ai-client': + specifier: workspace:* + version: link:../ai-client + '@vitest/coverage-v8': + specifier: 4.0.14 + version: 4.0.14(vitest@4.0.17(@types/node@25.0.1)(happy-dom@20.0.11)(jiti@2.6.1)(jsdom@27.3.0(postcss@8.5.6))(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) + packages/typescript/ai-gemini: dependencies: '@google/genai': @@ -745,6 +764,9 @@ importers: '@tanstack/ai': specifier: workspace:* version: link:../ai + '@tanstack/ai-client': + specifier: workspace:* + version: link:../ai-client '@vitest/coverage-v8': specifier: 4.0.14 version: 4.0.14(vitest@4.0.17(@types/node@25.0.1)(happy-dom@20.0.11)(jiti@2.6.1)(jsdom@27.3.0(postcss@8.5.6))(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) @@ -1317,6 +1339,10 @@ importers: packages: + '@11labs/client@0.2.0': + resolution: {integrity: sha512-GBplAV4WDbcoThsIzdSDPN3xbcitK0ZZ4iJfJZKfltqvgvS6Uw8GZxHwVgiPwnQoA3uosYyY3L9TuPwmel18xQ==} + deprecated: This package is no longer maintained. 
Please use @elevenlabs/client for the latest version + '@acemir/cssom@0.9.29': resolution: {integrity: sha512-G90x0VW+9nW4dFajtjCoT+NM0scAfH9Mb08IcjgFHYbfiL/lU04dTF9JuVOi3/OH+DJCQdcIseSXkdCB9Ky6JA==} @@ -1500,6 +1526,9 @@ packages: resolution: {integrity: sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA==} engines: {node: '>=18'} + '@bufbuild/protobuf@1.10.1': + resolution: {integrity: sha512-wJ8ReQbHxsAfXhrf9ixl0aYbZorRuOWpBNzm8pL8ftmSxQx/wnJD5Eg861NwJU/czy2VXFIebCeZnZrI9rktIQ==} + '@changesets/apply-release-plan@7.0.14': resolution: {integrity: sha512-ddBvf9PHdy2YY0OUiEl3TV78mH9sckndJR14QAt87KLEbIov81XO0q0QAmvooBxXlqRRP8I9B7XOzZwQG7JkWA==} @@ -2362,6 +2391,12 @@ packages: '@jridgewell/trace-mapping@0.3.31': resolution: {integrity: sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==} + '@livekit/mutex@1.1.1': + resolution: {integrity: sha512-EsshAucklmpuUAfkABPxJNhzj9v2sG7JuzFDL4ML1oJQSV14sqrpTYnsaOudMAw9yOaW53NU3QQTlUQoRs4czw==} + + '@livekit/protocol@1.42.2': + resolution: {integrity: sha512-0jeCwoMJKcwsZICg5S6RZM4xhJoF78qMvQELjACJQn6/VB+jmiySQKOSELTXvPBVafHfEbMlqxUw2UR1jTXs2g==} + '@manypkg/find-root@1.1.0': resolution: {integrity: sha512-mki5uBvhHzO8kYYix/WRy2WX8S3B5wdVSc9D6KcU5lQNglP2yt58/VfLuAK49glRXChosY8ap2oJ1qgma3GUVA==} @@ -3887,6 +3922,9 @@ packages: '@types/deep-eql@4.0.2': resolution: {integrity: sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==} + '@types/dom-mediacapture-record@1.0.22': + resolution: {integrity: sha512-mUMZLK3NvwRLcAAT9qmcK+9p7tpU2FHdDsntR3YI4+GY88XrgG4XiE7u1Q2LAN2/FZOz/tdMDC3GQCR4T8nFuw==} + '@types/estree-jsx@1.0.5': resolution: {integrity: sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==} @@ -6017,6 +6055,9 @@ packages: jju@1.4.0: resolution: {integrity: sha512-8wb9Yw966OSxApiCt0K3yNJL8pnNeIv+OEq2YMidz4FKP6nonSRoOXc80iXY4JaN2FC11B9qsNmDsm+ZOfMROA==} + 
jose@6.1.3: + resolution: {integrity: sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==} + js-beautify@1.15.4: resolution: {integrity: sha512-9/KXeZUKKJwqCXUdBxFJ3vPh467OCckSBmYDwSK/EtV090K+iMJ7zx2S3HLVDIWFQdqMIsZWbnaGiba18aWhaA==} engines: {node: '>=14'} @@ -6217,6 +6258,11 @@ packages: resolution: {integrity: sha512-I8oW2+QL5KJo8zXNWX046M134WchxsXC7SawLPvRQpogCbkyQIaFxPE89A2HiwR7vAK2Dm2ERBAmyjTYGYEpBg==} hasBin: true + livekit-client@2.17.0: + resolution: {integrity: sha512-BD1QUS44ancVTBdnAher0aO7DV5holFYH2lYradYT/HgXtn6R8xPyvtDAH3UH40jGcesDo9fEopCFwEdOgrIhg==} + peerDependencies: + '@types/dom-mediacapture-record': ^1 + local-pkg@0.5.1: resolution: {integrity: sha512-9rrA30MRRP3gBD3HTGnC6cDFpaE1kVDWxWgqWJUN0RvDNAo+Nz/9GxB+nHOH0ifbVFy0hSA1V6vFDvnx54lTEQ==} engines: {node: '>=14'} @@ -6255,6 +6301,10 @@ packages: resolution: {integrity: sha512-8XPvpAA8uyhfteu8pIvQxpJZ7SYYdpUivZpGy6sFsBuKRY/7rQGavedeB8aK+Zkyq6upMFVL/9AW6vOYzfRyLg==} engines: {node: '>=10'} + loglevel@1.9.2: + resolution: {integrity: sha512-HgMmCqIJSAKqo68l0rS2AanEWfkxaZ5wNiEFb5ggm08lDs9Xl2KxBlX3PTcaD2chBM1gXAYf491/M2Rv8Jwayg==} + engines: {node: '>= 0.6.0'} + longest-streak@3.1.0: resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==} @@ -7270,6 +7320,13 @@ packages: scule@1.3.0: resolution: {integrity: sha512-6FtHJEvt+pVMIB9IBY+IcCJ6Z5f1iQnytgyfKMhDKgmzYG+TeH/wx1y3l27rshSbLiSanrR9ffZDrEsmjlQF2g==} + sdp-transform@2.15.0: + resolution: {integrity: sha512-KrOH82c/W+GYQ0LHqtr3caRpM3ITglq3ljGUIb8LTki7ByacJZ9z+piSGiwZDsRyhQbYBOBJgr2k6X4BZXi3Kw==} + hasBin: true + + sdp@3.2.1: + resolution: {integrity: sha512-lwsAIzOPlH8/7IIjjz3K0zYBk7aBVVcvjMwt3M4fLxpjMYyy7i3I97SLHebgn4YBjirkzfp3RvRDWSKsh/+WFw==} + semver@6.3.1: resolution: {integrity: sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==} hasBin: true @@ -7718,6 +7775,9 @@ packages: 
peerDependencies: typescript: '>=4.8.4' + ts-debounce@4.0.0: + resolution: {integrity: sha512-+1iDGY6NmOGidq7i7xZGA4cm8DAa6fqdYcvO5Z6yBevH++Bdo9Qt/mN0TzHUgcCcKv1gmh9+W5dHqz8pMWbCbg==} + ts-declaration-location@1.0.7: resolution: {integrity: sha512-EDyGAwH1gO0Ausm9gV6T2nUvBgXT5kGoCMJPllOaooZ+4VvJiKBdZE7wK18N1deEowhcUptS+5GXZK8U/fvpwA==} peerDependencies: @@ -7787,6 +7847,9 @@ packages: resolution: {integrity: sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==} engines: {node: '>= 0.6'} + typed-emitter@2.1.0: + resolution: {integrity: sha512-g/KzbYKbH5C2vPkaXGu8DJlHrGKHLsM25Zg9WuC9pMGfuvT+X25tZQWo5fK1BjBm8+UrVE9LDCvaY0CQk+fXDA==} + typedoc-plugin-frontmatter@1.3.0: resolution: {integrity: sha512-xYQFMAecMlsRUjmf9oM/Sq2FVz4zlgcbIeVFNLdO118CHTN06gIKJNSlyExh9+Xl8sK0YhIvoQwViUURxritWA==} peerDependencies: @@ -8327,6 +8390,10 @@ packages: webpack-virtual-modules@0.6.2: resolution: {integrity: sha512-66/V2i5hQanC51vBQKPH4aI8NMAcBW59FVBs+rC7eGHupMyfn34q7rZIE+ETlJ+XTevqfUhVVBgSUNSW2flEUQ==} + webrtc-adapter@9.0.3: + resolution: {integrity: sha512-5fALBcroIl31OeXAdd1YUntxiZl1eHlZZWzNg3U4Fn+J9/cGL3eT80YlrsWGvj2ojuz1rZr2OXkgCzIxAZ7vRQ==} + engines: {node: '>=6.0.0', npm: '>=3.10.0'} + whatwg-encoding@3.1.1: resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==} engines: {node: '>=18'} @@ -8477,6 +8544,12 @@ packages: snapshots: + '@11labs/client@0.2.0(@types/dom-mediacapture-record@1.0.22)': + dependencies: + livekit-client: 2.17.0(@types/dom-mediacapture-record@1.0.22) + transitivePeerDependencies: + - '@types/dom-mediacapture-record' + '@acemir/cssom@0.9.29': {} '@alcyone-labs/zod-to-json-schema@4.0.10(zod@4.2.1)': @@ -8717,6 +8790,8 @@ snapshots: '@bcoe/v8-coverage@1.0.2': {} + '@bufbuild/protobuf@1.10.1': {} + '@changesets/apply-release-plan@7.0.14': dependencies: '@changesets/config': 3.1.2 @@ -9379,6 +9454,12 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 
'@jridgewell/sourcemap-codec': 1.5.5 + '@livekit/mutex@1.1.1': {} + + '@livekit/protocol@1.42.2': + dependencies: + '@bufbuild/protobuf': 1.10.1 + '@manypkg/find-root@1.1.0': dependencies: '@babel/runtime': 7.28.4 @@ -11380,6 +11461,8 @@ snapshots: '@types/deep-eql@4.0.2': {} + '@types/dom-mediacapture-record@1.0.22': {} + '@types/estree-jsx@1.0.5': dependencies: '@types/estree': 1.0.8 @@ -13896,6 +13979,8 @@ snapshots: jju@1.4.0: {} + jose@6.1.3: {} + js-beautify@1.15.4: dependencies: config-chain: 1.1.13 @@ -14113,6 +14198,20 @@ snapshots: untun: 0.1.3 uqr: 0.1.2 + livekit-client@2.17.0(@types/dom-mediacapture-record@1.0.22): + dependencies: + '@livekit/mutex': 1.1.1 + '@livekit/protocol': 1.42.2 + '@types/dom-mediacapture-record': 1.0.22 + events: 3.3.0 + jose: 6.1.3 + loglevel: 1.9.2 + sdp-transform: 2.15.0 + ts-debounce: 4.0.0 + tslib: 2.8.1 + typed-emitter: 2.1.0 + webrtc-adapter: 9.0.3 + local-pkg@0.5.1: dependencies: mlly: 1.8.0 @@ -14149,6 +14248,8 @@ snapshots: chalk: 4.1.2 is-unicode-supported: 0.1.0 + loglevel@1.9.2: {} + longest-streak@3.1.0: {} lowlight@3.3.0: @@ -15551,6 +15652,10 @@ snapshots: scule@1.3.0: {} + sdp-transform@2.15.0: {} + + sdp@3.2.1: {} + semver@6.3.1: {} semver@7.5.4: @@ -16031,6 +16136,8 @@ snapshots: dependencies: typescript: 5.9.3 + ts-debounce@4.0.0: {} + ts-declaration-location@1.0.7(typescript@5.9.3): dependencies: picomatch: 4.0.3 @@ -16098,6 +16205,10 @@ snapshots: media-typer: 1.1.0 mime-types: 3.0.2 + typed-emitter@2.1.0: + optionalDependencies: + rxjs: 7.8.2 + typedoc-plugin-frontmatter@1.3.0(typedoc-plugin-markdown@4.9.0(typedoc@0.28.14(typescript@5.9.3))): dependencies: typedoc-plugin-markdown: 4.9.0(typedoc@0.28.14(typescript@5.9.3)) @@ -16769,6 +16880,10 @@ snapshots: webpack-virtual-modules@0.6.2: {} + webrtc-adapter@9.0.3: + dependencies: + sdp: 3.2.1 + whatwg-encoding@3.1.1: dependencies: iconv-lite: 0.6.3 From 71ad066469625d12de3c7108b9ae5c92a93497e4 Mon Sep 17 00:00:00 2001 From: Jack Herrington Date: Tue, 20 
Jan 2026 07:10:18 -0800 Subject: [PATCH 2/5] Realtime chat basically working --- docs/architecture/realtime-chat.md | 313 +++++++++++++++++++++++++++++ 1 file changed, 313 insertions(+) create mode 100644 docs/architecture/realtime-chat.md diff --git a/docs/architecture/realtime-chat.md b/docs/architecture/realtime-chat.md new file mode 100644 index 000000000..305029c3e --- /dev/null +++ b/docs/architecture/realtime-chat.md @@ -0,0 +1,313 @@ +--- +title: Realtime Voice Chat Architecture +id: realtime-chat-architecture +--- + +# Realtime Voice Chat Architecture + +This document describes the architecture of TanStack AI's realtime voice-to-voice chat capability, which enables browser-based voice conversations with AI models. + +## Overview + +The realtime chat system provides a vendor-neutral, type-safe abstraction for voice-to-voice AI interactions. It currently supports: + +- **OpenAI Realtime API** - WebRTC-based connection with GPT-4o realtime models +- **ElevenLabs Conversational AI** - SDK-based connection for voice conversations + +## Architecture Layers + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ React Application │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ useRealtimeChat() │ │ +│ │ - Connection state (status, mode) │ │ +│ │ - Messages & transcripts │ │ +│ │ - Audio visualization (levels, waveforms) │ │ +│ │ - Control methods (connect, disconnect, interrupt) │ │ +│ └─────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ @tanstack/ai-client │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ RealtimeClient │ │ +│ │ - Connection lifecycle management │ │ +│ │ - Token refresh scheduling │ │ +│ │ - Event subscription & dispatch │ │ +│ │ - Tool execution coordination │ │ +│ 
└─────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Provider Adapters │ +│ ┌──────────────────────┐ ┌──────────────────────┐ │ +│ │ openaiRealtime() │ │ elevenlabsRealtime() │ │ +│ │ - WebRTC connection │ │ - SDK wrapper │ │ +│ │ - Audio I/O │ │ - Signed URL auth │ │ +│ │ - Event mapping │ │ - Event mapping │ │ +│ └──────────────────────┘ └──────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Server-Side │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Token Generation Endpoint │ │ +│ │ - openaiRealtimeToken() - ephemeral client secrets │ │ +│ │ - elevenlabsRealtimeToken() - signed URLs │ │ +│ └─────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Key Components + +### 1. Token Adapters (Server-Side) + +Token adapters generate short-lived credentials for client-side connections. This keeps API keys secure on the server. + +```typescript +// Server-side token endpoint +import { realtimeToken } from '@tanstack/ai' +import { openaiRealtimeToken } from '@tanstack/ai-openai' + +const token = await realtimeToken({ + adapter: openaiRealtimeToken({ + model: 'gpt-4o-realtime-preview', + voice: 'alloy', + instructions: 'You are a helpful assistant.', + turnDetection: { + type: 'server_vad', + threshold: 0.5, + silence_duration_ms: 500, + }, + }), +}) +``` + +**Token Structure:** +```typescript +interface RealtimeToken { + provider: string // 'openai' | 'elevenlabs' + token: string // Ephemeral token or signed URL + expiresAt: number // Expiration timestamp (ms) + config: RealtimeSessionConfig // Session configuration +} +``` + +### 2. 
Client Adapters (Browser-Side) + +Client adapters handle the actual connection to provider APIs, managing: +- WebRTC or WebSocket connections +- Audio capture and playback +- Event translation to common format +- Audio visualization data + +```typescript +// Client-side adapter usage +import { openaiRealtime } from '@tanstack/ai-openai' + +const adapter = openaiRealtime({ + connectionMode: 'webrtc', // default +}) +``` + +### 3. RealtimeClient + +The `RealtimeClient` class manages the connection lifecycle: + +- **Connection Management**: Connect, disconnect, reconnect +- **Token Refresh**: Automatically refreshes tokens before expiry +- **Event Handling**: Subscribes to adapter events and dispatches to callbacks +- **State Management**: Tracks status, mode, messages, transcripts +- **Tool Execution**: Coordinates client-side tool calls + +### 4. useRealtimeChat Hook + +The React hook provides a reactive interface: + +```typescript +const { + // Connection state + status, // 'idle' | 'connecting' | 'connected' | 'reconnecting' | 'error' + error, + connect, + disconnect, + + // Conversation state + mode, // 'idle' | 'listening' | 'thinking' | 'speaking' + messages, + pendingUserTranscript, + pendingAssistantTranscript, + + // Voice control + startListening, + stopListening, + interrupt, + + // Audio visualization + inputLevel, + outputLevel, + getInputTimeDomainData, + getOutputTimeDomainData, +} = useRealtimeChat({ + getToken: () => fetch('/api/realtime-token').then(r => r.json()), + adapter: openaiRealtime(), +}) +``` + +## Connection Flow + +### OpenAI WebRTC Flow + +```mermaid +sequenceDiagram + participant Browser + participant Server + participant OpenAI + + Browser->>Server: POST /api/realtime-token + Server->>OpenAI: POST /v1/realtime/sessions + OpenAI-->>Server: { client_secret, expires_at } + Server-->>Browser: RealtimeToken + + Browser->>Browser: getUserMedia() - request mic + Browser->>Browser: Create RTCPeerConnection + Browser->>Browser: Add audio track 
to PC + Browser->>Browser: createOffer() + + Browser->>OpenAI: POST /v1/realtime?model=... + Note right of Browser: SDP offer + ephemeral token + OpenAI-->>Browser: SDP answer + + Browser->>Browser: setRemoteDescription() + Note over Browser,OpenAI: WebRTC connection established + + Browser->>OpenAI: Audio via WebRTC + OpenAI-->>Browser: Audio + events via WebRTC +``` + +### ElevenLabs Flow + +```mermaid +sequenceDiagram + participant Browser + participant Server + participant ElevenLabs + + Browser->>Server: POST /api/realtime-token + Server->>ElevenLabs: POST /v1/convai/conversation/get_signed_url + ElevenLabs-->>Server: { signed_url } + Server-->>Browser: RealtimeToken + + Browser->>ElevenLabs: Conversation.startSession(signedUrl) + Note over Browser,ElevenLabs: SDK handles WebSocket/WebRTC + + Browser->>ElevenLabs: Audio via SDK + ElevenLabs-->>Browser: Audio + events via SDK +``` + +## Audio Visualization + +The system provides real-time audio visualization through the `AudioVisualization` interface: + +```typescript +interface AudioVisualization { + inputLevel: number // 0-1 normalized input volume (RMS) + outputLevel: number // 0-1 normalized output volume (RMS) + getInputFrequencyData(): Uint8Array // FFT frequency bins + getOutputFrequencyData(): Uint8Array + getInputTimeDomainData(): Uint8Array // Raw waveform samples + getOutputTimeDomainData(): Uint8Array + inputSampleRate: number + outputSampleRate: number +} +``` + +The OpenAI adapter uses Web Audio API `AnalyserNode` for visualization: +- `fftSize: 2048` for high-resolution analysis +- RMS (Root Mean Square) calculation for accurate volume levels +- Separate analysers for input (microphone) and output (AI voice) + +## Event System + +Adapters emit standardized events: + +| Event | Payload | Description | +|-------|---------|-------------| +| `status_change` | `{ status }` | Connection status changed | +| `mode_change` | `{ mode }` | Conversation mode changed | +| `transcript` | `{ role, transcript, 
isFinal }` | Speech-to-text update | +| `message_complete` | `{ message }` | Full message received | +| `tool_call` | `{ toolCallId, toolName, input }` | Tool invocation requested | +| `interrupted` | `{ messageId? }` | Response was interrupted | +| `error` | `{ error }` | Error occurred | + +## Current Status + +### Implemented Features + +- [x] OpenAI Realtime API integration (WebRTC) +- [x] ElevenLabs Conversational AI integration +- [x] Token generation and refresh +- [x] Audio capture and playback +- [x] Real-time transcription display +- [x] Audio visualization (levels, waveforms) +- [x] Interrupt capability +- [x] React hook (`useRealtimeChat`) +- [x] Demo application at `/realtime` route + +### Known Limitations + +- **Device Selection**: Currently uses system default audio devices. Custom device selection not yet implemented. +- **ElevenLabs SDK**: Using `@11labs/client@0.2.0` which has limited TypeScript support. +- **Push-to-Talk**: Manual VAD mode implemented but not exposed in demo UI. +- **Tool Calling**: Framework supports tools but demo doesn't showcase them. + +### Demo Application + +The `examples/ts-react-chat` application includes a realtime voice chat demo at the `/realtime` route: + +**Features:** +- Provider selection (OpenAI / ElevenLabs) +- Connection status indicator +- Conversation mode indicator (Listening/Thinking/Speaking) +- Message history with transcripts +- Audio level meters +- Waveform visualization (debug mode) +- Interrupt button during AI speech + +**Required Environment Variables:** +```bash +OPENAI_API_KEY=sk-... +ELEVENLABS_API_KEY=xi-... # Optional, for ElevenLabs +ELEVENLABS_AGENT_ID=... 
# Optional, for ElevenLabs +``` + +## Files Reference + +### Core Types +- `packages/typescript/ai/src/realtime/types.ts` - Core type definitions +- `packages/typescript/ai-client/src/realtime-types.ts` - Client-side types + +### Token Generation (Server) +- `packages/typescript/ai/src/realtime/index.ts` - `realtimeToken()` function +- `packages/typescript/ai-openai/src/realtime/token.ts` - OpenAI token adapter +- `packages/typescript/ai-elevenlabs/src/realtime/token.ts` - ElevenLabs token adapter + +### Client Adapters +- `packages/typescript/ai-openai/src/realtime/adapter.ts` - OpenAI WebRTC adapter +- `packages/typescript/ai-elevenlabs/src/realtime/adapter.ts` - ElevenLabs SDK adapter + +### Client Library +- `packages/typescript/ai-client/src/realtime-client.ts` - RealtimeClient class + +### React Integration +- `packages/typescript/ai-react/src/use-realtime-chat.ts` - React hook +- `packages/typescript/ai-react/src/realtime-types.ts` - Hook types + +### Demo Application +- `examples/ts-react-chat/src/routes/realtime.tsx` - Demo UI component +- `examples/ts-react-chat/src/routes/api.realtime-token.ts` - Token API endpoint From 5ad2d2c035afb3426f1f8f3bfd2862186f063b23 Mon Sep 17 00:00:00 2001 From: Jack Herrington Date: Wed, 21 Jan 2026 16:15:17 -0800 Subject: [PATCH 3/5] more updates --- .../ts-react-chat/src/lib/realtime-tools.ts | 141 ++++++++++++++++++ .../src/routes/api.realtime-token.ts | 50 ++++++- .../ts-react-chat/src/routes/realtime.tsx | 95 ++++-------- .../ai-openai/src/realtime/adapter.ts | 29 ++-- 4 files changed, 230 insertions(+), 85 deletions(-) create mode 100644 examples/ts-react-chat/src/lib/realtime-tools.ts diff --git a/examples/ts-react-chat/src/lib/realtime-tools.ts b/examples/ts-react-chat/src/lib/realtime-tools.ts new file mode 100644 index 000000000..2cf28d7bd --- /dev/null +++ b/examples/ts-react-chat/src/lib/realtime-tools.ts @@ -0,0 +1,141 @@ +import { toolDefinition } from '@tanstack/ai' +import { z } from 'zod' + +// Tool to get 
current time - useful for voice assistants +export const getCurrentTimeToolDef = toolDefinition({ + name: 'getCurrentTime', + description: 'Get the current date and time. Use this when the user asks what time it is or the current date.', + inputSchema: z.object({ + timezone: z.string().optional().describe('Optional timezone like "America/New_York" or "Europe/London"'), + }), + outputSchema: z.object({ + time: z.string(), + date: z.string(), + timezone: z.string(), + }), +}) + +// Tool to get weather - common voice assistant use case +export const getWeatherToolDef = toolDefinition({ + name: 'getWeather', + description: 'Get the current weather for a location. Use this when the user asks about the weather.', + inputSchema: z.object({ + location: z.string().describe('The city and state/country, e.g. "San Francisco, CA" or "London, UK"'), + }), + outputSchema: z.object({ + location: z.string(), + temperature: z.number(), + unit: z.string(), + condition: z.string(), + humidity: z.number(), + }), +}) + +// Tool to set a reminder - demonstrates user interaction +export const setReminderToolDef = toolDefinition({ + name: 'setReminder', + description: 'Set a reminder for the user. Use this when the user asks to be reminded about something.', + inputSchema: z.object({ + message: z.string().describe('What to remind the user about'), + inMinutes: z.number().describe('How many minutes from now to remind'), + }), + outputSchema: z.object({ + success: z.boolean(), + message: z.string(), + remindAt: z.string(), + }), +}) + +// Tool to search knowledge base - useful for assistants with specific knowledge +export const searchKnowledgeToolDef = toolDefinition({ + name: 'searchKnowledge', + description: 'Search a knowledge base for information. 
Use this to find specific facts or documentation.', + inputSchema: z.object({ + query: z.string().describe('The search query'), + }), + outputSchema: z.object({ + results: z.array(z.object({ + title: z.string(), + snippet: z.string(), + })), + }), +}) + +// Client-side implementation of getCurrentTime +export const getCurrentTimeClient = getCurrentTimeToolDef.client(({ timezone }) => { + const now = new Date() + const tz = timezone || Intl.DateTimeFormat().resolvedOptions().timeZone + + return { + time: now.toLocaleTimeString('en-US', { timeZone: tz }), + date: now.toLocaleDateString('en-US', { + weekday: 'long', + year: 'numeric', + month: 'long', + day: 'numeric', + timeZone: tz, + }), + timezone: tz, + } +}) + +// Client-side implementation of getWeather (mock data for demo) +export const getWeatherClient = getWeatherToolDef.client(({ location }) => { + // Mock weather data for demo purposes + const conditions = ['Sunny', 'Partly Cloudy', 'Cloudy', 'Rainy', 'Snowy'] + const randomCondition = conditions[Math.floor(Math.random() * conditions.length)]! 
+ const randomTemp = Math.floor(Math.random() * 30) + 50 // 50-80°F + const randomHumidity = Math.floor(Math.random() * 50) + 30 // 30-80% + + return { + location, + temperature: randomTemp, + unit: 'F', + condition: randomCondition, + humidity: randomHumidity, + } +}) + +// Client-side implementation of setReminder +export const setReminderClient = setReminderToolDef.client(({ message, inMinutes }) => { + const remindAt = new Date(Date.now() + inMinutes * 60 * 1000) + + // In a real app, you'd schedule a notification here + console.log(`[Reminder] Will remind about "${message}" at ${remindAt.toLocaleTimeString()}`) + + // For demo purposes, show an alert after the specified time + setTimeout(() => { + alert(`Reminder: ${message}`) + }, inMinutes * 60 * 1000) + + return { + success: true, + message: `Reminder set: "${message}"`, + remindAt: remindAt.toLocaleTimeString(), + } +}) + +// Client-side implementation of searchKnowledge (mock data for demo) +export const searchKnowledgeClient = searchKnowledgeToolDef.client(({ query }) => { + // Mock search results for demo + const mockResults = [ + { + title: `Result for: ${query}`, + snippet: `This is a mock search result for the query "${query}". 
In a real application, this would return actual search results from a knowledge base.`, + }, + { + title: 'Additional Information', + snippet: 'More relevant information would appear here based on your search query.', + }, + ] + + return { results: mockResults } +}) + +// Export all client tools as an array for easy use +export const realtimeClientTools = [ + getCurrentTimeClient, + getWeatherClient, + setReminderClient, + searchKnowledgeClient, +] as const diff --git a/examples/ts-react-chat/src/routes/api.realtime-token.ts b/examples/ts-react-chat/src/routes/api.realtime-token.ts index 0f7a46bda..c9367ad19 100644 --- a/examples/ts-react-chat/src/routes/api.realtime-token.ts +++ b/examples/ts-react-chat/src/routes/api.realtime-token.ts @@ -2,9 +2,36 @@ import { createFileRoute } from '@tanstack/react-router' import { realtimeToken } from '@tanstack/ai' import { openaiRealtimeToken } from '@tanstack/ai-openai' import { elevenlabsRealtimeToken } from '@tanstack/ai-elevenlabs' +import { + getCurrentTimeToolDef, + getWeatherToolDef, + setReminderToolDef, + searchKnowledgeToolDef, +} from '@/lib/realtime-tools' +import * as z from 'zod' type Provider = 'openai' | 'elevenlabs' +// Convert tool definitions to OpenAI's format using Zod's native toJSONSchema +function toolDefToOpenAI(toolDef: { name: string; description: string; inputSchema?: unknown }) { + let parameters: Record = { type: 'object', properties: {} } + + if (toolDef.inputSchema) { + // Use Zod's native toJSONSchema for Zod v4+ + const jsonSchema = z.toJSONSchema(toolDef.inputSchema as z.ZodType) + // Remove $schema as OpenAI doesn't need it + const { $schema, ...rest } = jsonSchema as Record + parameters = rest + } + + return { + type: 'function' as const, + name: toolDef.name, + description: toolDef.description, + parameters, + } +} + export const Route = createFileRoute('/api/realtime-token')({ server: { handlers: { @@ -16,14 +43,29 @@ export const Route = createFileRoute('/api/realtime-token')({ let 
token if (provider === 'openai') { + // Convert tool definitions to OpenAI format + const tools = [ + toolDefToOpenAI(getCurrentTimeToolDef), + toolDefToOpenAI(getWeatherToolDef), + toolDefToOpenAI(setReminderToolDef), + toolDefToOpenAI(searchKnowledgeToolDef), + ] + token = await realtimeToken({ adapter: openaiRealtimeToken({ model: 'gpt-4o-realtime-preview', voice: 'alloy', - instructions: `You are a helpful, friendly assistant. - + instructions: `You are a helpful, friendly voice assistant with access to several tools. + +You can: +- Tell the user the current time and date (getCurrentTime) +- Get weather information for any location (getWeather) +- Set reminders for the user (setReminder) +- Search a knowledge base for information (searchKnowledge) + Keep your responses concise and conversational since this is a voice interface. -Be natural and engaging in your responses.`, +When using tools, briefly explain what you're doing and then share the results naturally. +Be friendly and engaging!`, turnDetection: { type: 'server_vad', threshold: 0.5, @@ -33,6 +75,8 @@ Be natural and engaging in your responses.`, inputAudioTranscription: { model: 'whisper-1', }, + tools, + toolChoice: 'auto', }), }) } else if (provider === 'elevenlabs') { diff --git a/examples/ts-react-chat/src/routes/realtime.tsx b/examples/ts-react-chat/src/routes/realtime.tsx index fcc954cb1..4be974a8f 100644 --- a/examples/ts-react-chat/src/routes/realtime.tsx +++ b/examples/ts-react-chat/src/routes/realtime.tsx @@ -3,7 +3,8 @@ import { createFileRoute } from '@tanstack/react-router' import { useRealtimeChat } from '@tanstack/ai-react' import { openaiRealtime } from '@tanstack/ai-openai' import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' -import { Mic, MicOff, Phone, PhoneOff, Volume2 } from 'lucide-react' +import { Mic, MicOff, Phone, PhoneOff, Volume2, Wrench } from 'lucide-react' +import { realtimeClientTools } from '@/lib/realtime-tools' type Provider = 'openai' | 'elevenlabs' @@ 
-98,56 +99,9 @@ function AudioSparkline({ ) } -// Debug component to show raw audio data stats -function AudioDebug({ - getData, - label -}: { - getData: () => Uint8Array - label: string -}) { - const [stats, setStats] = useState({ length: 0, min: 0, max: 0, allSame: true, sample: '[]' }) - - useEffect(() => { - function update() { - const data = getData() - const min = Math.min(...data) - const max = Math.max(...data) - const allSame = data.every(v => v === data[0]) - // Get a few samples from different parts of the array - const samples = data.length > 0 ? [ - data[0], - data[Math.floor(data.length / 4)], - data[Math.floor(data.length / 2)], - data[Math.floor(data.length * 3 / 4)], - data[data.length - 1] - ] : [] - setStats({ - length: data.length, - min, - max, - allSame, - sample: `[${samples.join(', ')}]` - }) - requestAnimationFrame(update) - } - const id = requestAnimationFrame(update) - return () => cancelAnimationFrame(id) - }, [getData]) - - return ( -
- {label}: len={stats.length}, min={stats.min}, max={stats.max}, - {stats.allSame ? ALL SAME! : varying} - {stats.sample} -
- ) -} - function RealtimePage() { const [provider, setProvider] = useState('openai') const [agentId, setAgentId] = useState('') - const [showDebug, setShowDebug] = useState(true) const messagesEndRef = useRef(null) // Get the appropriate adapter based on provider @@ -165,7 +119,6 @@ function RealtimePage() { interrupt, inputLevel, outputLevel, - sendText, getInputTimeDomainData, getOutputTimeDomainData, } = useRealtimeChat({ @@ -186,6 +139,8 @@ function RealtimePage() { return response.json() }, adapter, + // Pass the client tools - these execute locally when the AI calls them + tools: realtimeClientTools, onError: (err) => { console.error('Realtime error:', err) }, @@ -302,15 +257,35 @@ function RealtimePage() {
+ {/* Tools indicator */} + {provider === 'openai' && ( +
+
+ + Tools enabled: + getCurrentTime + + getWeather + + setReminder + + searchKnowledge +
+
+ )} + {/* Messages area */}
{messages.length === 0 && status === 'idle' && (
-

Voice Chat

+

Voice Chat with Tools

Click "Start Conversation" to begin talking with the AI

+

+ Try asking: "What time is it?" or "What's the weather in San Francisco?" +

)} @@ -437,26 +412,6 @@ function RealtimePage() { label="Output" />
- - {/* Debug info */} - {showDebug && ( -
-
- Audio Debug - -
- - -
- inputLevel: {inputLevel.toFixed(4)}, outputLevel: {outputLevel.toFixed(4)} -
-
- )}
)} diff --git a/packages/typescript/ai-openai/src/realtime/adapter.ts b/packages/typescript/ai-openai/src/realtime/adapter.ts index 51745a26c..645a5e716 100644 --- a/packages/typescript/ai-openai/src/realtime/adapter.ts +++ b/packages/typescript/ai-openai/src/realtime/adapter.ts @@ -513,33 +513,38 @@ async function createWebRTCConnection( // Log analyser state for debugging console.log('[Realtime] getAudioVisualization called, inputAnalyser:', !!inputAnalyser, 'outputAnalyser:', !!outputAnalyser) - // Helper to calculate RMS (Root Mean Square) from time domain data - // This gives a better measure of perceived loudness than frequency data - function calculateRMS(analyser: AnalyserNode): number { + // Helper to calculate audio level from time domain data + // Uses peak amplitude which is more responsive for voice audio meters + function calculateLevel(analyser: AnalyserNode): number { const data = new Uint8Array(analyser.fftSize) analyser.getByteTimeDomainData(data) - // Calculate RMS - values are 0-255 with 128 being silence - let sumSquares = 0 + // Find peak deviation from center (128 is silence) + // This is more responsive than RMS for voice level meters + let maxDeviation = 0 for (const sample of data) { - const normalized = (sample - 128) / 128 // Convert to -1 to 1 range - sumSquares += normalized * normalized + const deviation = Math.abs(sample - 128) + if (deviation > maxDeviation) { + maxDeviation = deviation + } } - const rms = Math.sqrt(sumSquares / data.length) - // Scale and clamp to 0-1 range (RMS of full-scale sine is ~0.707) - return Math.min(1, rms * 1.5) + // Normalize to 0-1 range (max deviation is 128) + // Scale by 1.5x so that ~66% amplitude reads as full scale + // This provides good visual feedback without pegging too early + const normalized = maxDeviation / 128 + return Math.min(1, normalized * 1.5) } return { get inputLevel() { if (!inputAnalyser) return 0 - return calculateRMS(inputAnalyser) + return calculateLevel(inputAnalyser) }, get 
outputLevel() { if (!outputAnalyser) return 0 - return calculateRMS(outputAnalyser) + return calculateLevel(outputAnalyser) }, getInputFrequencyData() { From 41a5e97ff987c6992a847535db54ca16f768b4f0 Mon Sep 17 00:00:00 2001 From: Jack Herrington Date: Thu, 19 Feb 2026 10:18:57 -0800 Subject: [PATCH 4/5] chore: update pnpm-lock.yaml and latest realtime chat changes Co-Authored-By: Warp --- .changeset/realtime-chat.md | 11 ++ examples/ts-react-chat/package.json | 2 +- .../ai-client/src/realtime-client.ts | 2 +- .../ai-client/src/realtime-types.ts | 28 ++--- .../typescript/ai-elevenlabs/package.json | 2 +- .../ai-elevenlabs/src/realtime/adapter.ts | 16 +-- .../ai-elevenlabs/src/realtime/token.ts | 2 +- .../ai-elevenlabs/src/realtime/types.ts | 7 -- .../ai-openai/src/realtime/adapter.ts | 14 +-- .../ai-openai/src/realtime/token.ts | 4 +- .../typescript/ai-react/src/realtime-types.ts | 2 +- packages/typescript/ai/src/realtime/types.ts | 22 ++-- pnpm-lock.yaml | 115 ++++++++++++++++++ 13 files changed, 173 insertions(+), 54 deletions(-) create mode 100644 .changeset/realtime-chat.md diff --git a/.changeset/realtime-chat.md b/.changeset/realtime-chat.md new file mode 100644 index 000000000..11cc3cca1 --- /dev/null +++ b/.changeset/realtime-chat.md @@ -0,0 +1,11 @@ +--- +'@tanstack/ai': minor +'@tanstack/ai-client': minor +'@tanstack/ai-openai': minor +'@tanstack/ai-elevenlabs': minor +'@tanstack/ai-react': minor +--- + +feat: add realtime chat support with OpenAI and ElevenLabs adapters + +Adds realtime voice/text chat capabilities including a provider-agnostic realtime client, OpenAI Realtime API adapter, ElevenLabs conversational AI adapter, React `useRealtimeChat` hook, and shared realtime types across the core, client, and framework packages. 
diff --git a/examples/ts-react-chat/package.json b/examples/ts-react-chat/package.json index cd160a2f8..9580b8e76 100644 --- a/examples/ts-react-chat/package.json +++ b/examples/ts-react-chat/package.json @@ -13,11 +13,11 @@ "@tanstack/ai": "workspace:*", "@tanstack/ai-anthropic": "workspace:*", "@tanstack/ai-client": "workspace:*", + "@tanstack/ai-elevenlabs": "workspace:*", "@tanstack/ai-gemini": "workspace:*", "@tanstack/ai-grok": "workspace:*", "@tanstack/ai-ollama": "workspace:*", "@tanstack/ai-openai": "workspace:*", - "@tanstack/ai-elevenlabs": "workspace:*", "@tanstack/ai-openrouter": "workspace:*", "@tanstack/ai-react": "workspace:*", "@tanstack/ai-react-ui": "workspace:*", diff --git a/packages/typescript/ai-client/src/realtime-client.ts b/packages/typescript/ai-client/src/realtime-client.ts index 0672fe23e..f49d4d25e 100644 --- a/packages/typescript/ai-client/src/realtime-client.ts +++ b/packages/typescript/ai-client/src/realtime-client.ts @@ -1,11 +1,11 @@ import type { + AnyClientTool, AudioVisualization, RealtimeMessage, RealtimeMode, RealtimeStatus, RealtimeToken, } from '@tanstack/ai' -import type { AnyClientTool } from '@tanstack/ai' import type { RealtimeClientOptions, RealtimeClientState, diff --git a/packages/typescript/ai-client/src/realtime-types.ts b/packages/typescript/ai-client/src/realtime-types.ts index 3393ee92c..4ff6bf9bd 100644 --- a/packages/typescript/ai-client/src/realtime-types.ts +++ b/packages/typescript/ai-client/src/realtime-types.ts @@ -1,4 +1,5 @@ import type { + AnyClientTool, AudioVisualization, RealtimeEvent, RealtimeEventHandler, @@ -8,7 +9,6 @@ import type { RealtimeStatus, RealtimeToken, } from '@tanstack/ai' -import type { AnyClientTool } from '@tanstack/ai' // ============================================================================ // Adapter Interface @@ -27,7 +27,7 @@ export interface RealtimeAdapter { * @param token - The ephemeral token from the server * @returns A connection instance */ - connect(token: 
RealtimeToken): Promise + connect: (token: RealtimeToken) => Promise } /** @@ -37,38 +37,38 @@ export interface RealtimeAdapter { export interface RealtimeConnection { // Lifecycle /** Disconnect from the realtime session */ - disconnect(): Promise + disconnect: () => Promise // Audio I/O /** Start capturing audio from the microphone */ - startAudioCapture(): Promise + startAudioCapture: () => Promise /** Stop capturing audio */ - stopAudioCapture(): void + stopAudioCapture: () => void // Text input /** Send a text message (fallback for when voice isn't available) */ - sendText(text: string): void + sendText: (text: string) => void // Tool results /** Send a tool execution result back to the provider */ - sendToolResult(callId: string, result: string): void + sendToolResult: (callId: string, result: string) => void // Session management /** Update session configuration */ - updateSession(config: Partial): void + updateSession: (config: Partial) => void /** Interrupt the current response */ - interrupt(): void + interrupt: () => void // Events /** Subscribe to connection events */ - on( - event: E, - handler: RealtimeEventHandler, - ): () => void + on: ( + event: TEvent, + handler: RealtimeEventHandler, + ) => () => void // Audio visualization /** Get audio visualization data */ - getAudioVisualization(): AudioVisualization + getAudioVisualization: () => AudioVisualization } // ============================================================================ diff --git a/packages/typescript/ai-elevenlabs/package.json b/packages/typescript/ai-elevenlabs/package.json index 0edafe92a..4ff1754e8 100644 --- a/packages/typescript/ai-elevenlabs/package.json +++ b/packages/typescript/ai-elevenlabs/package.json @@ -36,7 +36,7 @@ "lint:fix": "eslint ./src --fix", "test:build": "publint --strict", "test:eslint": "eslint ./src", - "test:lib": "vitest", + "test:lib": "vitest --passWithNoTests", "test:lib:dev": "pnpm test:lib --watch", "test:types": "tsc" }, diff --git 
a/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts index cb193485c..39ee355bd 100644 --- a/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts +++ b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts @@ -59,9 +59,9 @@ async function createElevenLabsConnection( const emptyTimeDomainData = new Uint8Array(128).fill(128) // Helper to emit events - function emit( - event: E, - payload: Parameters>[0], + function emit( + event: TEvent, + payload: Parameters>[0], ) { const handlers = eventHandlers.get(event) if (handlers) { @@ -145,7 +145,7 @@ async function createElevenLabsConnection( emit('mode_change', { mode: 'idle' }) }, - sendText(text: string) { + sendText(_text: string) { // ElevenLabs doesn't support direct text input in the same way // The SDK is voice-first. Log a warning. console.warn( @@ -153,7 +153,7 @@ async function createElevenLabsConnection( ) }, - sendToolResult(callId: string, result: string) { + sendToolResult(_callId: string, _result: string) { // ElevenLabs handles client tools differently - they're registered at session start console.warn( 'ElevenLabs tool results are handled via clientTools option during session creation.', @@ -174,9 +174,9 @@ async function createElevenLabsConnection( emit('interrupted', {}) }, - on( - event: E, - handler: RealtimeEventHandler, + on( + event: TEvent, + handler: RealtimeEventHandler, ): () => void { if (!eventHandlers.has(event)) { eventHandlers.set(event, new Set()) diff --git a/packages/typescript/ai-elevenlabs/src/realtime/token.ts b/packages/typescript/ai-elevenlabs/src/realtime/token.ts index 99989b06b..030d0c9a9 100644 --- a/packages/typescript/ai-elevenlabs/src/realtime/token.ts +++ b/packages/typescript/ai-elevenlabs/src/realtime/token.ts @@ -8,7 +8,7 @@ const ELEVENLABS_API_URL = 'https://api.elevenlabs.io/v1' */ function getElevenLabsApiKey(): string { // Check process.env (Node.js) - if (typeof process !== 'undefined' && 
process.env?.ELEVENLABS_API_KEY) { + if (typeof process !== 'undefined' && process.env.ELEVENLABS_API_KEY) { return process.env.ELEVENLABS_API_KEY } diff --git a/packages/typescript/ai-elevenlabs/src/realtime/types.ts b/packages/typescript/ai-elevenlabs/src/realtime/types.ts index ff2166f08..c3f5227f7 100644 --- a/packages/typescript/ai-elevenlabs/src/realtime/types.ts +++ b/packages/typescript/ai-elevenlabs/src/realtime/types.ts @@ -53,10 +53,3 @@ export interface ElevenLabsClientTool { /** Tool handler function */ handler: (params: TParams) => Promise | TResult } - -/** - * ElevenLabs signed URL response - */ -export interface ElevenLabsSignedUrlResponse { - signed_url: string -} diff --git a/packages/typescript/ai-openai/src/realtime/adapter.ts b/packages/typescript/ai-openai/src/realtime/adapter.ts index 645a5e716..6245a3c05 100644 --- a/packages/typescript/ai-openai/src/realtime/adapter.ts +++ b/packages/typescript/ai-openai/src/realtime/adapter.ts @@ -88,9 +88,9 @@ async function createWebRTCConnection( const emptyTimeDomainData = new Uint8Array(2048).fill(128) // 128 is silence // Helper to emit events (defined early so it can be used during setup) - function emit( - event: E, - payload: Parameters>[0], + function emit( + event: TEvent, + payload: Parameters>[0], ) { const handlers = eventHandlers.get(event) if (handlers) { @@ -264,7 +264,7 @@ async function createWebRTCConnection( // Emit message complete if we have a current message if (currentMessageId) { const response = event.response as Record - const output = response.output as Array> + const output = response.output as Array> | undefined const message: RealtimeMessage = { id: currentMessageId, @@ -495,9 +495,9 @@ async function createWebRTCConnection( emit('interrupted', { messageId: currentMessageId ?? 
undefined }) }, - on( - event: E, - handler: RealtimeEventHandler, + on( + event: TEvent, + handler: RealtimeEventHandler, ): () => void { if (!eventHandlers.has(event)) { eventHandlers.set(event, new Set()) diff --git a/packages/typescript/ai-openai/src/realtime/token.ts b/packages/typescript/ai-openai/src/realtime/token.ts index d226cacbb..e09d8c766 100644 --- a/packages/typescript/ai-openai/src/realtime/token.ts +++ b/packages/typescript/ai-openai/src/realtime/token.ts @@ -1,5 +1,5 @@ +import { getOpenAIApiKeyFromEnv } from '../utils/client' import type { RealtimeToken, RealtimeTokenAdapter, Tool } from '@tanstack/ai' -import { getOpenAIApiKeyFromEnv } from '../utils' import type { OpenAIRealtimeModel, OpenAIRealtimeSessionResponse, @@ -111,7 +111,7 @@ export function openaiRealtimeToken( const sessionData: OpenAIRealtimeSessionResponse = await response.json() // Convert tools to our format - const tools: Array = (sessionData.tools || []).map((t) => ({ + const tools: Array = sessionData.tools.map((t) => ({ name: t.name, description: t.description, inputSchema: t.parameters, diff --git a/packages/typescript/ai-react/src/realtime-types.ts b/packages/typescript/ai-react/src/realtime-types.ts index f3a403464..9b44269d9 100644 --- a/packages/typescript/ai-react/src/realtime-types.ts +++ b/packages/typescript/ai-react/src/realtime-types.ts @@ -1,11 +1,11 @@ import type { + AnyClientTool, RealtimeMessage, RealtimeMode, RealtimeStatus, RealtimeToken, } from '@tanstack/ai' import type { RealtimeAdapter } from '@tanstack/ai-client' -import type { AnyClientTool } from '@tanstack/ai' /** * Options for the useRealtimeChat hook. 
diff --git a/packages/typescript/ai/src/realtime/types.ts b/packages/typescript/ai/src/realtime/types.ts index e563a2ae7..8a4be3a98 100644 --- a/packages/typescript/ai/src/realtime/types.ts +++ b/packages/typescript/ai/src/realtime/types.ts @@ -57,7 +57,7 @@ export interface RealtimeTokenAdapter { /** Provider identifier */ provider: string /** Generate an ephemeral token for client use */ - generateToken(): Promise + generateToken: () => Promise } /** @@ -176,14 +176,14 @@ export interface AudioVisualization { readonly outputLevel: number /** Get frequency data for input audio visualization */ - getInputFrequencyData(): Uint8Array + getInputFrequencyData: () => Uint8Array /** Get frequency data for output audio visualization */ - getOutputFrequencyData(): Uint8Array + getOutputFrequencyData: () => Uint8Array /** Get time domain data for input waveform */ - getInputTimeDomainData(): Uint8Array + getInputTimeDomainData: () => Uint8Array /** Get time domain data for output waveform */ - getOutputTimeDomainData(): Uint8Array + getOutputTimeDomainData: () => Uint8Array /** Input sample rate */ readonly inputSampleRate: number @@ -191,13 +191,13 @@ export interface AudioVisualization { readonly outputSampleRate: number /** Subscribe to raw input audio samples */ - onInputAudio?( + onInputAudio?: ( callback: (samples: Float32Array, sampleRate: number) => void, - ): () => void + ) => () => void /** Subscribe to raw output audio samples */ - onOutputAudio?( + onOutputAudio?: ( callback: (samples: Float32Array, sampleRate: number) => void, - ): () => void + ) => () => void } // ============================================================================ @@ -238,8 +238,8 @@ export interface RealtimeEventPayloads { /** * Handler type for realtime events */ -export type RealtimeEventHandler = ( - payload: RealtimeEventPayloads[E], +export type RealtimeEventHandler = ( + payload: RealtimeEventPayloads[TEvent], ) => void // 
============================================================================ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9b669fda9..cb260e569 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -201,6 +201,9 @@ importers: '@tanstack/ai-client': specifier: workspace:* version: link:../../packages/typescript/ai-client + '@tanstack/ai-elevenlabs': + specifier: workspace:* + version: link:../../packages/typescript/ai-elevenlabs '@tanstack/ai-gemini': specifier: workspace:* version: link:../../packages/typescript/ai-gemini @@ -743,6 +746,22 @@ importers: specifier: ^2.11.10 version: 2.11.10(solid-js@1.9.10)(vite@7.2.7(@types/node@25.0.1)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) + packages/typescript/ai-elevenlabs: + dependencies: + '@11labs/client': + specifier: ^0.2.0 + version: 0.2.0(@types/dom-mediacapture-record@1.0.22) + devDependencies: + '@tanstack/ai': + specifier: workspace:* + version: link:../ai + '@tanstack/ai-client': + specifier: workspace:* + version: link:../ai-client + '@vitest/coverage-v8': + specifier: 4.0.14 + version: 4.0.14(vitest@4.0.18(@types/node@25.0.1)(happy-dom@20.0.11)(jiti@2.6.1)(jsdom@27.3.0(postcss@8.5.6))(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) + packages/typescript/ai-fal: dependencies: '@fal-ai/client': @@ -819,6 +838,9 @@ importers: '@tanstack/ai': specifier: workspace:* version: link:../ai + '@tanstack/ai-client': + specifier: workspace:* + version: link:../ai-client '@vitest/coverage-v8': specifier: 4.0.14 version: 4.0.14(vitest@4.0.18(@types/node@25.0.1)(happy-dom@20.0.11)(jiti@2.6.1)(jsdom@27.3.0(postcss@8.5.6))(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) @@ -1409,6 +1431,10 @@ importers: packages: + '@11labs/client@0.2.0': + resolution: {integrity: sha512-GBplAV4WDbcoThsIzdSDPN3xbcitK0ZZ4iJfJZKfltqvgvS6Uw8GZxHwVgiPwnQoA3uosYyY3L9TuPwmel18xQ==} + deprecated: This package is no longer maintained. 
Please use @elevenlabs/client for the latest version + '@acemir/cssom@0.9.29': resolution: {integrity: sha512-G90x0VW+9nW4dFajtjCoT+NM0scAfH9Mb08IcjgFHYbfiL/lU04dTF9JuVOi3/OH+DJCQdcIseSXkdCB9Ky6JA==} @@ -1601,6 +1627,9 @@ packages: resolution: {integrity: sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA==} engines: {node: '>=18'} + '@bufbuild/protobuf@1.10.1': + resolution: {integrity: sha512-wJ8ReQbHxsAfXhrf9ixl0aYbZorRuOWpBNzm8pL8ftmSxQx/wnJD5Eg861NwJU/czy2VXFIebCeZnZrI9rktIQ==} + '@changesets/apply-release-plan@7.0.14': resolution: {integrity: sha512-ddBvf9PHdy2YY0OUiEl3TV78mH9sckndJR14QAt87KLEbIov81XO0q0QAmvooBxXlqRRP8I9B7XOzZwQG7JkWA==} @@ -2474,6 +2503,12 @@ packages: '@jridgewell/trace-mapping@0.3.31': resolution: {integrity: sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==} + '@livekit/mutex@1.1.1': + resolution: {integrity: sha512-EsshAucklmpuUAfkABPxJNhzj9v2sG7JuzFDL4ML1oJQSV14sqrpTYnsaOudMAw9yOaW53NU3QQTlUQoRs4czw==} + + '@livekit/protocol@1.44.0': + resolution: {integrity: sha512-/vfhDUGcUKO8Q43r6i+5FrDhl5oZjm/X3U4x2Iciqvgn5C8qbj+57YPcWSJ1kyIZm5Cm6AV2nAPjMm3ETD/iyg==} + '@manypkg/find-root@1.1.0': resolution: {integrity: sha512-mki5uBvhHzO8kYYix/WRy2WX8S3B5wdVSc9D6KcU5lQNglP2yt58/VfLuAK49glRXChosY8ap2oJ1qgma3GUVA==} @@ -4367,6 +4402,9 @@ packages: '@types/deep-eql@4.0.2': resolution: {integrity: sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==} + '@types/dom-mediacapture-record@1.0.22': + resolution: {integrity: sha512-mUMZLK3NvwRLcAAT9qmcK+9p7tpU2FHdDsntR3YI4+GY88XrgG4XiE7u1Q2LAN2/FZOz/tdMDC3GQCR4T8nFuw==} + '@types/estree-jsx@1.0.5': resolution: {integrity: sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==} @@ -6555,6 +6593,9 @@ packages: jju@1.4.0: resolution: {integrity: sha512-8wb9Yw966OSxApiCt0K3yNJL8pnNeIv+OEq2YMidz4FKP6nonSRoOXc80iXY4JaN2FC11B9qsNmDsm+ZOfMROA==} + 
jose@6.1.3: + resolution: {integrity: sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==} + js-beautify@1.15.4: resolution: {integrity: sha512-9/KXeZUKKJwqCXUdBxFJ3vPh467OCckSBmYDwSK/EtV090K+iMJ7zx2S3HLVDIWFQdqMIsZWbnaGiba18aWhaA==} engines: {node: '>=14'} @@ -6758,6 +6799,11 @@ packages: resolution: {integrity: sha512-I8oW2+QL5KJo8zXNWX046M134WchxsXC7SawLPvRQpogCbkyQIaFxPE89A2HiwR7vAK2Dm2ERBAmyjTYGYEpBg==} hasBin: true + livekit-client@2.17.2: + resolution: {integrity: sha512-+67y2EtAWZabARlY7kANl/VT1Uu1EJYR5a8qwpT2ub/uBCltsEgEDOxCIMwE9HFR5w+z41HR6GL9hyEvW/y6CQ==} + peerDependencies: + '@types/dom-mediacapture-record': ^1 + local-pkg@0.5.1: resolution: {integrity: sha512-9rrA30MRRP3gBD3HTGnC6cDFpaE1kVDWxWgqWJUN0RvDNAo+Nz/9GxB+nHOH0ifbVFy0hSA1V6vFDvnx54lTEQ==} engines: {node: '>=14'} @@ -6796,6 +6842,10 @@ packages: resolution: {integrity: sha512-8XPvpAA8uyhfteu8pIvQxpJZ7SYYdpUivZpGy6sFsBuKRY/7rQGavedeB8aK+Zkyq6upMFVL/9AW6vOYzfRyLg==} engines: {node: '>=10'} + loglevel@1.9.2: + resolution: {integrity: sha512-HgMmCqIJSAKqo68l0rS2AanEWfkxaZ5wNiEFb5ggm08lDs9Xl2KxBlX3PTcaD2chBM1gXAYf491/M2Rv8Jwayg==} + engines: {node: '>= 0.6.0'} + longest-streak@3.1.0: resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==} @@ -7861,6 +7911,13 @@ packages: scule@1.3.0: resolution: {integrity: sha512-6FtHJEvt+pVMIB9IBY+IcCJ6Z5f1iQnytgyfKMhDKgmzYG+TeH/wx1y3l27rshSbLiSanrR9ffZDrEsmjlQF2g==} + sdp-transform@2.15.0: + resolution: {integrity: sha512-KrOH82c/W+GYQ0LHqtr3caRpM3ITglq3ljGUIb8LTki7ByacJZ9z+piSGiwZDsRyhQbYBOBJgr2k6X4BZXi3Kw==} + hasBin: true + + sdp@3.2.1: + resolution: {integrity: sha512-lwsAIzOPlH8/7IIjjz3K0zYBk7aBVVcvjMwt3M4fLxpjMYyy7i3I97SLHebgn4YBjirkzfp3RvRDWSKsh/+WFw==} + semver@6.3.1: resolution: {integrity: sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==} hasBin: true @@ -8339,6 +8396,9 @@ packages: 
peerDependencies: typescript: '>=4.8.4' + ts-debounce@4.0.0: + resolution: {integrity: sha512-+1iDGY6NmOGidq7i7xZGA4cm8DAa6fqdYcvO5Z6yBevH++Bdo9Qt/mN0TzHUgcCcKv1gmh9+W5dHqz8pMWbCbg==} + ts-declaration-location@1.0.7: resolution: {integrity: sha512-EDyGAwH1gO0Ausm9gV6T2nUvBgXT5kGoCMJPllOaooZ+4VvJiKBdZE7wK18N1deEowhcUptS+5GXZK8U/fvpwA==} peerDependencies: @@ -8408,6 +8468,9 @@ packages: resolution: {integrity: sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==} engines: {node: '>= 0.6'} + typed-emitter@2.1.0: + resolution: {integrity: sha512-g/KzbYKbH5C2vPkaXGu8DJlHrGKHLsM25Zg9WuC9pMGfuvT+X25tZQWo5fK1BjBm8+UrVE9LDCvaY0CQk+fXDA==} + typedoc-plugin-frontmatter@1.3.0: resolution: {integrity: sha512-xYQFMAecMlsRUjmf9oM/Sq2FVz4zlgcbIeVFNLdO118CHTN06gIKJNSlyExh9+Xl8sK0YhIvoQwViUURxritWA==} peerDependencies: @@ -9105,6 +9168,10 @@ packages: webpack-virtual-modules@0.6.2: resolution: {integrity: sha512-66/V2i5hQanC51vBQKPH4aI8NMAcBW59FVBs+rC7eGHupMyfn34q7rZIE+ETlJ+XTevqfUhVVBgSUNSW2flEUQ==} + webrtc-adapter@9.0.3: + resolution: {integrity: sha512-5fALBcroIl31OeXAdd1YUntxiZl1eHlZZWzNg3U4Fn+J9/cGL3eT80YlrsWGvj2ojuz1rZr2OXkgCzIxAZ7vRQ==} + engines: {node: '>=6.0.0', npm: '>=3.10.0'} + whatwg-encoding@3.1.1: resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==} engines: {node: '>=18'} @@ -9259,6 +9326,12 @@ packages: snapshots: + '@11labs/client@0.2.0(@types/dom-mediacapture-record@1.0.22)': + dependencies: + livekit-client: 2.17.2(@types/dom-mediacapture-record@1.0.22) + transitivePeerDependencies: + - '@types/dom-mediacapture-record' + '@acemir/cssom@0.9.29': {} '@alcyone-labs/zod-to-json-schema@4.0.10(zod@4.2.1)': @@ -9508,6 +9581,8 @@ snapshots: '@bcoe/v8-coverage@1.0.2': {} + '@bufbuild/protobuf@1.10.1': {} + '@changesets/apply-release-plan@7.0.14': dependencies: '@changesets/config': 3.1.2 @@ -10180,6 +10255,12 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 
'@jridgewell/sourcemap-codec': 1.5.5 + '@livekit/mutex@1.1.1': {} + + '@livekit/protocol@1.44.0': + dependencies: + '@bufbuild/protobuf': 1.10.1 + '@manypkg/find-root@1.1.0': dependencies: '@babel/runtime': 7.28.4 @@ -12648,6 +12729,8 @@ snapshots: '@types/deep-eql@4.0.2': {} + '@types/dom-mediacapture-record@1.0.22': {} + '@types/estree-jsx@1.0.5': dependencies: '@types/estree': 1.0.8 @@ -15291,6 +15374,8 @@ snapshots: jju@1.4.0: {} + jose@6.1.3: {} + js-beautify@1.15.4: dependencies: config-chain: 1.1.13 @@ -15510,6 +15595,20 @@ snapshots: untun: 0.1.3 uqr: 0.1.2 + livekit-client@2.17.2(@types/dom-mediacapture-record@1.0.22): + dependencies: + '@livekit/mutex': 1.1.1 + '@livekit/protocol': 1.44.0 + '@types/dom-mediacapture-record': 1.0.22 + events: 3.3.0 + jose: 6.1.3 + loglevel: 1.9.2 + sdp-transform: 2.15.0 + ts-debounce: 4.0.0 + tslib: 2.8.1 + typed-emitter: 2.1.0 + webrtc-adapter: 9.0.3 + local-pkg@0.5.1: dependencies: mlly: 1.8.0 @@ -15546,6 +15645,8 @@ snapshots: chalk: 4.1.2 is-unicode-supported: 0.1.0 + loglevel@1.9.2: {} + longest-streak@3.1.0: {} lowlight@3.3.0: @@ -17166,6 +17267,10 @@ snapshots: scule@1.3.0: {} + sdp-transform@2.15.0: {} + + sdp@3.2.1: {} + semver@6.3.1: {} semver@7.5.4: @@ -17667,6 +17772,8 @@ snapshots: dependencies: typescript: 5.9.3 + ts-debounce@4.0.0: {} + ts-declaration-location@1.0.7(typescript@5.9.3): dependencies: picomatch: 4.0.3 @@ -17734,6 +17841,10 @@ snapshots: media-typer: 1.1.0 mime-types: 3.0.2 + typed-emitter@2.1.0: + optionalDependencies: + rxjs: 7.8.2 + typedoc-plugin-frontmatter@1.3.0(typedoc-plugin-markdown@4.9.0(typedoc@0.28.14(typescript@5.9.3))): dependencies: typedoc-plugin-markdown: 4.9.0(typedoc@0.28.14(typescript@5.9.3)) @@ -18509,6 +18620,10 @@ snapshots: webpack-virtual-modules@0.6.2: {} + webrtc-adapter@9.0.3: + dependencies: + sdp: 3.2.1 + whatwg-encoding@3.1.1: dependencies: iconv-lite: 0.6.3 From aa48ba285f9db6b511a9d54c67d9643572829356 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" 
<114827586+autofix-ci[bot]@users.noreply.github.com> Date: Thu, 19 Feb 2026 18:21:02 +0000 Subject: [PATCH 5/5] ci: apply automated fixes --- .../ts-react-chat/src/lib/realtime-tools.ts | 146 +++++++++++------- .../src/routes/api.realtime-token.ts | 13 +- .../ts-react-chat/src/routes/realtime.tsx | 40 ++--- .../ai-client/src/realtime-client.ts | 51 +++--- packages/typescript/ai-elevenlabs/README.md | 2 +- .../typescript/ai-elevenlabs/src/index.ts | 5 +- .../ai-elevenlabs/src/realtime/adapter.ts | 4 +- packages/typescript/ai-openai/src/index.ts | 5 +- .../ai-openai/src/realtime/adapter.ts | 35 +++-- .../ai-openai/src/realtime/token.ts | 11 +- .../ai-react/src/use-realtime-chat.ts | 35 ++++- 11 files changed, 208 insertions(+), 139 deletions(-) diff --git a/examples/ts-react-chat/src/lib/realtime-tools.ts b/examples/ts-react-chat/src/lib/realtime-tools.ts index 2cf28d7bd..e19c52226 100644 --- a/examples/ts-react-chat/src/lib/realtime-tools.ts +++ b/examples/ts-react-chat/src/lib/realtime-tools.ts @@ -4,9 +4,13 @@ import { z } from 'zod' // Tool to get current time - useful for voice assistants export const getCurrentTimeToolDef = toolDefinition({ name: 'getCurrentTime', - description: 'Get the current date and time. Use this when the user asks what time it is or the current date.', + description: + 'Get the current date and time. Use this when the user asks what time it is or the current date.', inputSchema: z.object({ - timezone: z.string().optional().describe('Optional timezone like "America/New_York" or "Europe/London"'), + timezone: z + .string() + .optional() + .describe('Optional timezone like "America/New_York" or "Europe/London"'), }), outputSchema: z.object({ time: z.string(), @@ -18,9 +22,14 @@ export const getCurrentTimeToolDef = toolDefinition({ // Tool to get weather - common voice assistant use case export const getWeatherToolDef = toolDefinition({ name: 'getWeather', - description: 'Get the current weather for a location. 
Use this when the user asks about the weather.', + description: + 'Get the current weather for a location. Use this when the user asks about the weather.', inputSchema: z.object({ - location: z.string().describe('The city and state/country, e.g. "San Francisco, CA" or "London, UK"'), + location: z + .string() + .describe( + 'The city and state/country, e.g. "San Francisco, CA" or "London, UK"', + ), }), outputSchema: z.object({ location: z.string(), @@ -34,7 +43,8 @@ export const getWeatherToolDef = toolDefinition({ // Tool to set a reminder - demonstrates user interaction export const setReminderToolDef = toolDefinition({ name: 'setReminder', - description: 'Set a reminder for the user. Use this when the user asks to be reminded about something.', + description: + 'Set a reminder for the user. Use this when the user asks to be reminded about something.', inputSchema: z.object({ message: z.string().describe('What to remind the user about'), inMinutes: z.number().describe('How many minutes from now to remind'), @@ -49,44 +59,50 @@ export const setReminderToolDef = toolDefinition({ // Tool to search knowledge base - useful for assistants with specific knowledge export const searchKnowledgeToolDef = toolDefinition({ name: 'searchKnowledge', - description: 'Search a knowledge base for information. Use this to find specific facts or documentation.', + description: + 'Search a knowledge base for information. 
Use this to find specific facts or documentation.', inputSchema: z.object({ query: z.string().describe('The search query'), }), outputSchema: z.object({ - results: z.array(z.object({ - title: z.string(), - snippet: z.string(), - })), + results: z.array( + z.object({ + title: z.string(), + snippet: z.string(), + }), + ), }), }) // Client-side implementation of getCurrentTime -export const getCurrentTimeClient = getCurrentTimeToolDef.client(({ timezone }) => { - const now = new Date() - const tz = timezone || Intl.DateTimeFormat().resolvedOptions().timeZone - - return { - time: now.toLocaleTimeString('en-US', { timeZone: tz }), - date: now.toLocaleDateString('en-US', { - weekday: 'long', - year: 'numeric', - month: 'long', - day: 'numeric', - timeZone: tz, - }), - timezone: tz, - } -}) +export const getCurrentTimeClient = getCurrentTimeToolDef.client( + ({ timezone }) => { + const now = new Date() + const tz = timezone || Intl.DateTimeFormat().resolvedOptions().timeZone + + return { + time: now.toLocaleTimeString('en-US', { timeZone: tz }), + date: now.toLocaleDateString('en-US', { + weekday: 'long', + year: 'numeric', + month: 'long', + day: 'numeric', + timeZone: tz, + }), + timezone: tz, + } + }, +) // Client-side implementation of getWeather (mock data for demo) export const getWeatherClient = getWeatherToolDef.client(({ location }) => { // Mock weather data for demo purposes const conditions = ['Sunny', 'Partly Cloudy', 'Cloudy', 'Rainy', 'Snowy'] - const randomCondition = conditions[Math.floor(Math.random() * conditions.length)]! + const randomCondition = + conditions[Math.floor(Math.random() * conditions.length)]! 
const randomTemp = Math.floor(Math.random() * 30) + 50 // 50-80°F const randomHumidity = Math.floor(Math.random() * 50) + 30 // 30-80% - + return { location, temperature: randomTemp, @@ -97,40 +113,50 @@ export const getWeatherClient = getWeatherToolDef.client(({ location }) => { }) // Client-side implementation of setReminder -export const setReminderClient = setReminderToolDef.client(({ message, inMinutes }) => { - const remindAt = new Date(Date.now() + inMinutes * 60 * 1000) - - // In a real app, you'd schedule a notification here - console.log(`[Reminder] Will remind about "${message}" at ${remindAt.toLocaleTimeString()}`) - - // For demo purposes, show an alert after the specified time - setTimeout(() => { - alert(`Reminder: ${message}`) - }, inMinutes * 60 * 1000) - - return { - success: true, - message: `Reminder set: "${message}"`, - remindAt: remindAt.toLocaleTimeString(), - } -}) +export const setReminderClient = setReminderToolDef.client( + ({ message, inMinutes }) => { + const remindAt = new Date(Date.now() + inMinutes * 60 * 1000) + + // In a real app, you'd schedule a notification here + console.log( + `[Reminder] Will remind about "${message}" at ${remindAt.toLocaleTimeString()}`, + ) + + // For demo purposes, show an alert after the specified time + setTimeout( + () => { + alert(`Reminder: ${message}`) + }, + inMinutes * 60 * 1000, + ) + + return { + success: true, + message: `Reminder set: "${message}"`, + remindAt: remindAt.toLocaleTimeString(), + } + }, +) // Client-side implementation of searchKnowledge (mock data for demo) -export const searchKnowledgeClient = searchKnowledgeToolDef.client(({ query }) => { - // Mock search results for demo - const mockResults = [ - { - title: `Result for: ${query}`, - snippet: `This is a mock search result for the query "${query}". 
In a real application, this would return actual search results from a knowledge base.`, - }, - { - title: 'Additional Information', - snippet: 'More relevant information would appear here based on your search query.', - }, - ] - - return { results: mockResults } -}) +export const searchKnowledgeClient = searchKnowledgeToolDef.client( + ({ query }) => { + // Mock search results for demo + const mockResults = [ + { + title: `Result for: ${query}`, + snippet: `This is a mock search result for the query "${query}". In a real application, this would return actual search results from a knowledge base.`, + }, + { + title: 'Additional Information', + snippet: + 'More relevant information would appear here based on your search query.', + }, + ] + + return { results: mockResults } + }, +) // Export all client tools as an array for easy use export const realtimeClientTools = [ diff --git a/examples/ts-react-chat/src/routes/api.realtime-token.ts b/examples/ts-react-chat/src/routes/api.realtime-token.ts index c9367ad19..807623ecf 100644 --- a/examples/ts-react-chat/src/routes/api.realtime-token.ts +++ b/examples/ts-react-chat/src/routes/api.realtime-token.ts @@ -13,9 +13,13 @@ import * as z from 'zod' type Provider = 'openai' | 'elevenlabs' // Convert tool definitions to OpenAI's format using Zod's native toJSONSchema -function toolDefToOpenAI(toolDef: { name: string; description: string; inputSchema?: unknown }) { +function toolDefToOpenAI(toolDef: { + name: string + description: string + inputSchema?: unknown +}) { let parameters: Record = { type: 'object', properties: {} } - + if (toolDef.inputSchema) { // Use Zod's native toJSONSchema for Zod v4+ const jsonSchema = z.toJSONSchema(toolDef.inputSchema as z.ZodType) @@ -23,7 +27,7 @@ function toolDefToOpenAI(toolDef: { name: string; description: string; inputSche const { $schema, ...rest } = jsonSchema as Record parameters = rest } - + return { type: 'function' as const, name: toolDef.name, @@ -85,7 +89,8 @@ Be friendly and 
engaging!`, if (!agentId) { return new Response( JSON.stringify({ - error: 'ElevenLabs agent ID is required. Set ELEVENLABS_AGENT_ID or pass agentId in request body.', + error: + 'ElevenLabs agent ID is required. Set ELEVENLABS_AGENT_ID or pass agentId in request body.', }), { status: 400, diff --git a/examples/ts-react-chat/src/routes/realtime.tsx b/examples/ts-react-chat/src/routes/realtime.tsx index 4be974a8f..107b839e5 100644 --- a/examples/ts-react-chat/src/routes/realtime.tsx +++ b/examples/ts-react-chat/src/routes/realtime.tsx @@ -14,11 +14,11 @@ const PROVIDER_OPTIONS: Array<{ value: Provider; label: string }> = [ ] // Sparkline component to visualize audio waveform -function AudioSparkline({ - getData, +function AudioSparkline({ + getData, color, label, -}: { +}: { getData: () => Uint8Array color: string label: string @@ -49,20 +49,20 @@ function AudioSparkline({ // Sample the data to fit the canvas width const step = Math.max(1, Math.floor(data.length / width)) - + for (let i = 0; i < width; i++) { const dataIndex = Math.min(i * step, data.length - 1) const value = data[dataIndex] ?? 128 // Convert 0-255 to canvas height (128 is center/silence) - const y = height - ((value / 255) * height) - + const y = height - (value / 255) * height + if (i === 0) { ctx!.moveTo(i, y) } else { ctx!.lineTo(i, y) } } - + ctx!.stroke() // Draw center line (silence level) @@ -89,10 +89,10 @@ function AudioSparkline({ return (
{label} -
@@ -105,7 +105,8 @@ function RealtimePage() { const messagesEndRef = useRef(null) // Get the appropriate adapter based on provider - const adapter = provider === 'openai' ? openaiRealtime() : elevenlabsRealtime() + const adapter = + provider === 'openai' ? openaiRealtime() : elevenlabsRealtime() const { status, @@ -284,7 +285,8 @@ function RealtimePage() { Click "Start Conversation" to begin talking with the AI

- Try asking: "What time is it?" or "What's the weather in San Francisco?" + Try asking: "What time is it?" or "What's the weather in San + Francisco?"

)} @@ -388,9 +390,9 @@ function RealtimePage() { {Math.round(inputLevel * 100)}% -
@@ -406,9 +408,9 @@ function RealtimePage() { {Math.round(outputLevel * 100)}% - diff --git a/packages/typescript/ai-client/src/realtime-client.ts b/packages/typescript/ai-client/src/realtime-client.ts index f49d4d25e..170a4d585 100644 --- a/packages/typescript/ai-client/src/realtime-client.ts +++ b/packages/typescript/ai-client/src/realtime-client.ts @@ -61,7 +61,7 @@ export class RealtimeClient { constructor(options: RealtimeClientOptions) { this.instanceId = ++clientIdCounter console.log(`[RealtimeClient #${this.instanceId}] Created`) - + this.options = { autoPlayback: true, autoCapture: true, @@ -102,7 +102,10 @@ export class RealtimeClient { // Connect via adapter this.connection = await this.options.adapter.connect(this.token) - console.log(`[RealtimeClient #${this.instanceId}] Connection established:`, !!this.connection) + console.log( + `[RealtimeClient #${this.instanceId}] Connection established:`, + !!this.connection, + ) // Subscribe to connection events this.subscribeToConnectionEvents() @@ -251,7 +254,10 @@ export class RealtimeClient { /** Get audio visualization data */ get audio(): AudioVisualization | null { - console.log(`[RealtimeClient #${this.instanceId}] audio getter, connection:`, !!this.connection) + console.log( + `[RealtimeClient #${this.instanceId}] audio getter, connection:`, + !!this.connection, + ) return this.connection?.getAudioVisualization() ?? null } @@ -381,25 +387,28 @@ export class RealtimeClient { // Tool calls this.unsubscribers.push( - this.connection.on('tool_call', async ({ toolCallId, toolName, input }) => { - const tool = this.clientTools.get(toolName) - if (tool?.execute) { - try { - const output = await tool.execute(input) - this.connection?.sendToolResult( - toolCallId, - typeof output === 'string' ? output : JSON.stringify(output), - ) - } catch (error) { - const errMsg = - error instanceof Error ? 
error.message : String(error) - this.connection?.sendToolResult( - toolCallId, - JSON.stringify({ error: errMsg }), - ) + this.connection.on( + 'tool_call', + async ({ toolCallId, toolName, input }) => { + const tool = this.clientTools.get(toolName) + if (tool?.execute) { + try { + const output = await tool.execute(input) + this.connection?.sendToolResult( + toolCallId, + typeof output === 'string' ? output : JSON.stringify(output), + ) + } catch (error) { + const errMsg = + error instanceof Error ? error.message : String(error) + this.connection?.sendToolResult( + toolCallId, + JSON.stringify({ error: errMsg }), + ) + } } - } - }), + }, + ), ) // Message complete diff --git a/packages/typescript/ai-elevenlabs/README.md b/packages/typescript/ai-elevenlabs/README.md index 6f85bb8ed..71b0d979b 100644 --- a/packages/typescript/ai-elevenlabs/README.md +++ b/packages/typescript/ai-elevenlabs/README.md @@ -31,7 +31,7 @@ import { RealtimeClient } from '@tanstack/ai-client' import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' const client = new RealtimeClient({ - getToken: () => fetch('/api/realtime-token').then(r => r.json()), + getToken: () => fetch('/api/realtime-token').then((r) => r.json()), adapter: elevenlabsRealtime(), }) diff --git a/packages/typescript/ai-elevenlabs/src/index.ts b/packages/typescript/ai-elevenlabs/src/index.ts index 14702a1da..8f3789e84 100644 --- a/packages/typescript/ai-elevenlabs/src/index.ts +++ b/packages/typescript/ai-elevenlabs/src/index.ts @@ -2,10 +2,7 @@ // ElevenLabs Realtime (Voice) Adapters // ============================================================================ -export { - elevenlabsRealtimeToken, - elevenlabsRealtime, -} from './realtime/index' +export { elevenlabsRealtimeToken, elevenlabsRealtime } from './realtime/index' export type { ElevenLabsRealtimeTokenOptions, diff --git a/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts index 
39ee355bd..dfe33d318 100644 --- a/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts +++ b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts @@ -51,7 +51,9 @@ async function createElevenLabsConnection( _options: ElevenLabsRealtimeOptions, ): Promise { const eventHandlers = new Map>>() - let conversation: Awaited> | null = null + let conversation: Awaited< + ReturnType + > | null = null let messageIdCounter = 0 // Empty arrays for when visualization isn't available diff --git a/packages/typescript/ai-openai/src/index.ts b/packages/typescript/ai-openai/src/index.ts index 2b7916a37..afadc4529 100644 --- a/packages/typescript/ai-openai/src/index.ts +++ b/packages/typescript/ai-openai/src/index.ts @@ -105,10 +105,7 @@ export type { OpenAIClientConfig } from './utils/client' // Realtime (Voice) Adapters // ============================================================================ -export { - openaiRealtimeToken, - openaiRealtime, -} from './realtime/index' +export { openaiRealtimeToken, openaiRealtime } from './realtime/index' export type { OpenAIRealtimeVoice, diff --git a/packages/typescript/ai-openai/src/realtime/adapter.ts b/packages/typescript/ai-openai/src/realtime/adapter.ts index 6245a3c05..e25a7dbd3 100644 --- a/packages/typescript/ai-openai/src/realtime/adapter.ts +++ b/packages/typescript/ai-openai/src/realtime/adapter.ts @@ -8,10 +8,7 @@ import type { RealtimeStatus, RealtimeToken, } from '@tanstack/ai' -import type { - RealtimeAdapter, - RealtimeConnection, -} from '@tanstack/ai-client' +import type { RealtimeAdapter, RealtimeConnection } from '@tanstack/ai-client' import type { OpenAIRealtimeOptions } from './types' const OPENAI_REALTIME_URL = 'https://api.openai.com/v1/realtime' @@ -175,7 +172,10 @@ async function createWebRTCConnection( await pc.setRemoteDescription({ type: 'answer', sdp: answerSdp }) // Set up input audio analysis now that we have the stream - console.log('[Realtime] Setting up input audio analysis, localStream:', 
localStream) + console.log( + '[Realtime] Setting up input audio analysis, localStream:', + localStream, + ) setupInputAudioAnalysis(localStream) console.log('[Realtime] Input analyser created:', inputAnalyser) @@ -224,7 +224,11 @@ async function createWebRTCConnection( case 'response.audio_transcript.delta': { const delta = event.delta as string - emit('transcript', { role: 'assistant', transcript: delta, isFinal: false }) + emit('transcript', { + role: 'assistant', + transcript: delta, + isFinal: false, + }) break } @@ -264,8 +268,10 @@ async function createWebRTCConnection( // Emit message complete if we have a current message if (currentMessageId) { const response = event.response as Record - const output = response.output as Array> | undefined - + const output = response.output as + | Array> + | undefined + const message: RealtimeMessage = { id: currentMessageId, role: 'assistant', @@ -511,14 +517,19 @@ async function createWebRTCConnection( getAudioVisualization(): AudioVisualization { // Log analyser state for debugging - console.log('[Realtime] getAudioVisualization called, inputAnalyser:', !!inputAnalyser, 'outputAnalyser:', !!outputAnalyser) - + console.log( + '[Realtime] getAudioVisualization called, inputAnalyser:', + !!inputAnalyser, + 'outputAnalyser:', + !!outputAnalyser, + ) + // Helper to calculate audio level from time domain data // Uses peak amplitude which is more responsive for voice audio meters function calculateLevel(analyser: AnalyserNode): number { const data = new Uint8Array(analyser.fftSize) analyser.getByteTimeDomainData(data) - + // Find peak deviation from center (128 is silence) // This is more responsive than RMS for voice level meters let maxDeviation = 0 @@ -528,7 +539,7 @@ async function createWebRTCConnection( maxDeviation = deviation } } - + // Normalize to 0-1 range (max deviation is 128) // Scale by 1.5x so that ~66% amplitude reads as full scale // This provides good visual feedback without pegging too early diff --git 
a/packages/typescript/ai-openai/src/realtime/token.ts b/packages/typescript/ai-openai/src/realtime/token.ts index e09d8c766..832adaa79 100644 --- a/packages/typescript/ai-openai/src/realtime/token.ts +++ b/packages/typescript/ai-openai/src/realtime/token.ts @@ -126,11 +126,12 @@ export function openaiRealtimeToken( voice: sessionData.voice, instructions: sessionData.instructions, tools, - vadMode: sessionData.turn_detection?.type === 'semantic_vad' - ? 'semantic' - : sessionData.turn_detection?.type === 'server_vad' - ? 'server' - : 'manual', + vadMode: + sessionData.turn_detection?.type === 'semantic_vad' + ? 'semantic' + : sessionData.turn_detection?.type === 'server_vad' + ? 'server' + : 'manual', vadConfig: sessionData.turn_detection ? { threshold: sessionData.turn_detection.threshold, diff --git a/packages/typescript/ai-react/src/use-realtime-chat.ts b/packages/typescript/ai-react/src/use-realtime-chat.ts index 3434db607..9d3f0af41 100644 --- a/packages/typescript/ai-react/src/use-realtime-chat.ts +++ b/packages/typescript/ai-react/src/use-realtime-chat.ts @@ -1,7 +1,14 @@ import { useCallback, useEffect, useRef, useState } from 'react' import { RealtimeClient } from '@tanstack/ai-client' -import type { RealtimeMessage, RealtimeMode, RealtimeStatus } from '@tanstack/ai' -import type { UseRealtimeChatOptions, UseRealtimeChatReturn } from './realtime-types' +import type { + RealtimeMessage, + RealtimeMode, + RealtimeStatus, +} from '@tanstack/ai' +import type { + UseRealtimeChatOptions, + UseRealtimeChatReturn, +} from './realtime-types' // Empty frequency data for when client is not connected const emptyFrequencyData = new Uint8Array(128) @@ -54,8 +61,12 @@ export function useRealtimeChat( const [status, setStatus] = useState('idle') const [mode, setMode] = useState('idle') const [messages, setMessages] = useState>([]) - const [pendingUserTranscript, setPendingUserTranscript] = useState(null) - const [pendingAssistantTranscript, setPendingAssistantTranscript] = 
useState(null) + const [pendingUserTranscript, setPendingUserTranscript] = useState< + string | null + >(null) + const [pendingAssistantTranscript, setPendingAssistantTranscript] = useState< + string | null + >(null) const [error, setError] = useState(null) const [inputLevel, setInputLevel] = useState(0) const [outputLevel, setOutputLevel] = useState(0) @@ -181,19 +192,27 @@ export function useRealtimeChat( // Audio visualization const getInputFrequencyData = useCallback(() => { - return clientRef.current?.audio?.getInputFrequencyData() ?? emptyFrequencyData + return ( + clientRef.current?.audio?.getInputFrequencyData() ?? emptyFrequencyData + ) }, []) const getOutputFrequencyData = useCallback(() => { - return clientRef.current?.audio?.getOutputFrequencyData() ?? emptyFrequencyData + return ( + clientRef.current?.audio?.getOutputFrequencyData() ?? emptyFrequencyData + ) }, []) const getInputTimeDomainData = useCallback(() => { - return clientRef.current?.audio?.getInputTimeDomainData() ?? emptyTimeDomainData + return ( + clientRef.current?.audio?.getInputTimeDomainData() ?? emptyTimeDomainData + ) }, []) const getOutputTimeDomainData = useCallback(() => { - return clientRef.current?.audio?.getOutputTimeDomainData() ?? emptyTimeDomainData + return ( + clientRef.current?.audio?.getOutputTimeDomainData() ?? emptyTimeDomainData + ) }, []) // VAD mode control