131 changes: 131 additions & 0 deletions README.md
@@ -32,6 +32,7 @@ This repository contains Swift community-maintained implementation over [OpenAI]
- [Audio Create Speech](#audio-create-speech)
- [Audio Transcriptions](#audio-transcriptions)
- [Audio Translations](#audio-translations)
- [Audio Chat](#audio-chat-gpt-4o-audio-preview)
- [Structured Outputs](#structured-outputs)
- [Specialized models](#specialized-models)
- [Embeddings](#embeddings)
@@ -735,6 +736,136 @@ openAI.audioTranslations(query: query) { result in
let result = try await openAI.audioTranslations(query: query)
```

### Audio Chat (gpt-4o-audio-preview)

The Audio Chat API enables audio-to-audio conversations with the GPT-4o Audio models. It replaces the traditional STT → Chat → TTS pipeline with a single API call, typically yielding 2–3× faster responses and more natural-sounding voice output than chaining three separate requests.

**Supported Models:** `gpt-4o-audio-preview`, `gpt-4o-mini-audio-preview`

**Important Format Requirements:**
- **Input audio formats:** only `wav` and `mp3` are supported (a small validation helper is sketched after this list)
- **Output audio formats:** `wav`, `mp3`, `flac`, `opus`, `pcm16`
- **Recommended for streaming:** request `pcm16` output — raw PCM chunks can be played as soon as they arrive, without waiting for a complete file
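
A small helper can enforce the input restriction before a request is built. The sketch below is not part of the SDK — `SupportedInputAudio` and `base64EncodedAudio(at:)` are illustrative names only:

```swift
import Foundation

// Minimal sketch (not SDK API): only wav and mp3 are accepted as input,
// so reject anything else before base64-encoding the recording.
enum SupportedInputAudio: String {
    case wav, mp3
}

func base64EncodedAudio(at url: URL) throws -> (base64: String, format: SupportedInputAudio) {
    guard let format = SupportedInputAudio(rawValue: url.pathExtension.lowercased()) else {
        throw NSError(
            domain: "AudioChat",
            code: 1,
            userInfo: [NSLocalizedDescriptionKey: "Only wav and mp3 input audio is supported"]
        )
    }
    let data = try Data(contentsOf: url)
    return (data.base64EncodedString(), format)
}
```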

**Request:**

```swift
public struct AudioChatQuery: Codable, Equatable, Streamable, Sendable {
    public let model: Model
    public let messages: [Message]
    public let modalities: [Modality]?  // [.text, .audio]
    public let audio: AudioConfig?
    public var stream: Bool

    public struct AudioConfig {
        public let voice: Voice          // .alloy, .echo, .fable, .onyx, .nova, .shimmer
        public let format: AudioFormat   // .wav, .mp3, .flac, .opus, .pcm16
    }

    public struct Message {
        public let role: ChatQuery.ChatCompletionMessageParam.Role
        public let content: Content      // .text(String) or .parts([ContentPart])
    }
}

public enum Modality: String, Codable, Sendable {
    case text
    case audio
}
```

**Response:**

```swift
public struct AudioChatResult: Codable, Equatable, Sendable {
    public let id: String
    public let choices: [Choice]

    public struct Choice {
        public let message: Message
    }

    public struct Message {
        public let content: String?     // Text content (nil when the reply is audio-only)
        public let audio: AudioOutput?  // Audio reply: base64 data plus its transcript
    }

    public struct AudioOutput {
        public let data: String         // Base64-encoded audio
        public let transcript: String   // Text transcript of the spoken reply
    }
}
```

**Example (Non-Streaming):**

```swift
let audioData = try Data(contentsOf: audioFileURL)
let base64Audio = audioData.base64EncodedString()

let query = AudioChatQuery(
    model: .gpt_4o_audio_preview,
    messages: [
        .init(role: .system, content: .text("You are a helpful voice assistant.")),
        .init(role: .user, content: .parts([
            .init(inputAudio: .init(data: base64Audio, format: .wav))
        ]))
    ],
    modalities: [.text, .audio],
    audio: .init(voice: .alloy, format: .pcm16)
)

let result = try await openAI.audioChats(query: query)
if let audioOutput = result.choices.first?.message.audio,
   let replyAudio = Data(base64Encoded: audioOutput.data) {
    // Use replyAudio and audioOutput.transcript
}
```
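
If the query requests a container format such as `.wav` or `.mp3` instead of the raw `.pcm16` shown above, the decoded bytes form a complete audio file and can be handed straight to `AVAudioPlayer`. A minimal sketch (app-side code, not part of the SDK):

```swift
import AVFoundation

// Sketch: play a complete (container-format) audio reply. Raw .pcm16 output
// has no header and needs the chunked AVAudioEngine approach sketched in the
// streaming section below instead.
func play(responseAudio: Data) throws -> AVAudioPlayer {
    let player = try AVAudioPlayer(data: responseAudio)
    player.prepareToPlay()
    player.play()
    return player   // keep a strong reference for as long as playback should continue
}
```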

**Example (Streaming):**

```swift
for try await chunk in openAI.audioChatsStream(query: query) {
    if let audioDelta = chunk.choices.first?.delta.audio?.data,
       let audioChunk = Data(base64Encoded: audioDelta) {
        // Play the decoded audio chunk in real time
    }
}
```
```
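
With `.pcm16` output the streamed chunks are raw samples with no container, so they must be wrapped in PCM buffers before playback. The sketch below is not part of the SDK; it assumes the raw stream is 24 kHz, 16-bit signed little-endian, mono — the format OpenAI documents for `pcm16` — so verify against the current docs before relying on it:

```swift
import AVFoundation

// Sketch (not SDK API): incremental playback of raw pcm16 chunks.
// Assumes 24 kHz, 16-bit signed little-endian, mono samples.
final class PCM16StreamPlayer {
    private let engine = AVAudioEngine()
    private let player = AVAudioPlayerNode()
    private let format = AVAudioFormat(standardFormatWithSampleRate: 24_000, channels: 1)!

    init() throws {
        engine.attach(player)
        engine.connect(player, to: engine.mainMixerNode, format: format)
        try engine.start()
        player.play()
    }

    func enqueue(_ chunk: Data) {
        let sampleCount = chunk.count / MemoryLayout<Int16>.size
        guard sampleCount > 0,
              let buffer = AVAudioPCMBuffer(pcmFormat: format,
                                            frameCapacity: AVAudioFrameCount(sampleCount)) else { return }
        buffer.frameLength = AVAudioFrameCount(sampleCount)
        // Convert Int16 samples into the engine's Float32 format before scheduling.
        chunk.withUnsafeBytes { (raw: UnsafeRawBufferPointer) in
            let samples = raw.bindMemory(to: Int16.self)
            let channel = buffer.floatChannelData![0]
            for i in 0..<sampleCount {
                channel[i] = Float(Int16(littleEndian: samples[i])) / Float(Int16.max)
            }
        }
        player.scheduleBuffer(buffer, completionHandler: nil)
    }
}
```

Each decoded `audioChunk` from the streaming loop above would simply be passed to `enqueue(_:)`.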

**AudioConversationManager Utility:**

The SDK includes a convenient `AudioConversationManager` actor for managing multi-turn conversations with automatic history tracking:

```swift
let manager = AudioConversationManager(
    openAI: openAI,
    systemPrompt: "You are a helpful voice assistant.",
    maxHistoryTurns: 10
)

// Send audio and get an audio response back
let (replyAudio, replyTranscript) = try await manager.sendAudio(
    audioData,
    audioFormat: .wav,
    voice: .alloy,
    responseFormat: .pcm16
)

// Send text and get an audio response back
let (spokenReply, spokenTranscript) = try await manager.sendText(
    "What's the weather like?",
    voice: .alloy,
    responseFormat: .pcm16
)

// Get the full conversation transcript (the manager is an actor, so calls are awaited)
let conversationTranscript = await manager.getTranscript()

// Reset the conversation history
await manager.reset()
```
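
A typical voice loop simply alternates between recording a turn and playing the reply. The sketch below is app-side code, not SDK API — the `record` and `play` closures stand in for whatever capture and playback you use (for example `AVAudioRecorder` and the players sketched earlier), and the `sendAudio` labels follow the manager API shown above:

```swift
import Foundation

// Sketch: a cancellable voice-conversation loop around AudioConversationManager.
// `record` returns one wav-encoded user turn; `play` renders the assistant's reply.
func runConversation(
    with manager: AudioConversationManager,
    record: () async throws -> Data,
    play: (Data) async throws -> Void
) async throws {
    while !Task.isCancelled {
        let turn = try await record()
        let (replyAudio, replyTranscript) = try await manager.sendAudio(
            turn,
            audioFormat: .wav,
            voice: .alloy,
            responseFormat: .wav
        )
        print("Assistant: \(replyTranscript)")
        try await play(replyAudio)
    }
}
```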

Review [Audio Documentation](https://platform.openai.com/docs/api-reference/audio) for more info.

## Structured Outputs
16 changes: 15 additions & 1 deletion Sources/OpenAI/OpenAI+OpenAIAsync.swift
@@ -105,7 +105,21 @@ extension OpenAI: OpenAIAsync {
request: makeAudioTranslationsRequest(query: query)
)
}


public func audioChats(query: AudioChatQuery) async throws -> AudioChatResult {
try await performRequestAsync(
request: makeAudioChatsRequest(query: query)
)
}

public func audioChatsStream(
query: AudioChatQuery
) -> AsyncThrowingStream<AudioChatStreamResult, Error> {
makeAsyncStream { onResult, completion in
audioChatsStream(query: query, onResult: onResult, completion: completion)
}
}

public func assistants() async throws -> AssistantsResult {
try await assistants(after: nil)
}
14 changes: 13 additions & 1 deletion Sources/OpenAI/OpenAI.swift
@@ -330,7 +330,19 @@ final public class OpenAI: OpenAIProtocol, @unchecked Sendable {
public func audioTranslations(query: AudioTranslationQuery, completion: @escaping @Sendable (Result<AudioTranslationResult, Error>) -> Void) -> CancellableRequest {
performRequest(request: makeAudioTranslationsRequest(query: query), completion: completion)
}


public func audioChats(query: AudioChatQuery, completion: @escaping @Sendable (Result<AudioChatResult, Error>) -> Void) -> CancellableRequest {
performRequest(request: makeAudioChatsRequest(query: query.makeNonStreamable()), completion: completion)
}

public func audioChatsStream(query: AudioChatQuery, onResult: @escaping @Sendable (Result<AudioChatStreamResult, Error>) -> Void, completion: (@Sendable (Error?) -> Void)?) -> CancellableRequest {
performStreamingRequest(
request: JSONRequest<AudioChatStreamResult>(body: query.makeStreamable(), url: buildURL(path: .chats)),
onResult: onResult,
completion: completion
)
}

public func audioCreateSpeech(query: AudioSpeechQuery, completion: @escaping @Sendable (Result<AudioSpeechResult, Error>) -> Void) -> CancellableRequest {
performSpeechRequest(request: makeAudioCreateSpeechRequest(query: query), completion: completion)
}
6 changes: 5 additions & 1 deletion Sources/OpenAI/Private/OpenAI+MakeRequest.swift
@@ -48,7 +48,11 @@ extension OpenAI {
func makeAudioTranslationsRequest(query: AudioTranslationQuery) -> MultipartFormDataRequest<AudioTranslationResult> {
.init(body: query, url: buildURL(path: .audioTranslations))
}


func makeAudioChatsRequest(query: AudioChatQuery) -> JSONRequest<AudioChatResult> {
.init(body: query, url: buildURL(path: .chats))
}

func makeAudioCreateSpeechRequest(query: AudioSpeechQuery) -> JSONRequest<AudioSpeechResult> {
.init(body: query, url: buildURL(path: .audioSpeech))
}