Skip to content

Commit 7d5804b

Browse files
committed
Add realtime models and Modality enum
- Add gpt-4o-realtime-preview and gpt-4o-mini-realtime-preview models
- Add dated variants: gpt-4o-realtime-preview-2024-12-17 and gpt-4o-mini-realtime-preview-2024-12-17
- Replace string-based modalities with type-safe Modality enum
- Update modalities parameter from [String] to [Modality]
- Update all usage examples and tests to use [.text, .audio] syntax
- Update AudioConversationManager to use new enum
- Update README documentation with Modality enum
1 parent 1e057d5 commit 7d5804b

File tree

6 files changed

+36
-11
lines changed

6 files changed

+36
-11
lines changed

README.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -753,7 +753,7 @@ The Audio Chat API enables audio-to-audio conversations with GPT-4o Audio models
753753
public struct AudioChatQuery: Codable, Equatable, Streamable, Sendable {
754754
public let model: Model
755755
public let messages: [Message]
756-
public let modalities: [String]? // ["text", "audio"]
756+
public let modalities: [Modality]? // [.text, .audio]
757757
public let audio: AudioConfig?
758758
public var stream: Bool
759759

@@ -767,6 +767,11 @@ public struct AudioChatQuery: Codable, Equatable, Streamable, Sendable {
767767
public let content: Content // .text(String) or .parts([ContentPart])
768768
}
769769
}
770+
771+
public enum Modality: String, Codable, Sendable {
772+
case text
773+
case audio
774+
}
770775
```
771776

772777
**Response:**
@@ -806,7 +811,7 @@ let query = AudioChatQuery(
806811
.init(inputAudio: .init(data: base64Audio, format: .wav))
807812
]))
808813
],
809-
modalities: ["text", "audio"],
814+
modalities: [.text, .audio],
810815
audio: .init(voice: .alloy, format: .pcm16)
811816
)
812817

Sources/OpenAI/Public/Models/AudioChatQuery.swift

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ public struct AudioChatQuery: Codable, Equatable, Streamable, Sendable {
2424
/// A list of messages comprising the conversation so far.
2525
public let messages: [Message]
2626

27-
/// Output types to enable for this request. Can include "text" and "audio".
28-
/// Defaults to ["text", "audio"]
29-
public let modalities: [String]?
27+
/// Output types to enable for this request. Can include text and audio.
28+
/// Defaults to [.text, .audio]
29+
public let modalities: [Modality]?
3030

3131
/// Configuration for audio output
3232
public let audio: AudioConfig?
@@ -59,7 +59,7 @@ public struct AudioChatQuery: Codable, Equatable, Streamable, Sendable {
5959
public init(
6060
model: Model,
6161
messages: [Message],
62-
modalities: [String]? = ["text", "audio"],
62+
modalities: [Modality]? = [.text, .audio],
6363
audio: AudioConfig? = nil,
6464
temperature: Double? = nil,
6565
maxTokens: Int? = nil,
@@ -236,3 +236,13 @@ public enum AudioFormat: String, Codable, Sendable {
236236
case opus
237237
case pcm16
238238
}
239+
240+
/// Output modality options for audio chat requests
241+
///
242+
/// Specifies which types of output the model should generate
243+
public enum Modality: String, Codable, Sendable {
244+
/// Text output
245+
case text
246+
/// Audio output
247+
case audio
248+
}

Sources/OpenAI/Public/Models/Models/Models.swift

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,19 @@ public extension Model {
117117
///
118118
/// This is a preview release of the GPT-4o Realtime model, capable of responding to audio and text inputs in realtime over WebRTC or a WebSocket interface.
119119
static let gpt_4o_realtime_preview = "gpt-4o-realtime-preview"
120-
120+
121+
/// GPT-4o Realtime (2024-12-17): Snapshot of gpt-4o-realtime-preview from December 17th 2024
122+
static let gpt_4o_realtime_preview_2024_12_17 = "gpt-4o-realtime-preview-2024-12-17"
123+
121124
/// GPT-4o mini Realtime: Smaller realtime model for text and audio inputs and outputs
122125
///
123126
/// `gpt-4o-mini-realtime-preview`
124127
///
125128
/// This is a preview release of the GPT-4o-mini Realtime model, capable of responding to audio and text inputs in realtime over WebRTC or a WebSocket interface.
126129
static let gpt_4o_mini_realtime_preview = "gpt-4o-mini-realtime-preview"
130+
131+
/// GPT-4o mini Realtime (2024-12-17): Snapshot of gpt-4o-mini-realtime-preview from December 17th 2024
132+
static let gpt_4o_mini_realtime_preview_2024_12_17 = "gpt-4o-mini-realtime-preview-2024-12-17"
127133

128134
// MARK: - Older GPT models
129135
// Supported older versions of our general purpose and chat models.

Sources/OpenAI/Public/Schemas/Generated/Components.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4288,6 +4288,10 @@ public enum Components {
42884288
case gpt4oAudioPreview20250603 = "gpt-4o-audio-preview-2025-06-03"
42894289
case gpt4oMiniAudioPreview = "gpt-4o-mini-audio-preview"
42904290
case gpt4oMiniAudioPreview20241217 = "gpt-4o-mini-audio-preview-2024-12-17"
4291+
case gpt4oRealtimePreview = "gpt-4o-realtime-preview"
4292+
case gpt4oRealtimePreview20241217 = "gpt-4o-realtime-preview-2024-12-17"
4293+
case gpt4oMiniRealtimePreview = "gpt-4o-mini-realtime-preview"
4294+
case gpt4oMiniRealtimePreview20241217 = "gpt-4o-mini-realtime-preview-2024-12-17"
42914295
case gpt4oSearchPreview = "gpt-4o-search-preview"
42924296
case gpt4oMiniSearchPreview = "gpt-4o-mini-search-preview"
42934297
case gpt4oSearchPreview20250311 = "gpt-4o-search-preview-2025-03-11"

Sources/OpenAI/Public/Utilities/AudioConversationManager.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ public actor AudioConversationManager {
8181
let query = AudioChatQuery(
8282
model: .gpt_4o_audio_preview,
8383
messages: conversationHistory,
84-
modalities: ["text", "audio"],
84+
modalities: [.text, .audio],
8585
audio: .init(voice: voice, format: responseFormat)
8686
)
8787

@@ -132,7 +132,7 @@ public actor AudioConversationManager {
132132
let query = AudioChatQuery(
133133
model: .gpt_4o_audio_preview,
134134
messages: conversationHistory,
135-
modalities: ["text", "audio"],
135+
modalities: [.text, .audio],
136136
audio: .init(voice: voice, format: responseFormat)
137137
)
138138

Tests/OpenAITests/AudioChatQueryCodingTests.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ struct AudioChatQueryCodingTests {
1717
.init(role: .system, content: .text("You are a helpful assistant")),
1818
.init(role: .user, content: .text("Hello"))
1919
],
20-
modalities: ["text", "audio"],
20+
modalities: [.text, .audio],
2121
audio: .init(voice: .alloy, format: .pcm16)
2222
)
2323

@@ -57,7 +57,7 @@ struct AudioChatQueryCodingTests {
5757
.init(inputAudio: .init(data: audioData, format: .wav))
5858
]))
5959
],
60-
modalities: ["text", "audio"],
60+
modalities: [.text, .audio],
6161
audio: .init(voice: .onyx, format: .pcm16)
6262
)
6363

0 commit comments

Comments
 (0)