Merge pull request tjardoo#45 from tjardoo/audio-stream

tjardoo · web-flow · commit 58c5b03a620c · 2024-01-16T16:09:11.000+01:00
Audio speech stream support
diff --git a/README.md b/README.md
@@ -425,6 +425,9 @@ Learn how to turn audio into text or text into audio.
 
 Generates audio from the input text.
 
+> [!NOTE]
+> This endpoint also has `stream` support. See the [examples/audio/create_speech_stream](https://github.com/tjardoo/openai-client/tree/master/examples/audio/create_speech_stream) example.
+
 ```rust
 use openai_dive::v1::api::Client;
 use openai_dive::v1::resources::audio::{
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
@@ -4,6 +4,7 @@ members = [
     "audio/create_transcription",
     "audio/create_translation",
     "audio/create_speech",
+    "audio/create_speech_stream",
     "chat/create_chat_completion",
     "chat/create_chat_completion_stream",
     "chat/create_image_chat_completion",
diff --git a/examples/audio/Cargo.toml b/examples/audio/Cargo.toml
@@ -1,3 +1,8 @@
 [workspace]
 resolver = "2"
-members = ["create_transcription", "create_translation", "create_speech"]
+members = [
+    "create_transcription",
+    "create_translation",
+    "create_speech",
+    "create_speech_stream",
+]
diff --git a/examples/audio/create_speech_stream/Cargo.toml b/examples/audio/create_speech_stream/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "create_speech_stream"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[dependencies]
+openai_dive = { path = "./../../../../openai-client", features = ["stream"] }
+tokio = { version = "1.0", features = ["full"] }
+futures = "0.3"
diff --git a/examples/audio/create_speech_stream/files/.gitignore b/examples/audio/create_speech_stream/files/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/examples/audio/create_speech_stream/src/main.rs b/examples/audio/create_speech_stream/src/main.rs
@@ -0,0 +1,47 @@
+use futures::future;
+use futures::stream::StreamExt;
+use openai_dive::v1::api::Client;
+use openai_dive::v1::resources::audio::{
+    AudioSpeechParameters, AudioSpeechResponseFormat, AudioVoice,
+};
+use std::env;
+use std::fs::File;
+use std::io::Write;
+
+/// Requests text-to-speech audio as a stream and appends every received chunk to
+/// `./files/example-stream.mp3`, printing the size of each chunk as it arrives.
+///
+/// # Panics
+///
+/// Panics if `OPENAI_API_KEY` is not set, if the output file cannot be created or
+/// written, or if `create_speech_stream` returns an error.
+#[tokio::main]
+async fn main() {
+    let api_key = env::var("OPENAI_API_KEY").expect("$OPENAI_API_KEY is not set");
+
+    let client = Client::new(api_key);
+
+    let parameters = AudioSpeechParameters {
+        model: "tts-1".to_string(),
+        input: "The quick brown fox jumped over the lazy dog.".to_string(),
+        voice: AudioVoice::Alloy,
+        response_format: Some(AudioSpeechResponseFormat::Mp3),
+        speed: Some(1.0),
+    };
+
+    let mut file = File::create("./files/example-stream.mp3").unwrap();
+
+    let stream = client
+        .audio()
+        .create_speech_stream(parameters)
+        .await
+        .unwrap();
+
+    stream
+        .for_each(|chunk| {
+            match chunk {
+                Ok(chunk) => {
+                    println!("Received chunk of {} bytes", chunk.bytes.len());
+
+                    // `write_all` loops until the whole chunk is written; a bare `write`
+                    // is allowed to write only part of the buffer, which would silently
+                    // corrupt the audio file.
+                    file.write_all(&chunk.bytes).unwrap();
+                }
+                Err(error) => println!("Stream error: {:?}", error),
+            }
+
+            future::ready(())
+        })
+        .await;
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -418,6 +418,9 @@
 //!
 //! Generates audio from the input text.
 //!
+//! > [!NOTE]
+//! > This endpoint also has `stream` support. See the [examples/audio/create_speech_stream](https://github.com/tjardoo/openai-client/tree/master/examples/audio/create_speech_stream) example.
+//!
 //! ```rust
 //! use openai_dive::v1::api::Client;
 //! use openai_dive::v1::resources::audio::{
diff --git a/src/v1/api.rs b/src/v1/api.rs
@@ -183,6 +183,30 @@ impl Client {
         Client::process_stream::<O>(event_source).await
     }
 
+    /// Sends `parameters` as the JSON body of a `POST` request to `path` and returns
+    /// the response body as a stream of raw byte chunks.
+    ///
+    /// A failure while reading an individual chunk is yielded as an
+    /// `APIError::StreamError` item of the returned stream.
+    ///
+    /// # Errors
+    ///
+    /// Returns `APIError::StreamError` if the request itself cannot be sent.
+    ///
+    /// NOTE(review): the HTTP status code is not inspected here, so the body of an
+    /// error response is streamed like any other body — confirm this is intended.
+    #[cfg(feature = "stream")]
+    pub async fn post_stream_raw<I>(
+        &self,
+        path: &str,
+        parameters: &I,
+    ) -> Result<Pin<Box<dyn Stream<Item = Result<Bytes, APIError>> + Send>>, APIError>
+    where
+        I: Serialize,
+    {
+        // Propagate send failures to the caller instead of panicking inside the library.
+        let response = self
+            .build_request(Method::POST, path)
+            .json(&parameters)
+            .send()
+            .await
+            .map_err(|error| APIError::StreamError(error.to_string()))?;
+
+        let stream = response
+            .bytes_stream()
+            .map(|item| item.map_err(|error| APIError::StreamError(error.to_string())));
+
+        // The declared return type drives the coercion to the boxed trait object.
+        Ok(Box::pin(stream))
+    }
+
     #[cfg(feature = "stream")]
     pub async fn process_stream<O>(
         mut event_soure: EventSource,
diff --git a/src/v1/endpoints/audio.rs b/src/v1/endpoints/audio.rs
@@ -3,7 +3,15 @@ use crate::v1::error::APIError;
 use crate::v1::helpers::file_from_disk_to_form_part;
 use crate::v1::resources::audio::AudioSpeechParameters;
 use crate::v1::resources::audio::AudioSpeechResponse;
+#[cfg(feature = "stream")]
+use crate::v1::resources::audio::AudioSpeechResponseChunkResponse;
 use crate::v1::resources::audio::{AudioTranscriptionParameters, AudioTranslationParameters};
+#[cfg(feature = "stream")]
+use futures::Stream;
+#[cfg(feature = "stream")]
+use futures::StreamExt;
+#[cfg(feature = "stream")]
+use std::pin::Pin;
 
 pub struct Audio<'a> {
     pub client: &'a Client,
@@ -94,4 +102,35 @@ impl Audio<'_> {
 
         Ok(response)
     }
+
+    /// Generates audio from the input text and returns it as a stream of chunks.
+    ///
+    /// The given parameters are copied into a `StreamAudioSpeechParameters` with
+    /// `stream` set to `true` and posted to `/audio/speech`. Every chunk of the
+    /// response body is wrapped in an `AudioSpeechResponseChunkResponse`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the `APIError` reported by `post_stream_raw` if the stream cannot be
+    /// opened. Errors on individual chunks are yielded as `Err` items of the stream.
+    #[cfg(feature = "stream")]
+    pub async fn create_speech_stream(
+        &self,
+        parameters: AudioSpeechParameters,
+    ) -> Result<
+        Pin<Box<dyn Stream<Item = Result<AudioSpeechResponseChunkResponse, APIError>> + Send>>,
+        APIError,
+    > {
+        use crate::v1::resources::audio::StreamAudioSpeechParameters;
+
+        let stream_parameters = StreamAudioSpeechParameters {
+            model: parameters.model,
+            input: parameters.input,
+            voice: parameters.voice,
+            response_format: parameters.response_format,
+            speed: parameters.speed,
+            stream: true,
+        };
+
+        // Hand a failed request back to the caller rather than unwrapping it here.
+        let stream = self
+            .client
+            .post_stream_raw("/audio/speech", &stream_parameters)
+            .await?
+            .map(|item| item.map(|bytes| AudioSpeechResponseChunkResponse { bytes }));
+
+        Ok(Box::pin(stream))
+    }
 }
diff --git a/src/v1/resources/audio.rs b/src/v1/resources/audio.rs
@@ -65,6 +65,30 @@ pub struct AudioSpeechResponse {
     pub bytes: Bytes,
 }
 
+/// Request body for the streaming variant of the speech endpoint: the same fields as
+/// `AudioSpeechParameters` plus a `stream` flag.
+#[cfg(feature = "stream")]
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
+pub struct StreamAudioSpeechParameters {
+    /// One of the available TTS models: tts-1 or tts-1-hd.
+    pub model: String,
+    /// The text to generate audio for. The maximum length is 4096 characters.
+    pub input: String,
+    /// The voice to use when generating the audio. Supported voices are alloy, echo, fable, onyx, nova, and shimmer.
+    pub voice: AudioVoice,
+    /// The format of the generated audio. Supported formats are mp3, opus, aac, and flac.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub response_format: Option<AudioSpeechResponseFormat>,
+    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub speed: Option<f32>,
+    /// Serialized as the `stream` field of the request body; `create_speech_stream`
+    /// always sets it to `true`.
+    pub stream: bool,
+}
+
+/// One item of the stream returned by `create_speech_stream`.
+#[cfg(feature = "stream")]
+#[derive(Debug, Clone, PartialEq)]
+pub struct AudioSpeechResponseChunkResponse {
+    /// The raw bytes of this chunk, exactly as received from the response body.
+    pub bytes: Bytes,
+}
+
 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum AudioOutputFormat {