From a07cd64715c4d7e161c1f7381b61098367d71fa6 Mon Sep 17 00:00:00 2001 From: wangyuyan-agent <265828726+wangyuyan-agent@users.noreply.github.com> Date: Wed, 6 May 2026 23:58:02 +0800 Subject: [PATCH] feat(gateway): feishu voice message STT via gateway audio attachment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add msg_type=audio support to feishu adapter (parse, download, base64 encode) - Add MediaRef::Audio variant and download_feishu_audio() function - Add "audio" attachment type to core gateway handler (decode → stt::transcribe) - Pass SttConfig to gateway handler via GatewayParams - Update docs/feishu.md and docs/stt.md for multi-platform voice support Feishu voice messages (opus/ogg) are downloaded by the gateway, passed as base64-encoded audio attachments to core, and transcribed via the existing [stt] infrastructure (Groq Whisper by default). This is the first gateway platform to support audio — LINE/Telegram can reuse the core-side handler. Tested: 102 gateway tests + 197 core tests pass. E2E verified. --- docs/feishu.md | 1 + docs/stt.md | 6 +-- gateway/src/adapters/feishu.rs | 70 +++++++++++++++++++++++++++++++++- src/gateway.rs | 20 ++++++++++ src/main.rs | 1 + 5 files changed, 94 insertions(+), 4 deletions(-) diff --git a/docs/feishu.md b/docs/feishu.md index f1139d08..c18f8dca 100644 --- a/docs/feishu.md +++ b/docs/feishu.md @@ -167,6 +167,7 @@ The gateway downloads and forwards image and text file attachments to the AI age | `text` | Text extracted, forwarded as prompt | | `image` | Image downloaded, resized (max 1200px), JPEG compressed, base64 encoded → `ContentBlock::Image` | | `file` | Text files only (`.txt`, `.py`, `.rs`, `.md`, `.json`, etc., max 512KB). Non-text files (`.pdf`, `.zip`, etc.) are silently ignored. | +| `audio` | Voice message downloaded (opus/ogg, max 25MB), base64 encoded, forwarded to core. 
If `[stt]` is enabled, core transcribes via Whisper API and injects `[Voice message transcript]: ...` into the prompt. If STT is disabled or fails, the message is silently skipped. | | `post` | Rich text: text nodes extracted as prompt, `img` nodes downloaded as image attachments. This is the format Feishu uses when @mention + paste image in a group. | **Group chat limitation:** Feishu does not allow @mention and image upload in the same message. However, @mention + paste (Ctrl+V) an image works — Feishu sends this as a `post` message containing both the mention and the image. Direct image upload (via the attachment button) cannot include @mention, so the bot will not respond in groups. diff --git a/docs/stt.md b/docs/stt.md index 202f9678..5e76ff54 100644 --- a/docs/stt.md +++ b/docs/stt.md @@ -1,6 +1,6 @@ # Speech-to-Text (STT) for Voice Messages -openab can automatically transcribe Discord voice message attachments and forward the transcript to your ACP agent as text. +openab can automatically transcribe voice message attachments (Discord, Feishu, and other gateway platforms) and forward the transcript to your ACP agent as text. ## Quick Start @@ -24,7 +24,7 @@ api_key = "${GROQ_API_KEY}" ## How It Works ``` -Discord voice message (.ogg) +Voice message (Discord .ogg, Feishu opus/ogg, etc.) │ ▼ openab downloads the audio file @@ -170,6 +170,6 @@ When disabled, audio attachments are silently skipped with no impact on existing ## Technical Notes - openab sends `response_format=json` in the transcription request to ensure the response is always parseable JSON. Some local whisper servers default to plain text output without this parameter. -- The actual MIME type from the Discord attachment is passed through to the STT API (e.g. `audio/ogg`, `audio/mp4`, `audio/wav`). +- The actual MIME type from the platform attachment is passed through to the STT API (e.g. `audio/ogg` for Discord and Feishu voice messages, `audio/mp4`, `audio/wav`). 
- Environment variables in config values are expanded via `${VAR}` syntax (e.g. `api_key = "${GROQ_API_KEY}"`). - The `api_key` field is auto-detected from the `GROQ_API_KEY` environment variable when using the default Groq endpoint. If you set a custom `base_url` (e.g. local server), auto-detect is disabled to avoid leaking the Groq key to unrelated endpoints — you must set `api_key` explicitly. diff --git a/gateway/src/adapters/feishu.rs b/gateway/src/adapters/feishu.rs index 09e97fe0..deff731b 100644 --- a/gateway/src/adapters/feishu.rs +++ b/gateway/src/adapters/feishu.rs @@ -297,7 +297,7 @@ mod event_types { let sender = event.sender.as_ref()?; let msg_type = msg.message_type.as_deref().unwrap_or("text"); - if !matches!(msg_type, "text" | "image" | "file" | "post") { + if !matches!(msg_type, "text" | "image" | "file" | "post" | "audio") { return None; } // Skip bot messages with explicit sender_type @@ -385,6 +385,17 @@ mod event_types { }]; (String::new(), mentions.1, refs) } + "audio" => { + let file_key = content_json.get("file_key")?.as_str()?; + let mentions = extract_mentions( + "", msg.mentions.as_deref().unwrap_or(&[]), bot_open_id, + ); + let refs = vec![MediaRef::Audio { + message_id: message_id.to_string(), + file_key: file_key.to_string(), + }]; + (String::new(), mentions.1, refs) + } "post" => { // Rich text: content is {"title":"...","content":[[{tag,text,...},{tag,image_key,...}]]} let mut texts = Vec::new(); @@ -1038,6 +1049,9 @@ async fn handle_ws_message( MediaRef::File { message_id, file_key, file_name } => { download_feishu_file(client, &api_base, &token, message_id, file_key, file_name).await } + MediaRef::Audio { message_id, file_key } => { + download_feishu_audio(client, &api_base, &token, message_id, file_key).await + } }; if let Some(att) = attachment { gateway_event.content.attachments.push(att); @@ -1343,6 +1357,7 @@ fn try_parse_link(chars: &[char], start: usize) -> Option<(String, String, usize pub enum MediaRef { Image { 
message_id: String, image_key: String }, File { message_id: String, file_key: String, file_name: String }, + Audio { message_id: String, file_key: String }, } const IMAGE_MAX_DIMENSION_PX: u32 = 1200; @@ -1497,6 +1512,56 @@ pub async fn download_feishu_file( }) } +const AUDIO_MAX_DOWNLOAD: u64 = 25 * 1024 * 1024; // 25 MB (Whisper API limit) + +/// Download a Feishu audio message by message_id + file_key → base64 Attachment. +pub async fn download_feishu_audio( + client: &reqwest::Client, + api_base: &str, + token: &str, + message_id: &str, + file_key: &str, +) -> Option<crate::schema::Attachment> { + let url = format!( + "{}/open-apis/im/v1/messages/{}/resources/{}?type=file", + api_base, message_id, file_key + ); + let resp = match client.get(&url).bearer_auth(token).send().await { + Ok(r) => r, + Err(e) => { + tracing::warn!(file_key, error = %e, "feishu audio download failed"); + return None; + } + }; + if !resp.status().is_success() { + tracing::warn!(file_key, status = %resp.status(), "feishu audio download failed"); + return None; + } + if let Some(cl) = resp.headers().get(reqwest::header::CONTENT_LENGTH) { + if let Ok(size) = cl.to_str().unwrap_or("0").parse::<u64>() { + if size > AUDIO_MAX_DOWNLOAD { + tracing::warn!(file_key, size, "feishu audio exceeds 25MB limit"); + return None; + } + } + } + let bytes = resp.bytes().await.ok()?; + if bytes.len() as u64 > AUDIO_MAX_DOWNLOAD { + tracing::warn!(file_key, size = bytes.len(), "feishu audio exceeds 25MB limit"); + return None; + } + tracing::debug!(file_key, size = bytes.len(), "feishu audio downloaded"); + use base64::Engine; + let data = base64::engine::general_purpose::STANDARD.encode(&bytes); + Some(crate::schema::Attachment { + attachment_type: "audio".into(), + filename: format!("{}.ogg", file_key), + mime_type: "audio/ogg".into(), + data, + size: bytes.len() as u64, + }) +} + /// Send a post (rich text) message to a feishu chat_id. /// Returns the sent message_id on success, None on failure.
/// When `reply_to` is Some(root_id), uses the reply API to stay in a thread. @@ -2263,6 +2328,9 @@ pub async fn webhook( MediaRef::File { message_id, file_key, file_name } => { download_feishu_file(&feishu.client, &api_base, &token, message_id, file_key, file_name).await } + MediaRef::Audio { message_id, file_key } => { + download_feishu_audio(&feishu.client, &api_base, &token, message_id, file_key).await + } }; if let Some(att) = attachment { gateway_event.content.attachments.push(att); diff --git a/src/gateway.rs b/src/gateway.rs index d8fa967c..8c13873e 100644 --- a/src/gateway.rs +++ b/src/gateway.rs @@ -488,6 +488,7 @@ pub struct GatewayParams { pub allow_all_users: bool, pub allowed_users: Vec<String>, pub streaming: bool, + pub stt: crate::config::SttConfig, } pub async fn run_gateway_adapter( @@ -506,6 +507,7 @@ pub async fn run_gateway_adapter( let allow_all_users = params.allow_all_users; let allowed_users = params.allowed_users; let streaming = params.streaming; + let stt_config = params.stt; let connect_url = match &params.token { Some(token) => { @@ -676,6 +678,24 @@ pub async fn run_gateway_adapter( }); } } + "audio" if stt_config.enabled => { + use base64::Engine; + if let Ok(audio_bytes) = base64::engine::general_purpose::STANDARD.decode(&att.data) { + if let Some(transcript) = crate::stt::transcribe( + &crate::media::HTTP_CLIENT, + &stt_config, + audio_bytes, + att.filename.clone(), + &att.mime_type, + ).await { + extra_blocks.push(ContentBlock::Text { + text: format!("[Voice message transcript]: {transcript}"), + }); + } + } else { + warn!(filename = %att.filename, "audio attachment base64 decode failed"); + } + } _ => {} } } diff --git a/src/main.rs b/src/main.rs index 706079b6..c2e5f41a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -298,6 +298,7 @@ async fn main() -> anyhow::Result<()> { ), allowed_users: gw_cfg.allowed_users, streaming: gw_cfg.streaming, + stt: cfg.stt.clone(), }; let gw_router = router.clone(); Some(tokio::spawn(async move {