Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/feishu.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ The gateway downloads and forwards image and text file attachments to the AI age
| `text` | Text extracted, forwarded as prompt |
| `image` | Image downloaded, resized (max 1200px), JPEG compressed, base64 encoded → `ContentBlock::Image` |
| `file` | Text files only (`.txt`, `.py`, `.rs`, `.md`, `.json`, etc., max 512KB). Non-text files (`.pdf`, `.zip`, etc.) are silently ignored. |
| `audio` | Voice message downloaded (opus/ogg, max 25MB), base64 encoded, forwarded to core. If `[stt]` is enabled, core transcribes via Whisper API and injects `[Voice message transcript]: ...` into the prompt. If STT is disabled or transcription fails, the audio attachment is silently skipped. |
| `post` | Rich text: text nodes extracted as prompt, `img` nodes downloaded as image attachments. This is the format Feishu uses when @mention + paste image in a group. |

**Group chat limitation:** Feishu does not allow @mention and image upload in the same message. However, @mention + paste (Ctrl+V) an image works — Feishu sends this as a `post` message containing both the mention and the image. Direct image upload (via the attachment button) cannot include @mention, so the bot will not respond in groups.
Expand Down
6 changes: 3 additions & 3 deletions docs/stt.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Speech-to-Text (STT) for Voice Messages

openab can automatically transcribe Discord voice message attachments and forward the transcript to your ACP agent as text.
openab can automatically transcribe voice message attachments (Discord, Feishu, and other gateway platforms) and forward the transcript to your ACP agent as text.

## Quick Start

Expand All @@ -24,7 +24,7 @@ api_key = "${GROQ_API_KEY}"
## How It Works

```
Discord voice message (.ogg)
Voice message (Discord .ogg, Feishu opus/ogg, etc.)
openab downloads the audio file
Expand Down Expand Up @@ -170,6 +170,6 @@ When disabled, audio attachments are silently skipped with no impact on existing
## Technical Notes

- openab sends `response_format=json` in the transcription request to ensure the response is always parseable JSON. Some local whisper servers default to plain text output without this parameter.
- The actual MIME type from the Discord attachment is passed through to the STT API (e.g. `audio/ogg`, `audio/mp4`, `audio/wav`).
- The actual MIME type from the platform attachment is passed through to the STT API (e.g. `audio/ogg` for Discord and Feishu voice messages, `audio/mp4`, `audio/wav`).
- Environment variables in config values are expanded via `${VAR}` syntax (e.g. `api_key = "${GROQ_API_KEY}"`).
- The `api_key` field is auto-detected from the `GROQ_API_KEY` environment variable when using the default Groq endpoint. If you set a custom `base_url` (e.g. local server), auto-detect is disabled to avoid leaking the Groq key to unrelated endpoints — you must set `api_key` explicitly.
70 changes: 69 additions & 1 deletion gateway/src/adapters/feishu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ mod event_types {
let sender = event.sender.as_ref()?;

let msg_type = msg.message_type.as_deref().unwrap_or("text");
if !matches!(msg_type, "text" | "image" | "file" | "post") {
if !matches!(msg_type, "text" | "image" | "file" | "post" | "audio") {
return None;
}
// Skip bot messages with explicit sender_type
Expand Down Expand Up @@ -385,6 +385,17 @@ mod event_types {
}];
(String::new(), mentions.1, refs)
}
"audio" => {
let file_key = content_json.get("file_key")?.as_str()?;
let mentions = extract_mentions(
"", msg.mentions.as_deref().unwrap_or(&[]), bot_open_id,
);
let refs = vec![MediaRef::Audio {
message_id: message_id.to_string(),
file_key: file_key.to_string(),
}];
(String::new(), mentions.1, refs)
}
"post" => {
// Rich text: content is {"title":"...","content":[[{tag,text,...},{tag,image_key,...}]]}
let mut texts = Vec::new();
Expand Down Expand Up @@ -1038,6 +1049,9 @@ async fn handle_ws_message(
MediaRef::File { message_id, file_key, file_name } => {
download_feishu_file(client, &api_base, &token, message_id, file_key, file_name).await
}
MediaRef::Audio { message_id, file_key } => {
download_feishu_audio(client, &api_base, &token, message_id, file_key).await
}
};
if let Some(att) = attachment {
gateway_event.content.attachments.push(att);
Expand Down Expand Up @@ -1343,6 +1357,7 @@ fn try_parse_link(chars: &[char], start: usize) -> Option<(String, String, usize
pub enum MediaRef {
    /// An image, identified by its `image_key` within the message `message_id`.
    Image { message_id: String, image_key: String },
    /// A file attachment, identified by its `file_key` within the message
    /// `message_id`; `file_name` is passed along to `download_feishu_file`.
    File { message_id: String, file_key: String, file_name: String },
    /// An audio (voice) message, identified by its `file_key` within the
    /// message `message_id`; resolved via `download_feishu_audio`.
    Audio { message_id: String, file_key: String },
}

const IMAGE_MAX_DIMENSION_PX: u32 = 1200;
Expand Down Expand Up @@ -1497,6 +1512,56 @@ pub async fn download_feishu_file(
})
}

/// Maximum accepted size, in bytes, of a downloaded Feishu audio message:
/// 25 MB (Whisper API limit).
const AUDIO_MAX_DOWNLOAD: u64 = 25 * 1024 * 1024;

/// Download a Feishu audio message by message_id + file_key → base64 Attachment.
pub async fn download_feishu_audio(
client: &reqwest::Client,
api_base: &str,
token: &str,
message_id: &str,
file_key: &str,
) -> Option<crate::schema::Attachment> {
let url = format!(
"{}/open-apis/im/v1/messages/{}/resources/{}?type=file",
api_base, message_id, file_key
);
let resp = match client.get(&url).bearer_auth(token).send().await {
Ok(r) => r,
Err(e) => {
tracing::warn!(file_key, error = %e, "feishu audio download failed");
return None;
}
};
if !resp.status().is_success() {
tracing::warn!(file_key, status = %resp.status(), "feishu audio download failed");
return None;
}
if let Some(cl) = resp.headers().get(reqwest::header::CONTENT_LENGTH) {
if let Ok(size) = cl.to_str().unwrap_or("0").parse::<u64>() {
if size > AUDIO_MAX_DOWNLOAD {
tracing::warn!(file_key, size, "feishu audio exceeds 25MB limit");
return None;
}
}
}
let bytes = resp.bytes().await.ok()?;
if bytes.len() as u64 > AUDIO_MAX_DOWNLOAD {
tracing::warn!(file_key, size = bytes.len(), "feishu audio exceeds 25MB limit");
return None;
}
tracing::debug!(file_key, size = bytes.len(), "feishu audio downloaded");
use base64::Engine;
let data = base64::engine::general_purpose::STANDARD.encode(&bytes);
Some(crate::schema::Attachment {
attachment_type: "audio".into(),
filename: format!("{}.ogg", file_key),
mime_type: "audio/ogg".into(),
data,
size: bytes.len() as u64,
})
}

/// Send a post (rich text) message to a feishu chat_id.
/// Returns the sent message_id on success, None on failure.
/// When `reply_to` is Some(root_id), uses the reply API to stay in a thread.
Expand Down Expand Up @@ -2263,6 +2328,9 @@ pub async fn webhook(
MediaRef::File { message_id, file_key, file_name } => {
download_feishu_file(&feishu.client, &api_base, &token, message_id, file_key, file_name).await
}
MediaRef::Audio { message_id, file_key } => {
download_feishu_audio(&feishu.client, &api_base, &token, message_id, file_key).await
}
};
if let Some(att) = attachment {
gateway_event.content.attachments.push(att);
Expand Down
20 changes: 20 additions & 0 deletions src/gateway.rs
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,7 @@ pub struct GatewayParams {
pub allow_all_users: bool,
pub allowed_users: Vec<String>,
pub streaming: bool,
pub stt: crate::config::SttConfig,
}

pub async fn run_gateway_adapter(
Expand All @@ -506,6 +507,7 @@ pub async fn run_gateway_adapter(
let allow_all_users = params.allow_all_users;
let allowed_users = params.allowed_users;
let streaming = params.streaming;
let stt_config = params.stt;

let connect_url = match &params.token {
Some(token) => {
Expand Down Expand Up @@ -676,6 +678,24 @@ pub async fn run_gateway_adapter(
});
}
}
"audio" if stt_config.enabled => {
use base64::Engine;
if let Ok(audio_bytes) = base64::engine::general_purpose::STANDARD.decode(&att.data) {
if let Some(transcript) = crate::stt::transcribe(
&crate::media::HTTP_CLIENT,
&stt_config,
audio_bytes,
att.filename.clone(),
&att.mime_type,
).await {
extra_blocks.push(ContentBlock::Text {
text: format!("[Voice message transcript]: {transcript}"),
});
}
} else {
warn!(filename = %att.filename, "audio attachment base64 decode failed");
}
}
_ => {}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ async fn main() -> anyhow::Result<()> {
),
allowed_users: gw_cfg.allowed_users,
streaming: gw_cfg.streaming,
stt: cfg.stt.clone(),
};
let gw_router = router.clone();
Some(tokio::spawn(async move {
Expand Down
Loading