feat(omni): add voice listing functionality and update voice options

hopelynd · hopelynd · commit 9761932b4c8a · 2026-06-23T14:50:09.000+08:00
diff --git a/packages/cli/src/commands/omni/chat.ts b/packages/cli/src/commands/omni/chat.ts
@@ -16,12 +16,52 @@ import {
   type StreamChunk,
   isInteractive,
   resolveFileUrl,
+  resolveOutputDir,
+  resolveCredential,
 } from "bailian-cli-core";
 import { promptText, failIfMissing } from "../../output/prompt.ts";
 import { emitResult } from "../../output/output.ts";
-import { resolveOutputDir, resolveCredential } from "bailian-cli-core";
 
-const OMNI_VOICES = ["Chelsie", "Cherry", "Ethan", "Serena", "Sunny", "Tina"];
+interface VoiceEntry { // One row of the voice table printed by --list-voices.
+  voice: string; // Voice ID; printed under the "VOICE ID" column.
+  name: string; // Display name; printed under the "NAME" column.
+  desc: string; // Short description; printed under the "DESCRIPTION" column.
+  lang: string; // Language label; printed under the "LANGUAGE" column.
+}
+
+// qwen-omni system voices (the rows listed by --list-voices)
+const OMNI_VOICES: VoiceEntry[] = [
+  { voice: "Dylan", name: "北京-晓东", desc: "胡同少年", lang: "中文/北京" },
+  { voice: "Kiki", name: "粤语-阿清", desc: "甜美港妹", lang: "中文/英文" },
+  { voice: "Li", name: "南京-老李", desc: "南京大叔", lang: "中文/英文" },
+  { voice: "Sunny", name: "四川-晴儿", desc: "甜飒川妹", lang: "中文" },
+  { voice: "Elias", name: "墨讲师", desc: "学术讲师女", lang: "中文/英文" },
+  { voice: "Nofish", name: "不吃鱼", desc: "南方口音男", lang: "中文/英文" },
+  { voice: "Marcus", name: "陕西-秦川", desc: "陕北汉子", lang: "中文/英文" },
+  { voice: "Eric", name: "四川-程川", desc: "成都大哥", lang: "中文/英文" },
+  { voice: "Jada", name: "上海-阿珍", desc: "沪上阿姐", lang: "中文" },
+  { voice: "Rocky", name: "粤语-阿强", desc: "幽默港仔", lang: "中文/英文" },
+  { voice: "Cherry", name: "芊悦", desc: "阳光自然女", lang: "中文/英文" },
+  { voice: "Roy", name: "闽南-阿杰", desc: "闽南哥仔", lang: "中文/英文" },
+  { voice: "Jennifer", name: "詹妮弗", desc: "美剧大女主", lang: "中文/英文" },
+  { voice: "Ryan", name: "甜茶", desc: "美剧张力男", lang: "中文/英文" },
+  { voice: "Katerina", name: "卡捷琳娜", desc: "御姐深情女", lang: "中文/英文" },
+  { voice: "Peter", name: "天津-李彼得", desc: "天津捧哏", lang: "中文/英文" },
+  { voice: "Ethan", name: "晨煦", desc: "北方口音男", lang: "中文/英文" },
+];
+
+function printVoiceList(): void { // Writes the OMNI_VOICES table (title, header, rule, rows, total) to stdout.
+  const col = (s: string, w: number) => s.padEnd(w); // NOTE(review): padEnd pads by string length, not display width — CJK name/desc cells may not line up in a terminal; confirm.
+  process.stdout.write("\nOmni output voices:\n");
+  process.stdout.write(
+    `${col("VOICE ID", 12)} ${col("NAME", 14)} ${col("DESCRIPTION", 14)} LANGUAGE\n`,
+  );
+  process.stdout.write(`${"-".repeat(12)} ${"-".repeat(14)} ${"-".repeat(14)} ${"-".repeat(12)}\n`); // Dashed rule under the header (12/14/14/12).
+  for (const v of OMNI_VOICES) {
+    process.stdout.write(`${col(v.voice, 12)} ${col(v.name, 14)} ${col(v.desc, 14)} ${v.lang}\n`); // Last column is written unpadded.
+  }
+  process.stdout.write(`\nTotal: ${OMNI_VOICES.length} voices\n`); // Trailing count of listed voices.
+}
 
 /**
  * Extension to input audio format.
@@ -110,7 +150,11 @@ export default defineCommand({
     },
     {
       flag: "--voice <voice>",
-      description: `Output voice (default: Cherry). Options: ${OMNI_VOICES.join(", ")}`,
+      description: "Output voice ID (default: Cherry). Use --list-voices to see all options",
+    },
+    {
+      flag: "--list-voices",
+      description: "List available output voices and exit",
     },
     { flag: "--audio-format <fmt>", description: "Audio output format (default: wav)" },
     { flag: "--audio-out <path>", description: "Save audio to file (default: auto-generate)" },
@@ -119,6 +163,7 @@ export default defineCommand({
     { flag: "--temperature <n>", description: "Sampling temperature (0.0, 2.0]", type: "number" },
   ],
   examples: [
+    "bl omni --list-voices",
     'bl omni --message "Hello, who are you?"',
     'bl omni --message "Describe this image" --image ./photo.jpg',
     'bl omni --message "What is this audio saying?" --audio https://example.com/audio.wav',
@@ -129,6 +174,11 @@ export default defineCommand({
     'bl omni --message "Read this passage aloud" --audio-out greeting.wav',
   ],
   async run(config: Config, flags: GlobalFlags) {
+    if (flags.listVoices) {
+      printVoiceList();
+      return;
+    }
+
     // --- Parse messages ---
     let userMessages: string[] = [];
     if (flags.message) {
diff --git a/packages/cli/src/commands/speech/synthesize.ts b/packages/cli/src/commands/speech/synthesize.ts
@@ -20,12 +20,14 @@ import {
   DOCS_HOSTS,
 } from "bailian-cli-core";
 
-const COSYVOICE_CLONE_DESIGN_DOC = `${DOCS_HOSTS.cn}/cosyvoice-clone-design-api`;
+import { VOICE_TTS_PAGE } from "../../urls.ts";
 import { downloadFile } from "../../utils/download.ts";
 import { runConcurrent, downloadParallel, getConcurrency } from "../../utils/concurrent.ts";
 import { promptText, promptSelect, failIfMissing } from "../../output/prompt.ts";
 import { emitResult, emitBare } from "../../output/output.ts";
 
+const COSYVOICE_CLONE_DESIGN_DOC = `${DOCS_HOSTS.cn}/cosyvoice-clone-design-api`; // Docs URL printed by printVoiceList when a model has no system voices.
+
 interface VoiceEntry {
   voice: string;
   name: string;
@@ -37,7 +39,7 @@ interface VoiceEntry {
 const COSYVOICE_V3_FLASH_VOICES: VoiceEntry[] = [
   // 社交陪伴
   { voice: "longanyang", name: "龙安洋", desc: "阳光大男孩", lang: "中文/英文" },
-  { voice: "longanhuan", name: "龙安欢", desc: "欢脱元气女", lang: "中文/英文" },
+  { voice: "longanhuan_v3", name: "龙安欢", desc: "欢脱元气女", lang: "中文/英文" },
   { voice: "longantai_v3", name: "龙安台", desc: "嗲甜台湾女", lang: "中文/英文" },
   { voice: "longhua_v3", name: "龙华", desc: "元气甜美女", lang: "中文/英文" },
   { voice: "longcheng_v3", name: "龙橙", desc: "智慧青年男", lang: "中文/英文" },
@@ -121,12 +123,14 @@ function printVoiceList(model: string): void {
   const voices = MODEL_VOICES[model];
   if (!voices) {
     process.stdout.write(`No built-in voice list available for model: ${model}\n`);
+    process.stdout.write(`Browse voices in the console: ${VOICE_TTS_PAGE}\n`);
     return;
   }
   if (voices.length === 0) {
     process.stdout.write(`Model ${model} has no system voices.\n`);
     process.stdout.write("Use clone or design voices created via the CosyVoice API.\n");
     process.stdout.write(`See: ${COSYVOICE_CLONE_DESIGN_DOC}\n`);
+    process.stdout.write(`Browse voices in the console: ${VOICE_TTS_PAGE}\n`);
     return;
   }
   const col = (s: string, w: number) => s.padEnd(w);
@@ -139,6 +143,7 @@ function printVoiceList(model: string): void {
     process.stdout.write(`${col(v.voice, 26)} ${col(v.name, 10)} ${col(v.desc, 16)} ${v.lang}\n`);
   }
   process.stdout.write(`\nTotal: ${voices.length} voices\n`);
+  process.stdout.write(`Preview and browse more voices in the console: \n${VOICE_TTS_PAGE}\n`);
 }
 
 export default defineCommand({
@@ -156,11 +161,12 @@ export default defineCommand({
     {
       flag: "--voice <voice>",
       description:
-        "Voice ID. Use --list-voices to see system voices for cosyvoice-v3-flash; for v3.5-flash provide a clone/design voice ID",
+        "Voice ID. Use --list-voices to see built-in voices for cosyvoice-v3-flash; for v3.5-flash provide a clone/design voice ID",
     },
     {
       flag: "--list-voices",
-      description: "List available system voices for the selected model and exit",
+      description:
+        "List built-in system voices for the selected model and exit (console link shown in output)",
     },
     { flag: "--format <format>", description: "Audio format: mp3, pcm, wav, opus (default: mp3)" },
     { flag: "--sample-rate <rate>", description: "Audio sample rate in Hz (e.g. 24000)" },
@@ -264,7 +270,7 @@ export default defineCommand({
         const modelVoices = MODEL_VOICES[model];
         if (modelVoices && modelVoices.length > 0) {
           throw new BailianError(
-            `--voice is required.\nRun the following to see available voices:\n  bl speech synthesize --list-voices --model ${model}`,
+            `--voice is required.\nRun the following to see available voices:\n  bl speech synthesize --list-voices --model ${model}\nBrowse more voices: ${VOICE_TTS_PAGE}`,
             ExitCode.USAGE,
           );
         } else {
diff --git a/packages/cli/src/urls.ts b/packages/cli/src/urls.ts
@@ -14,3 +14,6 @@ export const BAILIAN_CONSOLE = `${BAILIAN_CONSOLE_ROOT}/cn-beijing`;
 
 /** Direct deep link to API key management page. */
 export const API_KEY_PAGE = `${BAILIAN_CONSOLE}/?tab=app#/api-key`;
+
+/** CosyVoice voice list on the Aliyun help site (a help.aliyun.com documentation URL, not a console deep link). */
+export const VOICE_TTS_PAGE = "https://help.aliyun.com/zh/model-studio/cosyvoice-voice-list";
diff --git a/packages/cli/tests/e2e/omni.e2e.test.ts b/packages/cli/tests/e2e/omni.e2e.test.ts
@@ -15,6 +15,15 @@ describe("e2e: omni", () => {
     expect(exitCode, stderr).toBe(0);
     expect(stderr).toMatch(/omni|--message|--audio|text-only/i);
   });
+
+  test("omni --list-voices 输出音色列表并退出", async () => { // Title (zh): "omni --list-voices prints the voice list and exits".
+    const { stdout, stderr, exitCode } = await runCli(["omni", "--list-voices"]);
+    expect(exitCode, stderr).toBe(0); // stderr is expect's second argument — presumably shown as the failure message; confirm.
+    expect(stdout).toMatch(/Omni output voices:/);
+    expect(stdout).toMatch(/Dylan/);
+    expect(stdout).toMatch(/Cherry/);
+    expect(stdout).toMatch(/Total: 17 voices/); // Hard-coded count; must be updated whenever the omni voice list changes.
+  });
 });
 
 describe.skipIf(!isBailianE2EMediaEnabled() || !isDashScopeE2EReady())(
diff --git a/skills/bailian-cli/reference/omni.md b/skills/bailian-cli/reference/omni.md
@@ -23,23 +23,28 @@ Index: [index.md](index.md)
 
 #### Options
 
-| Flag                   | Type    | Required | Description                                                                          |
-| ---------------------- | ------- | -------- | ------------------------------------------------------------------------------------ |
-| `--message <text>`     | array   | yes      | Message text (repeatable, prefix role: to set role)                                  |
-| `--model <model>`      | string  | no       | Model ID (default: qwen3.5-omni-plus)                                                |
-| `--system <text>`      | string  | no       | System prompt                                                                        |
-| `--image <url>`        | array   | no       | Image URL or local file (repeatable)                                                 |
-| `--audio <url>`        | array   | no       | Audio URL or local file (.wav/.mp3/.amr/.aac/.m4a/.ogg/.3gp/.3gpp)                   |
-| `--video <url>`        | array   | no       | Video file URL / local path, or comma-separated frame URLs                           |
-| `--voice <voice>`      | string  | no       | Output voice (default: Cherry). Options: Chelsie, Cherry, Ethan, Serena, Sunny, Tina |
-| `--audio-format <fmt>` | string  | no       | Audio output format (default: wav)                                                   |
-| `--audio-out <path>`   | string  | no       | Save audio to file (default: auto-generate)                                          |
-| `--text-only`          | boolean | no       | Output text only, no audio generation                                                |
-| `--max-tokens <n>`     | number  | no       | Maximum tokens to generate                                                           |
-| `--temperature <n>`    | number  | no       | Sampling temperature (0.0, 2.0]                                                      |
+| Flag                   | Type    | Required | Description                                                             |
+| ---------------------- | ------- | -------- | ----------------------------------------------------------------------- |
+| `--message <text>`     | array   | yes      | Message text (repeatable, prefix role: to set role)                     |
+| `--model <model>`      | string  | no       | Model ID (default: qwen3.5-omni-plus)                                   |
+| `--system <text>`      | string  | no       | System prompt                                                           |
+| `--image <url>`        | array   | no       | Image URL or local file (repeatable)                                    |
+| `--audio <url>`        | array   | no       | Audio URL or local file (.wav/.mp3/.amr/.aac/.m4a/.ogg/.3gp/.3gpp)      |
+| `--video <url>`        | array   | no       | Video file URL / local path, or comma-separated frame URLs              |
+| `--voice <voice>`      | string  | no       | Output voice ID (default: Cherry). Use --list-voices to see all options |
+| `--list-voices`        | boolean | no       | List available output voices and exit                                   |
+| `--audio-format <fmt>` | string  | no       | Audio output format (default: wav)                                      |
+| `--audio-out <path>`   | string  | no       | Save audio to file (default: auto-generate)                             |
+| `--text-only`          | boolean | no       | Output text only, no audio generation                                   |
+| `--max-tokens <n>`     | number  | no       | Maximum tokens to generate                                              |
+| `--temperature <n>`    | number  | no       | Sampling temperature (0.0, 2.0]                                         |
 
 #### Examples
 
+```bash
+bl omni --list-voices
+```
+
 ```bash
 bl omni --message "Hello, who are you?"
 ```
diff --git a/skills/bailian-cli/reference/speech.md b/skills/bailian-cli/reference/speech.md
@@ -77,24 +77,24 @@ bl speech recognize --url https://example.com/audio.mp3 --no-wait --quiet
 
 #### Options
 
-| Flag                   | Type    | Required | Description                                                                                                             |
-| ---------------------- | ------- | -------- | ----------------------------------------------------------------------------------------------------------------------- |
-| `--text <text>`        | string  | yes      | Text to synthesize into speech                                                                                          |
-| `--text-file <path>`   | string  | no       | Read text from a file instead of --text                                                                                 |
-| `--model <model>`      | string  | no       | Model ID (default: cosyvoice-v3-flash). System voices available for cosyvoice-v3-flash                                  |
-| `--voice <voice>`      | string  | no       | Voice ID. Use --list-voices to see system voices for cosyvoice-v3-flash; for v3.5-flash provide a clone/design voice ID |
-| `--list-voices`        | boolean | no       | List available system voices for the selected model and exit                                                            |
-| `--format <format>`    | string  | no       | Audio format: mp3, pcm, wav, opus (default: mp3)                                                                        |
-| `--sample-rate <rate>` | string  | no       | Audio sample rate in Hz (e.g. 24000)                                                                                    |
-| `--volume <volume>`    | string  | no       | Volume 0-100 (default: 50)                                                                                              |
-| `--rate <rate>`        | string  | no       | Speech rate 0.5-2.0 (default: 1.0)                                                                                      |
-| `--pitch <pitch>`      | string  | no       | Pitch multiplier 0.5-2.0 (default: 1.0)                                                                                 |
-| `--seed <seed>`        | string  | no       | Random seed 0-65535 for reproducible synthesis                                                                          |
-| `--language <lang>`    | string  | no       | Language hint (e.g. zh, en, ja, ko, fr, de)                                                                             |
-| `--instruction <text>` | string  | no       | Natural language instruction to control speech style (e.g. "Use a gentle tone"）                                        |
-| `--enable-ssml`        | boolean | no       | Enable SSML markup parsing in input text                                                                                |
-| `--out <path>`         | string  | no       | Save audio to file (default: auto-generate in temp dir)                                                                 |
-| `--stream`             | boolean | no       | Stream raw PCM audio to stdout (pipe to player)                                                                         |
+| Flag                   | Type    | Required | Description                                                                                                               |
+| ---------------------- | ------- | -------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `--text <text>`        | string  | yes      | Text to synthesize into speech                                                                                            |
+| `--text-file <path>`   | string  | no       | Read text from a file instead of --text                                                                                   |
+| `--model <model>`      | string  | no       | Model ID (default: cosyvoice-v3-flash). System voices available for cosyvoice-v3-flash                                    |
+| `--voice <voice>`      | string  | no       | Voice ID. Use --list-voices to see built-in voices for cosyvoice-v3-flash; for v3.5-flash provide a clone/design voice ID |
+| `--list-voices`        | boolean | no       | List built-in system voices for the selected model and exit (console link shown in output)                                |
+| `--format <format>`    | string  | no       | Audio format: mp3, pcm, wav, opus (default: mp3)                                                                          |
+| `--sample-rate <rate>` | string  | no       | Audio sample rate in Hz (e.g. 24000)                                                                                      |
+| `--volume <volume>`    | string  | no       | Volume 0-100 (default: 50)                                                                                                |
+| `--rate <rate>`        | string  | no       | Speech rate 0.5-2.0 (default: 1.0)                                                                                        |
+| `--pitch <pitch>`      | string  | no       | Pitch multiplier 0.5-2.0 (default: 1.0)                                                                                   |
+| `--seed <seed>`        | string  | no       | Random seed 0-65535 for reproducible synthesis                                                                            |
+| `--language <lang>`    | string  | no       | Language hint (e.g. zh, en, ja, ko, fr, de)                                                                               |
+| `--instruction <text>` | string  | no       | Natural language instruction to control speech style (e.g. "Use a gentle tone")                                           |
+| `--enable-ssml`        | boolean | no       | Enable SSML markup parsing in input text                                                                                  |
+| `--out <path>`         | string  | no       | Save audio to file (default: auto-generate in temp dir)                                                                   |
+| `--stream`             | boolean | no       | Stream raw PCM audio to stdout (pipe to player)                                                                           |
 
 #### Examples