Skip to content

Commit 9761932

Browse files
committed
feat(omni): add voice listing functionality and update voice options
1 parent 19c4f5f commit 9761932

6 files changed

Lines changed: 113 additions & 40 deletions

File tree

packages/cli/src/commands/omni/chat.ts

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,52 @@ import {
1616
type StreamChunk,
1717
isInteractive,
1818
resolveFileUrl,
19+
resolveOutputDir,
20+
resolveCredential,
1921
} from "bailian-cli-core";
2022
import { promptText, failIfMissing } from "../../output/prompt.ts";
2123
import { emitResult } from "../../output/output.ts";
22-
import { resolveOutputDir, resolveCredential } from "bailian-cli-core";
2324

24-
const OMNI_VOICES = ["Chelsie", "Cherry", "Ethan", "Serena", "Sunny", "Tina"];
25+
/** One selectable output voice, as shown by `--list-voices`. */
interface VoiceEntry {
  /** Voice ID (printed under the VOICE ID column). */
  voice: string;
  /** Display name (printed under the NAME column). */
  name: string;
  /** Short persona description (printed under the DESCRIPTION column). */
  desc: string;
  /** Language label (printed under the LANGUAGE column). */
  lang: string;
}

// qwen-omni system voices
const OMNI_VOICES: VoiceEntry[] = [
  { voice: "Dylan", name: "北京-晓东", desc: "胡同少年", lang: "中文/北京" },
  { voice: "Kiki", name: "粤语-阿清", desc: "甜美港妹", lang: "中文/英文" },
  { voice: "Li", name: "南京-老李", desc: "南京大叔", lang: "中文/英文" },
  { voice: "Sunny", name: "四川-晴儿", desc: "甜飒川妹", lang: "中文" },
  { voice: "Elias", name: "墨讲师", desc: "学术讲师女", lang: "中文/英文" },
  { voice: "Nofish", name: "不吃鱼", desc: "南方口音男", lang: "中文/英文" },
  { voice: "Marcus", name: "陕西-秦川", desc: "陕北汉子", lang: "中文/英文" },
  { voice: "Eric", name: "四川-程川", desc: "成都大哥", lang: "中文/英文" },
  { voice: "Jada", name: "上海-阿珍", desc: "沪上阿姐", lang: "中文" },
  { voice: "Rocky", name: "粤语-阿强", desc: "幽默港仔", lang: "中文/英文" },
  { voice: "Cherry", name: "芊悦", desc: "阳光自然女", lang: "中文/英文" },
  { voice: "Roy", name: "闽南-阿杰", desc: "闽南哥仔", lang: "中文/英文" },
  { voice: "Jennifer", name: "詹妮弗", desc: "美剧大女主", lang: "中文/英文" },
  { voice: "Ryan", name: "甜茶", desc: "美剧张力男", lang: "中文/英文" },
  { voice: "Katerina", name: "卡捷琳娜", desc: "御姐深情女", lang: "中文/英文" },
  { voice: "Peter", name: "天津-李彼得", desc: "天津捧哏", lang: "中文/英文" },
  { voice: "Ethan", name: "晨煦", desc: "北方口音男", lang: "中文/英文" },
];

/**
 * Reports whether a code point falls in one of the East Asian wide/fullwidth
 * ranges (CJK ideographs, kana, Hangul, fullwidth forms) that terminals
 * typically draw two columns wide.
 */
function isWideCodePoint(cp: number): boolean {
  return (
    (cp >= 0x1100 && cp <= 0x115f) ||
    (cp >= 0x2e80 && cp <= 0x303e) ||
    (cp >= 0x3041 && cp <= 0x33ff) ||
    (cp >= 0x3400 && cp <= 0x4dbf) ||
    (cp >= 0x4e00 && cp <= 0x9fff) ||
    (cp >= 0xa000 && cp <= 0xa4cf) ||
    (cp >= 0xac00 && cp <= 0xd7a3) ||
    (cp >= 0xf900 && cp <= 0xfaff) ||
    (cp >= 0xfe30 && cp <= 0xfe4f) ||
    (cp >= 0xff00 && cp <= 0xff60) ||
    (cp >= 0xffe0 && cp <= 0xffe6) ||
    (cp >= 0x20000 && cp <= 0x3fffd)
  );
}

/** Number of terminal columns `s` occupies, counting wide code points as 2. */
function displayWidth(s: string): number {
  let width = 0;
  // Iterate by code point (not UTF-16 unit) so surrogate pairs count once.
  for (const ch of s) {
    width += isWideCodePoint(ch.codePointAt(0) ?? 0) ? 2 : 1;
  }
  return width;
}

/**
 * Right-pads `s` with spaces up to `width` terminal columns.
 * Strings already at or beyond `width` are returned unchanged (same as `padEnd`).
 */
function padDisplayEnd(s: string, width: number): string {
  const missing = width - displayWidth(s);
  return missing > 0 ? s + " ".repeat(missing) : s;
}

/**
 * Writes the omni voice table to stdout: a title, a header row, a separator,
 * one row per entry in `OMNI_VOICES`, and a total count.
 *
 * Columns are padded by display width rather than string length, because the
 * NAME and DESCRIPTION values are CJK text that occupies two columns per
 * character; `String.prototype.padEnd` would leave those rows misaligned
 * with the ASCII header.
 */
function printVoiceList(): void {
  const col = padDisplayEnd;
  process.stdout.write("\nOmni output voices:\n");
  process.stdout.write(
    `${col("VOICE ID", 12)} ${col("NAME", 14)} ${col("DESCRIPTION", 14)} LANGUAGE\n`,
  );
  process.stdout.write(`${"-".repeat(12)} ${"-".repeat(14)} ${"-".repeat(14)} ${"-".repeat(12)}\n`);
  for (const v of OMNI_VOICES) {
    process.stdout.write(`${col(v.voice, 12)} ${col(v.name, 14)} ${col(v.desc, 14)} ${v.lang}\n`);
  }
  process.stdout.write(`\nTotal: ${OMNI_VOICES.length} voices\n`);
}
2565

2666
/**
2767
* Extension to input audio format.
@@ -110,7 +150,11 @@ export default defineCommand({
110150
},
111151
{
112152
flag: "--voice <voice>",
113-
description: `Output voice (default: Cherry). Options: ${OMNI_VOICES.join(", ")}`,
153+
description: "Output voice ID (default: Cherry). Use --list-voices to see all options",
154+
},
155+
{
156+
flag: "--list-voices",
157+
description: "List available output voices and exit",
114158
},
115159
{ flag: "--audio-format <fmt>", description: "Audio output format (default: wav)" },
116160
{ flag: "--audio-out <path>", description: "Save audio to file (default: auto-generate)" },
@@ -119,6 +163,7 @@ export default defineCommand({
119163
{ flag: "--temperature <n>", description: "Sampling temperature (0.0, 2.0]", type: "number" },
120164
],
121165
examples: [
166+
"bl omni --list-voices",
122167
'bl omni --message "Hello, who are you?"',
123168
'bl omni --message "Describe this image" --image ./photo.jpg',
124169
'bl omni --message "What is this audio saying?" --audio https://example.com/audio.wav',
@@ -129,6 +174,11 @@ export default defineCommand({
129174
'bl omni --message "Read this passage aloud" --audio-out greeting.wav',
130175
],
131176
async run(config: Config, flags: GlobalFlags) {
177+
if (flags.listVoices) {
178+
printVoiceList();
179+
return;
180+
}
181+
132182
// --- Parse messages ---
133183
let userMessages: string[] = [];
134184
if (flags.message) {

packages/cli/src/commands/speech/synthesize.ts

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,14 @@ import {
2020
DOCS_HOSTS,
2121
} from "bailian-cli-core";
2222

23-
const COSYVOICE_CLONE_DESIGN_DOC = `${DOCS_HOSTS.cn}/cosyvoice-clone-design-api`;
23+
import { VOICE_TTS_PAGE } from "../../urls.ts";
2424
import { downloadFile } from "../../utils/download.ts";
2525
import { runConcurrent, downloadParallel, getConcurrency } from "../../utils/concurrent.ts";
2626
import { promptText, promptSelect, failIfMissing } from "../../output/prompt.ts";
2727
import { emitResult, emitBare } from "../../output/output.ts";
2828

29+
const COSYVOICE_CLONE_DESIGN_DOC = `${DOCS_HOSTS.cn}/cosyvoice-clone-design-api`;
30+
2931
interface VoiceEntry {
3032
voice: string;
3133
name: string;
@@ -37,7 +39,7 @@ interface VoiceEntry {
3739
const COSYVOICE_V3_FLASH_VOICES: VoiceEntry[] = [
3840
// 社交陪伴
3941
{ voice: "longanyang", name: "龙安洋", desc: "阳光大男孩", lang: "中文/英文" },
40-
{ voice: "longanhuan", name: "龙安欢", desc: "欢脱元气女", lang: "中文/英文" },
42+
{ voice: "longanhuan_v3", name: "龙安欢", desc: "欢脱元气女", lang: "中文/英文" },
4143
{ voice: "longantai_v3", name: "龙安台", desc: "嗲甜台湾女", lang: "中文/英文" },
4244
{ voice: "longhua_v3", name: "龙华", desc: "元气甜美女", lang: "中文/英文" },
4345
{ voice: "longcheng_v3", name: "龙橙", desc: "智慧青年男", lang: "中文/英文" },
@@ -121,12 +123,14 @@ function printVoiceList(model: string): void {
121123
const voices = MODEL_VOICES[model];
122124
if (!voices) {
123125
process.stdout.write(`No built-in voice list available for model: ${model}\n`);
126+
process.stdout.write(`Browse voices in the console: ${VOICE_TTS_PAGE}\n`);
124127
return;
125128
}
126129
if (voices.length === 0) {
127130
process.stdout.write(`Model ${model} has no system voices.\n`);
128131
process.stdout.write("Use clone or design voices created via the CosyVoice API.\n");
129132
process.stdout.write(`See: ${COSYVOICE_CLONE_DESIGN_DOC}\n`);
133+
process.stdout.write(`Browse voices in the console: ${VOICE_TTS_PAGE}\n`);
130134
return;
131135
}
132136
const col = (s: string, w: number) => s.padEnd(w);
@@ -139,6 +143,7 @@ function printVoiceList(model: string): void {
139143
process.stdout.write(`${col(v.voice, 26)} ${col(v.name, 10)} ${col(v.desc, 16)} ${v.lang}\n`);
140144
}
141145
process.stdout.write(`\nTotal: ${voices.length} voices\n`);
146+
process.stdout.write(`Preview and browse more voices in the console: \n${VOICE_TTS_PAGE}\n`);
142147
}
143148

144149
export default defineCommand({
@@ -156,11 +161,12 @@ export default defineCommand({
156161
{
157162
flag: "--voice <voice>",
158163
description:
159-
"Voice ID. Use --list-voices to see system voices for cosyvoice-v3-flash; for v3.5-flash provide a clone/design voice ID",
164+
"Voice ID. Use --list-voices to see built-in voices for cosyvoice-v3-flash; for v3.5-flash provide a clone/design voice ID",
160165
},
161166
{
162167
flag: "--list-voices",
163-
description: "List available system voices for the selected model and exit",
168+
description:
169+
"List built-in system voices for the selected model and exit (console link shown in output)",
164170
},
165171
{ flag: "--format <format>", description: "Audio format: mp3, pcm, wav, opus (default: mp3)" },
166172
{ flag: "--sample-rate <rate>", description: "Audio sample rate in Hz (e.g. 24000)" },
@@ -264,7 +270,7 @@ export default defineCommand({
264270
const modelVoices = MODEL_VOICES[model];
265271
if (modelVoices && modelVoices.length > 0) {
266272
throw new BailianError(
267-
`--voice is required.\nRun the following to see available voices:\n bl speech synthesize --list-voices --model ${model}`,
273+
`--voice is required.\nRun the following to see available voices:\n bl speech synthesize --list-voices --model ${model}\nBrowse more voices: ${VOICE_TTS_PAGE}`,
268274
ExitCode.USAGE,
269275
);
270276
} else {

packages/cli/src/urls.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,6 @@ export const BAILIAN_CONSOLE = `${BAILIAN_CONSOLE_ROOT}/cn-beijing`;
1414

1515
/** Direct deep link to API key management page. */
1616
export const API_KEY_PAGE = `${BAILIAN_CONSOLE}/?tab=app#/api-key`;
17+
18+
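/** Voice list page on the Alibaba Cloud help site (help.aliyun.com) — browse system and custom voices. NOTE(review): CLI messages call this "the console", but the URL is a documentation page; confirm the intended target. */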
19+
export const VOICE_TTS_PAGE = "https://help.aliyun.com/zh/model-studio/cosyvoice-voice-list";

packages/cli/tests/e2e/omni.e2e.test.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,15 @@ describe("e2e: omni", () => {
1515
expect(exitCode, stderr).toBe(0);
1616
expect(stderr).toMatch(/omni|--message|--audio|text-only/i);
1717
});
18+
19+
// E2E: running the CLI with `omni --list-voices` must exit with code 0 and
// print the voice table to stdout.
test("omni --list-voices 输出音色列表并退出", async () => {
20+
const { stdout, stderr, exitCode } = await runCli(["omni", "--list-voices"]);
21+
// stderr is passed as the assertion message so a failing exit code shows the CLI's error output.
expect(exitCode, stderr).toBe(0);
22+
expect(stdout).toMatch(/Omni output voices:/);
23+
expect(stdout).toMatch(/Dylan/);
24+
expect(stdout).toMatch(/Cherry/);
25+
// Hard-coded count: must be updated whenever the voice table in commands/omni/chat.ts changes size.
expect(stdout).toMatch(/Total: 17 voices/);
26+
});
1827
});
1928

2029
describe.skipIf(!isBailianE2EMediaEnabled() || !isDashScopeE2EReady())(

skills/bailian-cli/reference/omni.md

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,23 +23,28 @@ Index: [index.md](index.md)
2323

2424
#### Options
2525

26-
| Flag | Type | Required | Description |
27-
| ---------------------- | ------- | -------- | ------------------------------------------------------------------------------------ |
28-
| `--message <text>` | array | yes | Message text (repeatable, prefix role: to set role) |
29-
| `--model <model>` | string | no | Model ID (default: qwen3.5-omni-plus) |
30-
| `--system <text>` | string | no | System prompt |
31-
| `--image <url>` | array | no | Image URL or local file (repeatable) |
32-
| `--audio <url>` | array | no | Audio URL or local file (.wav/.mp3/.amr/.aac/.m4a/.ogg/.3gp/.3gpp) |
33-
| `--video <url>` | array | no | Video file URL / local path, or comma-separated frame URLs |
34-
| `--voice <voice>` | string | no | Output voice (default: Cherry). Options: Chelsie, Cherry, Ethan, Serena, Sunny, Tina |
35-
| `--audio-format <fmt>` | string | no | Audio output format (default: wav) |
36-
| `--audio-out <path>` | string | no | Save audio to file (default: auto-generate) |
37-
| `--text-only` | boolean | no | Output text only, no audio generation |
38-
| `--max-tokens <n>` | number | no | Maximum tokens to generate |
39-
| `--temperature <n>` | number | no | Sampling temperature (0.0, 2.0] |
26+
| Flag | Type | Required | Description |
27+
| ---------------------- | ------- | -------- | ----------------------------------------------------------------------- |
28+
| `--message <text>` | array | yes | Message text (repeatable, prefix role: to set role) |
29+
| `--model <model>` | string | no | Model ID (default: qwen3.5-omni-plus) |
30+
| `--system <text>` | string | no | System prompt |
31+
| `--image <url>` | array | no | Image URL or local file (repeatable) |
32+
| `--audio <url>` | array | no | Audio URL or local file (.wav/.mp3/.amr/.aac/.m4a/.ogg/.3gp/.3gpp) |
33+
| `--video <url>` | array | no | Video file URL / local path, or comma-separated frame URLs |
34+
| `--voice <voice>` | string | no | Output voice ID (default: Cherry). Use --list-voices to see all options |
35+
| `--list-voices` | boolean | no | List available output voices and exit |
36+
| `--audio-format <fmt>` | string | no | Audio output format (default: wav) |
37+
| `--audio-out <path>` | string | no | Save audio to file (default: auto-generate) |
38+
| `--text-only` | boolean | no | Output text only, no audio generation |
39+
| `--max-tokens <n>` | number | no | Maximum tokens to generate |
40+
| `--temperature <n>` | number | no | Sampling temperature (0.0, 2.0] |
4041

4142
#### Examples
4243

44+
```bash
45+
bl omni --list-voices
46+
```
47+
4348
```bash
4449
bl omni --message "Hello, who are you?"
4550
```

skills/bailian-cli/reference/speech.md

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -77,24 +77,24 @@ bl speech recognize --url https://example.com/audio.mp3 --no-wait --quiet
7777

7878
#### Options
7979

80-
| Flag | Type | Required | Description |
81-
| ---------------------- | ------- | -------- | ----------------------------------------------------------------------------------------------------------------------- |
82-
| `--text <text>` | string | yes | Text to synthesize into speech |
83-
| `--text-file <path>` | string | no | Read text from a file instead of --text |
84-
| `--model <model>` | string | no | Model ID (default: cosyvoice-v3-flash). System voices available for cosyvoice-v3-flash |
85-
| `--voice <voice>` | string | no | Voice ID. Use --list-voices to see system voices for cosyvoice-v3-flash; for v3.5-flash provide a clone/design voice ID |
86-
| `--list-voices` | boolean | no | List available system voices for the selected model and exit |
87-
| `--format <format>` | string | no | Audio format: mp3, pcm, wav, opus (default: mp3) |
88-
| `--sample-rate <rate>` | string | no | Audio sample rate in Hz (e.g. 24000) |
89-
| `--volume <volume>` | string | no | Volume 0-100 (default: 50) |
90-
| `--rate <rate>` | string | no | Speech rate 0.5-2.0 (default: 1.0) |
91-
| `--pitch <pitch>` | string | no | Pitch multiplier 0.5-2.0 (default: 1.0) |
92-
| `--seed <seed>` | string | no | Random seed 0-65535 for reproducible synthesis |
93-
| `--language <lang>` | string | no | Language hint (e.g. zh, en, ja, ko, fr, de) |
94-
| `--instruction <text>` | string | no | Natural language instruction to control speech style (e.g. "Use a gentle tone") |
95-
| `--enable-ssml` | boolean | no | Enable SSML markup parsing in input text |
96-
| `--out <path>` | string | no | Save audio to file (default: auto-generate in temp dir) |
97-
| `--stream` | boolean | no | Stream raw PCM audio to stdout (pipe to player) |
80+
| Flag | Type | Required | Description |
81+
| ---------------------- | ------- | -------- | ------------------------------------------------------------------------------------------------------------------------- |
82+
| `--text <text>` | string | yes | Text to synthesize into speech |
83+
| `--text-file <path>` | string | no | Read text from a file instead of --text |
84+
| `--model <model>` | string | no | Model ID (default: cosyvoice-v3-flash). System voices available for cosyvoice-v3-flash |
85+
| `--voice <voice>` | string | no | Voice ID. Use --list-voices to see built-in voices for cosyvoice-v3-flash; for v3.5-flash provide a clone/design voice ID |
86+
| `--list-voices` | boolean | no | List built-in system voices for the selected model and exit (console link shown in output) |
87+
| `--format <format>` | string | no | Audio format: mp3, pcm, wav, opus (default: mp3) |
88+
| `--sample-rate <rate>` | string | no | Audio sample rate in Hz (e.g. 24000) |
89+
| `--volume <volume>` | string | no | Volume 0-100 (default: 50) |
90+
| `--rate <rate>` | string | no | Speech rate 0.5-2.0 (default: 1.0) |
91+
| `--pitch <pitch>` | string | no | Pitch multiplier 0.5-2.0 (default: 1.0) |
92+
| `--seed <seed>` | string | no | Random seed 0-65535 for reproducible synthesis |
93+
| `--language <lang>` | string | no | Language hint (e.g. zh, en, ja, ko, fr, de) |
94+
| `--instruction <text>` | string | no | Natural language instruction to control speech style (e.g. "Use a gentle tone") |
95+
| `--enable-ssml` | boolean | no | Enable SSML markup parsing in input text |
96+
| `--out <path>` | string | no | Save audio to file (default: auto-generate in temp dir) |
97+
| `--stream` | boolean | no | Stream raw PCM audio to stdout (pipe to player) |
9898

9999
#### Examples
100100

0 commit comments

Comments
 (0)