# FastDeploy/mkdocs.yml - MkDocs configuration for the FastDeploy documentation site.
# Site identity and source-repository link.
# The title is spelled exactly like the `en` locale's site_name in the i18n
# plugin configuration so the fallback and the localized title agree.
site_name: 'FastDeploy: Large Language Model Deployment'
repo_url: https://github.com/PaddlePaddle/FastDeploy
repo_name: FastDeploy

# Copyright notice; `&copy;` is an HTML entity, so this value is emitted as HTML.
copyright: Copyright &copy; 2025 Maintained by FastDeploy

# Material for MkDocs theme: branding assets plus a light/dark palette pair.
theme:
  name: material
  # NOTE(review): `highlightjs` is an option of the built-in `mkdocs` theme;
  # confirm whether Material honors it or whether it can be dropped.
  highlightjs: true
  favicon: assets/images/favicon.ico
  logo: assets/images/logo.jpg
  palette:
    # Light mode
    - media: "(prefers-color-scheme: light)"
      scheme: default
      primary: indigo
      accent: indigo
      toggle:
        icon: material/brightness-7
        name: Switch to dark mode
    # Dark mode
    - media: "(prefers-color-scheme: dark)"
      scheme: slate
      primary: black
      accent: indigo
      toggle:
        icon: material/brightness-4
        # Only two palettes are defined (no "(prefers-color-scheme)" auto
        # entry), so this toggle leads back to the light palette.
        name: Switch to light mode

# Plugins: built-in search plus the i18n plugin that builds the English
# (default) and Simplified Chinese variants of the site.
plugins:
  - search
  - i18n:
      docs_structure: folder
      fallback_to_default: true
      reconfigure_material: true
      reconfigure_search: true
      languages:
        - locale: en
          default: true
          name: English
          site_name: 'FastDeploy: Large Language Model Deployment'
          build: true
          link: /FastDeploy/
        - locale: zh
          name: 简体中文
          site_name: 飞桨大语言模型推理部署工具包
          link: /FastDeploy/zh/
          # Keys are the nav titles from the `nav` section, values are their
          # Chinese replacements. YAML mapping keys must be unique, so a title
          # that is used in several nav sections (e.g. ERNIE-4.5-300B-A47B in
          # both Quick Start and Best Practices) is listed exactly once and
          # its translation applies to every occurrence.
          nav_translations:
            FastDeploy: FastDeploy
            # Quick Start
            Quick Start: 快速入门
            Installation: 安装
            Nvidia GPU: 英伟达 GPU
            KunlunXin XPU: 昆仑芯 XPU
            Intel Gaudi: 英特尔 Gaudi
            HYGON DCU: 海光 DCU
            Enflame S60: 燧原 S60
            Iluvatar CoreX: 天数 CoreX
            Metax C550: 沐曦 C550
            Quick Deployment For ERNIE-4.5-0.3B: ERNIE-4.5-0.3B快速部署
            Quick Deployment for ERNIE-4.5-VL-28B-A3B: ERNIE-4.5-VL-28B-A3B快速部署
            Quick Deployment For QWEN: Qwen3-0.6b快速部署
            Quick Deployment For QWEN2.5-VL: Qwen2.5-VL系列快速部署
            # Online Serving
            Online Serving: 在线服务
            OpenAI-Compatible API Server: 兼容 OpenAI 协议的服务化部署
            Monitor Metrics: 监控Metrics
            Scheduler: 调度器
            Graceful Shutdown: 服务优雅关闭
            Load-Balancing Scheduling Router: 负载均衡调度路由
            Offline Inference: 离线推理
            # Observability
            Observability: 可观测性
            Trace: Trace服务
            # CLI
            CLI: CLI 使用说明
            Chat: Chat命令
            Complete: Complete命令
            Server: Server命令
            Bench: Bench命令
            Collect Env: Collect Env命令
            Run Batch: Run Batch命令
            Tokenizer: Tokenizer命令
            # Best Practices (model names are kept untranslated)
            Best Practices: 最佳实践
            ERNIE-4.5-0.3B: ERNIE-4.5-0.3B
            ERNIE-4.5-21B-A3B: ERNIE-4.5-21B-A3B
            ERNIE-4.5-21B-A3B-Thinking: ERNIE-4.5-21B-A3B-Thinking
            ERNIE-4.5-300B-A47B: ERNIE-4.5-300B-A47B
            ERNIE-4.5-VL-28B-A3B: ERNIE-4.5-VL-28B-A3B
            ERNIE-4.5-VL-424B-A47B: ERNIE-4.5-VL-424B-A47B
            PaddleOCR-VL-0.9B: PaddleOCR-VL-0.9B
            FAQ: 常见问题
            # Quantization
            Quantization: 量化
            Overview: 概述
            Online Quantization: 在线量化
            WINT2 Quantization: WINT2量化
            # Features
            Features: 特性
            Prefix Caching: 前缀缓存
            Disaggregation: 分离式部署
            Chunked Prefill: 分块预填充
            Load Balance: 负载均衡
            Speculative Decoding: 投机解码
            Structured Outputs: 结构化输出
            Reasoning Output: 思考链内容
            Early Stop: 早停功能
            Plugins: 插件机制
            PaddleFormers Backend: PaddleFormers 后端
            Sampling: 采样策略
            MultiNode Deployment: 多机部署
            Graph Optimization: 图优化
            Data Parallelism: 数据并行
            PLAS: PLAS
            # Remaining top-level pages and Usage
            Supported Models: 支持模型列表
            Benchmark: 基准测试
            Usage: 用法
            Log Description: 日志说明
            Code Overview: 代码概述
            Environment Variables: 环境变量

# Site navigation tree. Each entry is `Title: path`, or `Title:` followed by a
# nested list of child entries.
# The titles double as lookup keys for the zh locale's `nav_translations` in
# the i18n plugin configuration, so renaming a title here requires renaming
# the matching key there.
nav:
  - FastDeploy: index.md
  # Getting started: installation guides and quick-deployment walkthroughs.
  - Quick Start:
      - Installation:
          - Nvidia GPU: get_started/installation/nvidia_gpu.md
          - KunlunXin XPU: get_started/installation/kunlunxin_xpu.md
          - Intel Gaudi: get_started/installation/intel_gaudi.md
          - HYGON DCU: get_started/installation/hygon_dcu.md
          - Enflame S60: get_started/installation/Enflame_gcu.md
          - Iluvatar CoreX: get_started/installation/iluvatar_gpu.md
          - Metax C550: get_started/installation/metax_gpu.md
      - Quick Deployment For ERNIE-4.5-0.3B: get_started/quick_start.md
      - Quick Deployment for ERNIE-4.5-VL-28B-A3B: get_started/quick_start_vl.md
      # NOTE(review): the next two titles are also used under Best Practices;
      # nav_translations is keyed by title, so both occurrences share a single
      # translation.
      - ERNIE-4.5-300B-A47B: get_started/ernie-4.5.md
      - ERNIE-4.5-VL-424B-A47B: get_started/ernie-4.5-vl.md
      - Quick Deployment For QWEN: get_started/quick_start_qwen.md
      - Quick Deployment For QWEN2.5-VL: get_started/quick_start_qwen25_vl.md
  # Online serving pages.
  - Online Serving:
      - OpenAI-Compatible API Server: online_serving/README.md
      - Monitor Metrics: online_serving/metrics.md
      - Scheduler: online_serving/scheduler.md
      - Graceful Shutdown: online_serving/graceful_shutdown_service.md
      - Load-Balancing Scheduling Router: online_serving/router.md
  - Offline Inference: offline_inference.md
  # One best-practice page per model, plus an FAQ.
  - Best Practices:
      - ERNIE-4.5-0.3B: best_practices/ERNIE-4.5-0.3B-Paddle.md
      - ERNIE-4.5-21B-A3B: best_practices/ERNIE-4.5-21B-A3B-Paddle.md
      - ERNIE-4.5-300B-A47B: best_practices/ERNIE-4.5-300B-A47B-Paddle.md
      - ERNIE-4.5-21B-A3B-Thinking: best_practices/ERNIE-4.5-21B-A3B-Thinking.md
      - ERNIE-4.5-VL-28B-A3B: best_practices/ERNIE-4.5-VL-28B-A3B-Paddle.md
      - ERNIE-4.5-VL-424B-A47B: best_practices/ERNIE-4.5-VL-424B-A47B-Paddle.md
      - PaddleOCR-VL-0.9B: best_practices/PaddleOCR-VL-0.9B.md
      - FAQ: best_practices/FAQ.md
  # Quantization pages.
  - Quantization:
      - Overview: quantization/README.md
      - Online Quantization: quantization/online_quantization.md
      - WINT2 Quantization: quantization/wint2.md
  # One page per feature.
  - Features:
      - Prefix Caching: features/prefix_caching.md
      - Disaggregation: features/disaggregated.md
      - Chunked Prefill: features/chunked_prefill.md
      - Load Balance: features/load_balance.md
      - Speculative Decoding: features/speculative_decoding.md
      - Structured Outputs: features/structured_outputs.md
      - Reasoning Output: features/reasoning_output.md
      - Early Stop: features/early_stop.md
      - Plugins: features/plugins.md
      - PaddleFormers Backend: features/paddleformers_backend.md
      - Sampling: features/sampling.md
      - MultiNode Deployment: features/multi-node_deployment.md
      - Graph Optimization: features/graph_optimization.md
      - Data Parallelism: features/data_parallel_service.md
      - PLAS: features/plas_attention.md
  - Supported Models: supported_models.md
  - Benchmark: benchmark.md
  # Usage pages: logs, code layout, environment variables.
  - Usage:
      - Log Description: usage/log.md
      - Code Overview: usage/code_overview.md
      - Environment Variables: usage/environment_variables.md
  # Command-line interface pages, one per subcommand.
  # NOTE(review): "Overview" is also used under Quantization and shares one
  # translation entry.
  - CLI:
      - Overview: cli/README.md
      - Chat: cli/chat.md
      - Complete: cli/complete.md
      - Server: cli/serve.md
      - Collect Env: cli/collect-env.md
      - Bench: cli/bench.md
      - Run Batch: cli/run-batch.md
      - Tokenizer: cli/tokenizer.md
  # Observability pages.
  - Observability:
      - Trace: observability/trace.md