diff --git a/benchmarks/paddleocr_vl/PaddleOCR-VL-1_5_fastdeploy.yaml b/benchmarks/paddleocr_vl/PaddleOCR-VL-1_5_fastdeploy.yaml new file mode 100644 index 00000000000..8f90d2cd410 --- /dev/null +++ b/benchmarks/paddleocr_vl/PaddleOCR-VL-1_5_fastdeploy.yaml @@ -0,0 +1,83 @@ + +pipeline_name: PaddleOCR-VL + +batch_size: 64 + +use_queues: True + +use_doc_preprocessor: False +use_layout_detection: True +use_chart_recognition: False +use_seal_recognition: False +use_polygon_points: True +format_block_content: False +merge_layout_blocks: True +markdown_ignore_labels: + - number + - footnote + - header + - header_image + - footer + - footer_image + - aside_text + +SubModules: + LayoutDetection: + module_name: layout_detection + model_name: PP-DocLayoutV3 + model_dir: /root/.paddlex/official_models/PP-DocLayoutV3 + batch_size: 8 + threshold: 0.3 + layout_nms: True + layout_unclip_ratio: [1.0, 1.0] + layout_merge_bboxes_mode: + 0: "union" # abstract + 1: "union" # algorithm + 2: "union" # aside_text + 3: "large" # chart + 4: "union" # content + 5: "large" # display_formula + 6: "large" # doc_title + 7: "union" # figure_title + 8: "union" # footer + 9: "union" # footer + 10: "union" # footnote + 11: "union" # formula_number + 12: "union" # header + 13: "union" # header + 14: "union" # image + 15: "large" # inline_formula + 16: "union" # number + 17: "large" # paragraph_title + 18: "union" # reference + 19: "union" # reference_content + 20: "union" # seal + 21: "union" # table + 22: "union" # text + 23: "union" # text + 24: "union" # vision_footnote + VLRecognition: + module_name: vl_recognition + model_name: PaddleOCR-VL-1.5-0.9B + model_dir: /mnt/moark-models/PaddleOCR-VL-1.5 + batch_size: 4096 + genai_config: + backend: fastdeploy-server + server_url: http://0.0.0.0:8300/v1/ + +SubPipelines: + DocPreprocessor: + pipeline_name: doc_preprocessor + batch_size: 8 + use_doc_orientation_classify: True + use_doc_unwarping: True + SubModules: + DocOrientationClassify: + module_name: doc_text_orientation + model_name: PP-LCNet_x1_0_doc_ori + model_dir: null + batch_size: 8 + DocUnwarping: + module_name: image_unwarping + model_name: UVDoc + model_dir: null \ No newline at end of file diff --git a/benchmarks/paddleocr_vl/ocr_benchmark.py b/benchmarks/paddleocr_vl/ocr_benchmark.py new file mode 100644 index 00000000000..91c80348ae4 --- /dev/null +++ b/benchmarks/paddleocr_vl/ocr_benchmark.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python + +import argparse +import glob +import json +import os +import sys +import time +import uuid +import multiprocessing as mp +from pathlib import Path +from threading import Thread, Event +from operator import itemgetter + +import tiktoken +from tqdm import tqdm +from pymxsml.mxsml_extension import * + +import sys +import traceback + +import paddle.profiler as profiler +# 全局变量:记录处理失败的文件 +error_files = [] + +# ====================== +# 1. 基础工具与全局配置 +# ====================== + +# encoding = tiktoken.get_encoding("cl100k_base") + +def get_curr_time(): + return time.perf_counter() + +# ====================== +# 2. Worker 进程逻辑 +# ====================== +def worker_loop(worker_idx, gpu_id, config_path, device, shared_task_q, result_q): + """ + 每个 Worker 进程初始化一次 Pipeline,随后抢占式获取任务。 + """ + try: + # 隔离当前进程可见的 GPU 资源 + os.environ["MACA_VISIBLE_DEVICES"] = str(gpu_id) + from paddlex import create_pipeline + + # 初始化 Pipeline (假定为 FastDeploy 后端) + pipeline = create_pipeline(config_path, device=device) + # pipeline = create_pipeline(config_path, device="cpu") + result_q.put(("READY", worker_idx)) + + + except Exception as e: + result_q.put(("ERR", worker_idx, f"Initialization Failed: {repr(e)}")) + return + + # Phase-stagger workers so their per-task submission waves interleave instead of + # aligning. Aligned waves overload the VL server in bursts (queue spikes) then leave + # it idle (GPU troughs); interleaving keeps the server steadily fed. Off by default. + stagger = float(os.environ.get("FD_WORKER_STAGGER", "0") or "0") + if stagger > 0: + time.sleep(worker_idx * stagger) + + while True: + msg = shared_task_q.get() + if msg is None: # 收到 None 代表所有任务完成 + break + + tag, job_id, batch_paths = msg + start_t = get_curr_time() + + try: + # 执行推理:直接传入路径列表 + results = list(pipeline.predict(input=batch_paths)) + print(f"Batch {job_id} processed {len(results)} pages") + markdown_list = [] + for res in results: + # 提取 Markdown 文本 + res_dict = res._to_markdown(pretty=False) + markdown_list.append(res_dict["markdown_texts"]) + + full_markdown = "\n\n".join(markdown_list) + end_t = get_curr_time() + + + # 返回详细结果数据用于主进程统计 + task_info = { + "id": job_id, + "start_time": start_t, + "end_time": end_t, + "successful": True, + "processed_pages": len(results), + # "generated_tokens": len(encoding.encode(full_markdown)), + "markdown": full_markdown + } + result_q.put(("DONE", task_info)) + + except Exception as e: + # 方式1:获取完整的栈信息字符串(推荐) + error_trace = traceback.format_exc() + result_q.put(("DONE", { + "id": job_id, + "start_time": start_t, + "end_time": get_curr_time(), + "successful": False, + "error": str(e), + "traceback": error_trace + })) + +# ====================== +# 3. 监控与任务分发 +# ====================== +def monitor_device(gpu_ids, gpu_metrics_list, stop_event): + """ + 独立线程:采样所有指定 GPU 的性能指标。 + """ + try: + mxSmlExInit() + handles = [mxSmlExDeviceGetHandleByIndex(gid) for gid in gpu_ids] + while not stop_event.is_set(): + sample = {"ts": time.time(), "per_gpu": []} + for gid, handle in zip(gpu_ids, handles): + util = mxSmlExDeviceGetUtilizationRates(handle).gpu + mem = mxSmlExDeviceGetMemoryInfo(handle).used + sample["per_gpu"].append({"id": gid, "util": util, "mem": mem}) + gpu_metrics_list.append(sample) + time.sleep(0.5) + except Exception as e: + print(f"Monitor Thread Error: {e}") + finally: + mxSmlExShutdown() + +def task_producer(all_paths, batch_size, shared_task_q): + """ + 生产者线程:将路径列表切片并塞入任务队列。 + """ + for i in range(0, len(all_paths), batch_size): + batch = all_paths[i : i + batch_size] + # 任务元组:(TAG, 唯一ID, 路径列表) + shared_task_q.put(("RUN", uuid.uuid4().hex, batch)) + +# ====================== +# 4. 主程序 (统计与协调) +# ====================== +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Multi-GPU Pipeline Parallel Benchmark") + parser.add_argument("--input_dirs", type=str, default="/external/ai2/models/squad3/model/dataset/omni1_5/pdfs") + parser.add_argument("-b", "--batch_size", type=int, default=0, + help="Files per task. <=0 means auto (ceil(total/num_workers), one big batch per worker). " + "A small value (e.g. 4) creates many more tasks than workers so workers pull " + "dynamically, desync their pipeline phases, and feed the VL server a smoother stream.") + parser.add_argument("-o", "--output_path", type=str, default="benchmark_results.json") + parser.add_argument("--device", type=str, default="metax_gpu") + parser.add_argument("--paddlex_config_path", type=str, default="PaddleOCR-VL-1_5_fastdeploy.yaml") + parser.add_argument("--gpu_ids", type=int, nargs="+", default=[0]) + parser.add_argument("--process_per_gpu", type=int, default=3, help="每个 GPU 启动的并发进程数") + args = parser.parse_args() + + # 扫描输入路径 (支持单路径及递归匹配) + all_input_paths = [] + input_dirs = [args.input_dirs] if isinstance(args.input_dirs, str) else args.input_dirs + for d in input_dirs: + # 只选取 PDF 文件,避免目录或其它杂质干扰 pipeline + all_input_paths += glob.glob(os.path.join(d, "*.pdf")) + all_input_paths.sort() + print(f"Found {len(all_input_paths)} PDF files: {all_input_paths}") + if not all_input_paths: + print(f"Error: No PDF files found in {args.input_dirs}") + sys.exit(1) + + total_files = len(all_input_paths) + num_workers = len(args.gpu_ids) * args.process_per_gpu + # batch_size<=0 => auto (one big batch per worker, original behavior). + # A small explicit batch_size decouples task granularity from worker count: many more + # tasks than workers => dynamic pull-based load balancing + desynced pipeline phases => + # the VL server receives a smoother, more continuous request stream (less sawtooth). + if args.batch_size <= 0: + # 使用 (a + b - 1) // b 这种技巧来实现向上取整 + args.batch_size = (total_files + num_workers - 1) // num_workers + args.batch_size = max(1, min(args.batch_size, total_files)) + total_batches = (total_files + args.batch_size - 1) // args.batch_size + + # 启动并行环境 + mp_ctx = mp.get_context("spawn") + # 设置背压保护:限制任务队列长度,防止内存因路径列表积压过多 + total_workers = len(args.gpu_ids) * args.process_per_gpu + shared_task_q = mp_ctx.Queue(maxsize=num_workers * 2) + result_q = mp_ctx.Queue() + + workers = [] + print(f"[System] Initializing {total_workers} parallel pipelines on GPU(s) {args.gpu_ids}...") + + for i in range(total_workers): + gid = args.gpu_ids[i % len(args.gpu_ids)] + p = mp_ctx.Process( + target=worker_loop, + args=(i, gid, args.paddlex_config_path, args.device, shared_task_q, result_q) + ) + p.start() + workers.append(p) + + # 等待所有 Worker 初始化 + ready_count = 0 + while ready_count < total_workers: + msg = result_q.get() + if msg[0] == "READY": + ready_count += 1 + elif msg[0] == "ERR": + print(f"[FATAL] Worker {msg[1]} failed: {msg[2]}") + for p in workers: p.terminate() + sys.exit(2) + + # 启动性能监控与生产者 + gpu_metrics_list = [] + stop_monitor = Event() + monitor_thread = Thread(target=monitor_device, args=(args.gpu_ids, gpu_metrics_list, stop_monitor), daemon=True) + monitor_thread.start() + + producer = Thread(target=task_producer, args=(all_input_paths, args.batch_size, shared_task_q), daemon=True) + producer.start() + + # 结果收集与汇总 + task_info_list = [] + start_time = get_curr_time() + + print(f"[Processing] {total_files} files | {total_batches} batches | batch_size={args.batch_size}") + + + with open("generated_markdown.md", "w", encoding="utf-8") as f: + pbar = tqdm(total=total_batches, desc="Inference Progress") + batches_done = 0 + while batches_done < total_batches: + msg = result_q.get() + if msg[0] == "DONE": + info = msg[1] + if info["successful"]: + f.write(info["markdown"]) + f.write("\n\n") + task_info_list.append(info) + else: + pbar.write(f"\n[Warning] Task {info['id']} failed: {info.get('error')} | traceback: {info.get('traceback', '无栈信息')}") + + batches_done += 1 + pbar.update(1) + pbar.close() + + end_time = get_curr_time() + + # 清理现场 + stop_monitor.set() + # 放入哨兵信号,通知 Worker 退出 + for _ in range(total_workers): shared_task_q.put(None) + for w in workers: w.join(timeout=5) + monitor_thread.join(timeout=2) + + # ====================== + # 5. Summary & 统计输出 + # ====================== + duration = end_time - start_time + throughput_file = total_files / duration + + latencies = [info["end_time"] - info["start_time"] for info in task_info_list if info["successful"]] + avg_latency_batch = sum(latencies) / len(latencies) if latencies else 0 + + print("\n" + "="*50) + print(f"{'BENCHMARK SUMMARY':^50}") + print("="*50) + print(f"Total Files: {total_files}") + print(f"Throughput: {throughput_file:.4f} files/sec") + print(f"Total Execution Time: {duration:.2f} sec") + print(f"Avg Batch Latency: {avg_latency_batch:.4f} sec") + + successful_batches = len(task_info_list) + if successful_batches > 0: + processed_pages = sum(info.get("processed_pages", 0) for info in task_info_list) + throughput_page = processed_pages / duration + print(f"Total Processed Pages: {processed_pages}") + print(f"Page Throughput: {throughput_page:.4f} pages/sec") + + generated_tokens = sum(info.get("generated_tokens", 0) for info in task_info_list) + throughput_token = generated_tokens / duration + print(f"Total Generated Tokens: {generated_tokens}") + print(f"Token Throughput: {throughput_token:.1f} tokens/sec") + + if gpu_metrics_list: + # 计算系统级的平均和峰值 + all_utils = [sum(g['util'] for g in sample['per_gpu'])/len(args.gpu_ids) for sample in gpu_metrics_list] + all_mems = [sum(g['mem'] for g in sample['per_gpu']) for sample in gpu_metrics_list] + + print("-" * 50) + print(f"System GPU Utilization (%): Peak={max(all_utils):.1f}, Avg={sum(all_utils)/len(all_utils):.1f}") + print(f"System GPU Memory (MB): Peak={max(all_mems)/1024**2:.1f}, Avg={sum(all_mems)/len(gpu_metrics_list)/1024**2:.1f}") + print("="*50) + + # 导出 JSON 报告 + final_report = { + "config": vars(args), + "metrics": { + "total_files": total_files, + "throughput_file": throughput_file, + "avg_latency_batch": avg_latency_batch, + "processed_pages": processed_pages if successful_batches > 0 else 0, + "throughput_page": throughput_page if successful_batches > 0 else 0, + "generated_tokens": generated_tokens if successful_batches > 0 else 0, + "throughput_token": throughput_token if successful_batches > 0 else 0, + }, + "gpu_history": gpu_metrics_list + } + with open(args.output_path, "w", encoding="utf-8") as f: + json.dump(final_report, f, ensure_ascii=False, indent=2) + print(f"\n[Done] Results saved to {args.output_path}") + diff --git a/custom_ops/metax_ops/apply_rope_qkv.cu b/custom_ops/metax_ops/apply_rope_qkv.cu index 3c7679e077e..50b1cdb9160 100644 --- a/custom_ops/metax_ops/apply_rope_qkv.cu +++ b/custom_ops/metax_ops/apply_rope_qkv.cu @@ -50,6 +50,7 @@ struct Converter<__nv_bfloat16> { struct ApplyRopeQKVParams { int head_dim; + int half_dim; // head_dim / 2, used by rotate-half convention int token_stride; int head_stride; int q_stride; @@ -61,37 +62,64 @@ struct ApplyRopeQKVParams { int kv_head_num; }; +// Rotate-half convention (used by SigLIP vision encoder and most HF models): +// for i < half_dim: out[i] = x[i] * cos[i] - x[i + half_dim] * sin[i] +// for i >= half_dim: out[i] = x[i] * cos[i] + x[i - half_dim] * sin[i] +// Equivalent to `x * cos + concat([-x[half:], x[:half]]) * sin`. +// Each Vec4 at head_dim_idx loads its 4-element main chunk AND a paired Vec4 +// from the other half, then applies the rotate-half formula element-wise. template -__device__ __forceinline__ void RotateQKVec4(const T* qkv_ptr, - const T* rot_cos_ptr, - const T* rot_sin_ptr, - const int load_idx, - const int store_idx, - const int rot_base_idx, - T* out) { +__device__ __forceinline__ void RotateQKVec4HalfStyle(const T* qkv_ptr, + const T* rot_cos_ptr, + const T* rot_sin_ptr, + const int load_idx, + const int store_idx, + const int rot_base_idx, + const int head_dim_idx, + const int half_dim, + T* out) { using VecT = AlignedVector; VecT qk_vec; Load(qkv_ptr + load_idx, &qk_vec); - VecT rot_half_vec = {-qk_vec[1], qk_vec[0], -qk_vec[3], qk_vec[2]}; + // Load the paired Vec4 from the other half of head_dim for rotate_half. + // pair_load = load_idx + half_dim if first half, load_idx - half_dim if second. + const int pair_load = + load_idx + ((head_dim_idx < half_dim) ? half_dim : -half_dim); + VecT pair_vec; + Load(qkv_ptr + pair_load, &pair_vec); + VecT cos_vec, sin_vec; Load(rot_cos_ptr + rot_base_idx, &cos_vec); Load(rot_sin_ptr + rot_base_idx, &sin_vec); + + if (head_dim_idx < half_dim) { + // first half: out = x * cos - pair * sin +#pragma unroll + for (int i = 0; i < 4; ++i) { + *(out + store_idx + i) = + qk_vec[i] * cos_vec[i] - pair_vec[i] * sin_vec[i]; + } + } else { + // second half: out = x * cos + pair * sin #pragma unroll - for (int i = 0; i < 4; ++i) { - *(out + store_idx + i) = - qk_vec[i] * cos_vec[i] + rot_half_vec[i] * sin_vec[i]; + for (int i = 0; i < 4; ++i) { + *(out + store_idx + i) = + qk_vec[i] * cos_vec[i] + pair_vec[i] * sin_vec[i]; + } } } template -__device__ __forceinline__ void RotateQKVec4(const T* qkv_ptr, - const float* rot_cos_ptr, - const float* rot_sin_ptr, - const int load_idx, - const int store_idx, - const int rot_base_idx, - T* out) { +__device__ __forceinline__ void RotateQKVec4HalfStyle(const T* qkv_ptr, + const float* rot_cos_ptr, + const float* rot_sin_ptr, + const int load_idx, + const int store_idx, + const int rot_base_idx, + const int head_dim_idx, + const int half_dim, + T* out) { using VecT = AlignedVector; using VecF = AlignedVector; auto to_float = [] __device__(T val) -> float { @@ -103,17 +131,27 @@ __device__ __forceinline__ void RotateQKVec4(const T* qkv_ptr, VecT qk_vec; Load(qkv_ptr + load_idx, &qk_vec); - VecF rot_half_vec = {-to_float(qk_vec[1]), - to_float(qk_vec[0]), - -to_float(qk_vec[3]), - to_float(qk_vec[2])}; + const int pair_load = + load_idx + ((head_dim_idx < half_dim) ? half_dim : -half_dim); + VecT pair_vec; + Load(qkv_ptr + pair_load, &pair_vec); + VecF cos_vec, sin_vec; Load(rot_cos_ptr + rot_base_idx, &cos_vec); Load(rot_sin_ptr + rot_base_idx, &sin_vec); + + if (head_dim_idx < half_dim) { +#pragma unroll + for (int i = 0; i < 4; ++i) { + *(out + store_idx + i) = from_float(to_float(qk_vec[i]) * cos_vec[i] - + to_float(pair_vec[i]) * sin_vec[i]); + } + } else { #pragma unroll - for (int i = 0; i < 4; ++i) { - *(out + store_idx + i) = from_float(to_float(qk_vec[i]) * cos_vec[i] + - rot_half_vec[i] * sin_vec[i]); + for (int i = 0; i < 4; ++i) { + *(out + store_idx + i) = from_float(to_float(qk_vec[i]) * cos_vec[i] + + to_float(pair_vec[i]) * sin_vec[i]); + } } } @@ -148,7 +186,8 @@ __global__ void DispatchApplyRopeQKVVec4Kernel(const T* qkv, head_dim_idx; store_idx = token_idx * param.q_stride + head_idx * param.head_dim + head_dim_idx; - RotateQKVec4(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx, q_out); + RotateQKVec4HalfStyle(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx, + head_dim_idx, param.half_dim, q_out); } if (head_idx < param.kv_head_num && head_dim_idx < param.head_dim) { // kv @@ -157,7 +196,8 @@ __global__ void DispatchApplyRopeQKVVec4Kernel(const T* qkv, head_dim_idx; store_idx = token_idx * param.kv_stride + head_idx * param.head_dim + head_dim_idx; - RotateQKVec4(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx, k_out); + RotateQKVec4HalfStyle(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx, + head_dim_idx, param.half_dim, k_out); load_idx = token_idx * param.token_stride + (head_idx + param.v_head_offset) * param.head_stride + head_dim_idx; @@ -196,6 +236,7 @@ void ApplyRopeQKVKernel(const paddle::Tensor& qkv, ApplyRopeQKVParams param; param.head_dim = head_dim; + param.half_dim = head_dim / 2; param.token_stride = all_num_head * head_dim; param.head_stride = head_dim; param.q_stride = q_head_num * head_dim; diff --git a/fastdeploy/input/paddleocr_vl_processor/image_processor.py b/fastdeploy/input/paddleocr_vl_processor/image_processor.py index 8e333d5bf96..dd365234ab3 100644 --- a/fastdeploy/input/paddleocr_vl_processor/image_processor.py +++ b/fastdeploy/input/paddleocr_vl_processor/image_processor.py @@ -99,9 +99,9 @@ def smart_resize( height = round((height * factor) / width) width = factor - if max(height, width) / min(height, width) > 200: + if max(height, width) / min(height, width) > 300: raise ValueError( - f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}" + f"absolute aspect ratio must be smaller than 300, got {max(height, width) / min(height, width)}" ) h_bar = round(height / factor) * factor w_bar = round(width / factor) * factor diff --git a/fastdeploy/model_executor/models/paddleocr_vl/siglip.py b/fastdeploy/model_executor/models/paddleocr_vl/siglip.py index 82d3eb09e95..685bcc75440 100644 --- a/fastdeploy/model_executor/models/paddleocr_vl/siglip.py +++ b/fastdeploy/model_executor/models/paddleocr_vl/siglip.py @@ -26,7 +26,7 @@ from fastdeploy.model_executor.utils import h2d_copy, slice_fn from .config import PaddleOCRVisionConfig -from .siglip_ops import get_activation_fn, neox_rope_embedding +from .siglip_ops import _siglip_activation, neox_rope_embedding_eager class SiglipAttention(nn.Layer): @@ -125,7 +125,7 @@ def forward( ): B, seq_length, D = hidden_states.shape qkv = self.qkv_proj(hidden_states) - q, k, v = neox_rope_embedding(qkv, cos_emb, sin_emb, self.num_heads, self.head_dim) + q, k, v = neox_rope_embedding_eager(qkv, cos_emb, sin_emb, self.num_heads, self.head_dim) attn_output = self.flash_attn_func( q, k, @@ -303,7 +303,7 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: hidden_states = self.fc1(hidden_states) - hidden_states = get_activation_fn(self.config.hidden_act)(hidden_states[0]) + hidden_states = _siglip_activation(hidden_states[0]) hidden_states = self.fc2(hidden_states) return hidden_states diff --git a/fastdeploy/model_executor/models/paddleocr_vl/siglip_ops.py b/fastdeploy/model_executor/models/paddleocr_vl/siglip_ops.py index 33cd502a5ea..4dc449cd557 100644 --- a/fastdeploy/model_executor/models/paddleocr_vl/siglip_ops.py +++ b/fastdeploy/model_executor/models/paddleocr_vl/siglip_ops.py @@ -16,6 +16,8 @@ from typing import List +import os + import paddle from paddleformers.transformers.activations import ACT2FN @@ -23,6 +25,22 @@ if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import fused_neox_rope_embedding, gelu_tanh +elif current_platform.is_maca(): + # Metax C500 (SM 80) has gelu_tanh fused op available, but not fused_neox_rope_embedding. + # Enable gelu_tanh to avoid Python fallback in SigLIP MLP (27 layers × 1 GELU each). + from fastdeploy.model_executor.ops.gpu import gelu_tanh + + # Fused RoPE kernel (rotate-half convention) compiled from apply_rope_qkv.cu. + # Replaces the Python fallback in SigLIP vision encoder (27 layers × 1 RoPE each, + # measured at 27.5% of vision encoder time — see optimization_log.md §0.1.1). + _ROPE_SO = os.path.join(os.path.dirname(__file__), "apply_rope_qkv_pd_.so") + _MACA_FUSED_ROPE_OK = False + if os.path.exists(_ROPE_SO): + try: + paddle.utils.cpp_extension.load_op_meta_info_and_register_op(_ROPE_SO) + _MACA_FUSED_ROPE_OK = True + except Exception: + pass def rotate_half(x): @@ -59,6 +77,30 @@ def native_neox_rope_embedding(qkv, cos, sin, num_heads): return q, k, v +def maca_fused_neox_rope_embedding(qkv, cos, sin, num_heads, head_dim): + """Metax fused RoPE kernel — rotate-half convention. + Input: qkv [B, seq, 3*num_heads*head_dim], cos/sin [seq, 1, head_dim] + Output: q, k, v each [seq, num_heads, head_dim] + """ + B, seq_length, D = qkv.shape + if seq_length == -1: + _, seq_length, _ = paddle.shape(qkv) + # Flatten batch+seq → tokens (kernel expects 2D qkv) + qkv_flat = qkv.reshape([-1, D]) + q_out, k_out, v_out = paddle._C_ops._run_custom_op( + "apply_rope_qkv", + qkv_flat, + cos, + sin, + num_heads, + num_heads, + head_dim, + ) + return q_out, k_out, v_out + + + + jit_unified_marker = paddle.jit.marker.unified if hasattr(paddle.jit.marker, "unified") else lambda fn: fn @@ -68,6 +110,8 @@ def neox_rope_embedding( ) -> List[paddle.Tensor]: if current_platform.is_cuda() and paddle.in_dynamic_mode(): return fused_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads, head_dim) + elif current_platform.is_maca() and _MACA_FUSED_ROPE_OK and paddle.in_dynamic_mode(): + return maca_fused_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads, head_dim) else: return native_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads) @@ -75,9 +119,37 @@ def neox_rope_embedding( @jit_unified_marker def get_activation_fn(hidden_act: str): if hidden_act == "gelu_pytorch_tanh": - if current_platform.is_cuda() and paddle.in_dynamic_mode(): + if (current_platform.is_cuda() or current_platform.is_maca()) and paddle.in_dynamic_mode(): return gelu_tanh else: return ACT2FN["gelu_new"] else: return ACT2FN[hidden_act] + + +# ── eager-mode dispatch cache ───────────────────────────────────── +# Avoid repeated platform / dynamic-mode checks on every encoder-layer +# call. Resolved once on first invocation, then the fast path is a +# single boolean branch (no function calls, no attribute lookups). +_USE_MACA_FUSED_ROPE = ( + current_platform.is_maca() and _MACA_FUSED_ROPE_OK and paddle.in_dynamic_mode() +) +_USE_CUDA_FUSED_ROPE = ( + current_platform.is_cuda() and paddle.in_dynamic_mode() +) + + +def neox_rope_embedding_eager( + qkv, cos_emb, sin_emb, num_heads, head_dim +): + """Hot-path RoPE dispatch — no platform checks, no decorator overhead.""" + if _USE_MACA_FUSED_ROPE: + return maca_fused_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads, head_dim) + elif _USE_CUDA_FUSED_ROPE: + return fused_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads, head_dim) + else: + return native_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads) + + +# Pre-resolved activation for SigLIP MLP (gelu_pytorch_tanh). +_siglip_activation = get_activation_fn("gelu_pytorch_tanh") diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index 77944b3a2cf..a6a0e662c0d 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -1705,7 +1705,7 @@ def _dummy_pooler_run( skip_save_output=True, async_output_queue=self.async_output_queue, think_end_id=self.model_config.think_end_id, - line_break_id=self.model_config.line_break_id, + # line_break_id=self.model_config.line_break_id, ) self.exist_prefill_flag = False return pooler_output @@ -1810,7 +1810,7 @@ def _dummy_sampler_run( skip_save_output=True, async_output_queue=self.async_output_queue, think_end_id=self.model_config.think_end_id, - line_break_id=self.model_config.line_break_id, + # line_break_id=self.model_config.line_break_id, enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0, ) self.exist_prefill_flag = False @@ -2457,7 +2457,7 @@ def _postprocess( skip_save_output=skip_save_output, async_output_queue=self.async_output_queue, think_end_id=self.model_config.think_end_id, - line_break_id=self.model_config.line_break_id, + # line_break_id=self.model_config.line_break_id, enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0, ) diff --git a/scripts/build_rope_kernel.sh b/scripts/build_rope_kernel.sh new file mode 100755 index 00000000000..4f41aa154b5 --- /dev/null +++ b/scripts/build_rope_kernel.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# 独立编译 RoPE kernel,不影响系统环境 +# 用法: bash scripts/build_rope_kernel.sh + +set -e + +echo "==========================================" +echo "RoPE Kernel 编译脚本 (Metax MACA)" +echo "==========================================" + +# 保存原始环境变量 +ORIG_PATH=$PATH +ORIG_LD_LIBRARY_PATH=$LD_LIBRARY_PATH + +# 设置 Metax 编译环境 (仅在当前 shell) +export MACA_PATH=/opt/maca + +# 初始化 cu-bridge (如果不存在) +if [ ! -d ${HOME}/cu-bridge ]; then + echo "[INFO] 初始化 cu-bridge..." + ${MACA_PATH}/tools/cu-bridge/tools/pre_make +fi + +# 设置编译环境变量 +export CUCC_PATH=/opt/maca/tools/cu-bridge +export CUCC_CMAKE_ENTRY=2 +export CUDA_PATH=${HOME}/cu-bridge/CUDA_DIR +export PATH=${CUDA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${ORIG_PATH} +export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${ORIG_LD_LIBRARY_PATH} + +echo "[INFO] 编译环境已设置" +echo " CUDA_PATH: $CUDA_PATH" +echo " CUCC_PATH: $CUCC_PATH" + +# 源文件和输出路径 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" +SRC_FILE="${PROJECT_ROOT}/custom_ops/metax_ops/apply_rope_qkv.cu" +OUTPUT_FILE="${PROJECT_ROOT}/fastdeploy/model_executor/models/paddleocr_vl/apply_rope_qkv_pd_.so" + +echo "[INFO] 源文件: $SRC_FILE" +echo "[INFO] 输出文件: $OUTPUT_FILE" + +# 备份原有的 .so (如果存在) +if [ -f "$OUTPUT_FILE" ]; then + BACKUP_FILE="${OUTPUT_FILE}.backup.$(date +%Y%m%d_%H%M%S)" + echo "[INFO] 备份原有 .so: $BACKUP_FILE" + cp "$OUTPUT_FILE" "$BACKUP_FILE" +fi + +# 使用 paddle 的 cpp_extension 编译 +echo "[INFO] 开始编译..." +cd "$PROJECT_ROOT" + +# 设置 Metax 库的链接标志 +export LDFLAGS="-L/opt/maca/lib -lruntime_cu -lmcruntime -lmctlassEx -lmccompiler" + +python -c " +import os +import sys +import paddle +from paddle.utils.cpp_extension import load + +# 设置 Metax 环境变量 +os.environ['PADDLE_CUSTOM_DEVICE'] = 'metax_gpu' + +source_files = ['${SRC_FILE}'] +extra_include = ['${PROJECT_ROOT}/custom_ops/gpu_ops'] + +try: + op = load( + name='apply_rope_qkv_pd', + sources=source_files, + extra_include_paths=extra_include + ['${PROJECT_ROOT}/custom_ops/third_party/nlohmann_json/include'], + extra_cxx_cflags=['-DPADDLE_WITH_CUSTOM_DEVICE_METAX_GPU', '-DPADDLE_DEV'], + extra_cuda_cflags=['-DPADDLE_WITH_CUSTOM_DEVICE_METAX_GPU', '-DPADDLE_DEV'], + extra_ldflags=['-L/opt/maca/lib', '-lruntime_cu', '-lmcruntime', '-lmctlassEx', '-lmccompiler'], + verbose=False + ) + print('[SUCCESS] 编译成功') + print(f'编译产物: {op}') +except Exception as e: + print(f'[ERROR] 编译失败: {e}') + sys.exit(1) +" + +# 找到编译产物并复制到目标位置 +BUILD_DIR="${HOME}/.cache/paddle_extensions/apply_rope_qkv_pd" + +SO_FILE=$(find "$BUILD_DIR" -name "apply_rope_qkv_pd*.so" -type f 2>/dev/null | head -1) + +if [ -z "$SO_FILE" ]; then + echo "[ERROR] 找不到编译产物 .so 文件" + exit 1 +fi + +echo "[INFO] 找到编译产物: $SO_FILE" +echo "[INFO] 复制到: $OUTPUT_FILE" +cp "$SO_FILE" "$OUTPUT_FILE" + +# 验证 +if [ -f "$OUTPUT_FILE" ]; then + echo "[SUCCESS] RoPE kernel 编译完成" + ls -lh "$OUTPUT_FILE" +else + echo "[ERROR] 复制失败" + exit 1 +fi + +# 恢复环境变量 (虽然脚本结束会自动恢复,但显式恢复更清晰) +export PATH=$ORIG_PATH +export LD_LIBRARY_PATH=$ORIG_LD_LIBRARY_PATH + +echo "==========================================" +echo "编译完成!" +echo "==========================================" diff --git a/scripts/envSetup.sh b/scripts/envSetup.sh new file mode 100755 index 00000000000..9d75fa34f1c --- /dev/null +++ b/scripts/envSetup.sh @@ -0,0 +1,66 @@ +export MACA_PATH=/opt/maca + +if [ ! -d ${HOME}/cu-bridge ]; then + `${MACA_PATH}/tools/cu-bridge/tools/pre_make` +fi + +export CUCC_PATH=/opt/maca/tools/cu-bridge +export CUCC_CMAKE_ENTRY=2 +export CUDA_PATH=${HOME}/cu-bridge/CUDA_DIR +export PATH=${CUDA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${PATH} +export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:$LD_LIBRARY_PATH +export MACA_VISIBLE_DEVICES="0" +export PADDLE_XCCL_BACKEND=metax_gpu +export FLAGS_weight_only_linear_arch=80 +export FD_MOE_BACKEND=cutlass +export ENABLE_V1_KVCACHE_SCHEDULER=1 +export FD_ENC_DEC_BLOCK_NUM=2 +export FD_SAMPLING_CLASS="rejection" # 受编译器升级到 llvm19 影响,top_p_sampling 算子会出现阻塞情况 + +export PYTHONPATH="/data/FastDeploy:${PYTHONPATH:-}" +if [ ! -f /tmp/shm_redirect.so ]; then + cat > /tmp/shm_redirect.c << 'CEOF' +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +static const char* REDIRECT_DIR = "/tmp/shm"; +static char* redirect_path(const char* name) { + static __thread char buf[512]; + if (name[0] == '/') name++; + snprintf(buf, sizeof(buf), "%s/%s", REDIRECT_DIR, name); + return buf; +} +int shm_open(const char *name, int oflag, mode_t mode) { + static int created = 0; + if (!created) { mkdir(REDIRECT_DIR, 01777); created = 1; } + int fd = open(redirect_path(name), oflag, mode); + if (fd >= 0 && (oflag & O_CREAT)) { fchmod(fd, mode); } + return fd; +} +int shm_unlink(const char *name) { return unlink(redirect_path(name)); } +CEOF + gcc -shared -fPIC -o /tmp/shm_redirect.so /tmp/shm_redirect.c -ldl +fi +mkdir -p /tmp/shm && chmod 1777 /tmp/shm + +export LD_PRELOAD="/tmp/shm_redirect.so${LD_PRELOAD:+:$LD_PRELOAD}" + +# Clean stale shared memory from prior crashed runs (both the real /dev/shm and our redirect dir) +rm -f /dev/shm/paddle_* /dev/shm/*signal* /dev/shm/key_caches_* /dev/shm/value_caches_* \ + /dev/shm/__KMP_REGISTERED_LIB_* /dev/shm/sem.mp-* /dev/shm/*.8302 /dev/shm/8300.* \ + /dev/shm/router_* /dev/shm/fmq_* /dev/shm/triton_* 2>/dev/null +rm -f /tmp/shm/paddle_* /tmp/shm/*signal* /tmp/shm/key_caches_* /tmp/shm/value_caches_* \ + /tmp/shm/__KMP_REGISTERED_LIB_* /tmp/shm/sem.mp-* /tmp/shm/*.8302 /tmp/shm/8300.* \ + /tmp/shm/router_* /tmp/shm/fmq_* /tmp/shm/triton_* 2>/dev/null + +# Bypass corp HTTP proxy for localhost so benchmark client can reach FastDeploy server directly +export NO_PROXY="127.0.0.1,localhost,0.0.0.0" +export no_proxy="$NO_PROXY" diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh new file mode 100755 index 00000000000..b5ab24956a2 --- /dev/null +++ b/scripts/run_benchmark.sh @@ -0,0 +1,23 @@ +#!/bin/bash + + +set -euo pipefail + +pkill -9 -f "ocr_benchmark.py" 2>/dev/null || true +sleep 2 + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/envSetup.sh" + +cd /data/FastDeploy/benchmarks/paddleocr_vl + +mkdir -p output + +python3 ocr_benchmark.py \ + --input_dirs /data/OmniDocBench_v1_5/images_128_pdf \ + --paddlex_config_path PaddleOCR-VL-1_5_fastdeploy.yaml \ + --device metax_gpu \ + --gpu_ids 0 \ + --process_per_gpu 5 \ + --batch_size 4 \ + -o output/benchmark_result.json diff --git a/scripts/run_server.sh b/scripts/run_server.sh new file mode 100755 index 00000000000..e24ca1d2880 --- /dev/null +++ b/scripts/run_server.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +source "${SCRIPT_DIR}/envSetup.sh" + +MODEL="/mnt/moark-models/PaddleOCR-VL-1.5" +MACA_VISIBLE_DEVICES="${MACA_VISIBLE_DEVICES:-0}" \ +python3 -m fastdeploy.entrypoints.openai.api_server \ + --model "$MODEL" \ + --max-model-len 16384 \ + --max-num-batched-tokens 16384 \ + --gpu-memory-utilization 0.4 \ + --max-num-seqs 256 \ + --graph-optimization-config '{"use_cudagraph":true,"graph_opt_level":0}' \ + --workers 6 \ + --max-concurrency 4096 \ + --port 8300 \ + --metrics-port 8301 \ + --engine-worker-queue-port 8302