diff --git a/benchmarks/paddleocr_vl/PaddleOCR-VL-1_5_fastdeploy.yaml b/benchmarks/paddleocr_vl/PaddleOCR-VL-1_5_fastdeploy.yaml
new file mode 100644
index 00000000000..8f90d2cd410
--- /dev/null
+++ b/benchmarks/paddleocr_vl/PaddleOCR-VL-1_5_fastdeploy.yaml
@@ -0,0 +1,83 @@
+
+pipeline_name: PaddleOCR-VL
+
+batch_size: 64
+
+use_queues: True
+
+use_doc_preprocessor: False
+use_layout_detection: True
+use_chart_recognition: False
+use_seal_recognition: False
+use_polygon_points: True
+format_block_content: False
+merge_layout_blocks: True
+markdown_ignore_labels:
+  - number
+  - footnote
+  - header
+  - header_image
+  - footer
+  - footer_image
+  - aside_text
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: PP-DocLayoutV3
+    model_dir: /root/.paddlex/official_models/PP-DocLayoutV3
+    batch_size: 8
+    threshold: 0.3
+    layout_nms: True
+    layout_unclip_ratio: [1.0, 1.0] 
+    layout_merge_bboxes_mode: 
+      0: "union" # abstract
+      1: "union" # algorithm
+      2: "union" # aside_text
+      3: "large" # chart
+      4: "union" # content
+      5: "large" # display_formula
+      6: "large" # doc_title
+      7: "union" # figure_title
+      8: "union" # footer
+      9: "union" # footer
+      10: "union" # footnote
+      11: "union" # formula_number
+      12: "union" # header
+      13: "union" # header
+      14: "union" # image
+      15: "large" # inline_formula
+      16: "union" # number
+      17: "large" # paragraph_title
+      18: "union" # reference
+      19: "union" # reference_content
+      20: "union" # seal
+      21: "union" # table
+      22: "union" # text
+      23: "union" # text
+      24: "union" # vision_footnote
+  VLRecognition:
+    module_name: vl_recognition
+    model_name: PaddleOCR-VL-1.5-0.9B
+    model_dir: /mnt/moark-models/PaddleOCR-VL-1.5
+    batch_size: 4096
+    genai_config:
+      backend: fastdeploy-server
+      server_url: http://0.0.0.0:8300/v1/
+
+SubPipelines:
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    batch_size: 8
+    use_doc_orientation_classify: True
+    use_doc_unwarping: True
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+        batch_size: 8
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null
\ No newline at end of file
diff --git a/benchmarks/paddleocr_vl/ocr_benchmark.py b/benchmarks/paddleocr_vl/ocr_benchmark.py
new file mode 100644
index 00000000000..91c80348ae4
--- /dev/null
+++ b/benchmarks/paddleocr_vl/ocr_benchmark.py
@@ -0,0 +1,312 @@
+#!/usr/bin/env python
+
+import argparse
+import glob
+import json
+import os
+import sys
+import time
+import uuid
+import multiprocessing as mp
+from pathlib import Path
+from threading import Thread, Event
+from operator import itemgetter
+
+import tiktoken
+from tqdm import tqdm
+from pymxsml.mxsml_extension import *
+
+import sys
+import traceback
+
+import paddle.profiler as profiler
+# 全局变量：记录处理失败的文件
+error_files = []
+
+# ======================
+# 1. 基础工具与全局配置
+# ======================
+
+# encoding = tiktoken.get_encoding("cl100k_base")
+
+def get_curr_time():
+    return time.perf_counter()
+
+# ======================
+# 2. Worker 进程逻辑
+# ======================
+def worker_loop(worker_idx, gpu_id, config_path, device, shared_task_q, result_q):
+    """
+    每个 Worker 进程初始化一次 Pipeline，随后抢占式获取任务。
+    """
+    try:
+        # 隔离当前进程可见的 GPU 资源
+        os.environ["MACA_VISIBLE_DEVICES"] = str(gpu_id)
+        from paddlex import create_pipeline
+        
+        # 初始化 Pipeline (假定为 FastDeploy 后端)
+        pipeline = create_pipeline(config_path, device=device)
+        # pipeline = create_pipeline(config_path, device="cpu")
+        result_q.put(("READY", worker_idx))
+
+
+    except Exception as e:
+        result_q.put(("ERR", worker_idx, f"Initialization Failed: {repr(e)}"))
+        return
+
+    # Phase-stagger workers so their per-task submission waves interleave instead of
+    # aligning. Aligned waves overload the VL server in bursts (queue spikes) then leave
+    # it idle (GPU troughs); interleaving keeps the server steadily fed. Off by default.
+    stagger = float(os.environ.get("FD_WORKER_STAGGER", "0") or "0")
+    if stagger > 0:
+        time.sleep(worker_idx * stagger)
+
+    while True:
+        msg = shared_task_q.get()
+        if msg is None:  # 收到 None 代表所有任务完成
+            break
+        
+        tag, job_id, batch_paths = msg
+        start_t = get_curr_time()
+        
+        try:
+            # 执行推理：直接传入路径列表
+            results = list(pipeline.predict(input=batch_paths))
+            print(f"Batch {job_id} processed {len(results)} pages")
+            markdown_list = []
+            for res in results:
+                # 提取 Markdown 文本
+                res_dict = res._to_markdown(pretty=False)
+                markdown_list.append(res_dict["markdown_texts"])
+            
+            full_markdown = "\n\n".join(markdown_list)
+            end_t = get_curr_time()
+            
+
+            # 返回详细结果数据用于主进程统计
+            task_info = {
+                "id": job_id,
+                "start_time": start_t,
+                "end_time": end_t,
+                "successful": True,
+                "processed_pages": len(results),
+                # "generated_tokens": len(encoding.encode(full_markdown)),
+                "markdown": full_markdown
+            }
+            result_q.put(("DONE", task_info))
+            
+        except Exception as e:
+            # 方式1：获取完整的栈信息字符串（推荐）
+            error_trace = traceback.format_exc()
+            result_q.put(("DONE", {
+                "id": job_id,
+                "start_time": start_t,
+                "end_time": get_curr_time(),
+                "successful": False,
+                "error": str(e),
+                "traceback": error_trace
+            }))
+
+# ======================
+# 3. 监控与任务分发
+# ======================
+def monitor_device(gpu_ids, gpu_metrics_list, stop_event):
+    """
+    独立线程：采样所有指定 GPU 的性能指标。
+    """
+    try:
+        mxSmlExInit()
+        handles = [mxSmlExDeviceGetHandleByIndex(gid) for gid in gpu_ids]
+        while not stop_event.is_set():
+            sample = {"ts": time.time(), "per_gpu": []}
+            for gid, handle in zip(gpu_ids, handles):
+                util = mxSmlExDeviceGetUtilizationRates(handle).gpu
+                mem = mxSmlExDeviceGetMemoryInfo(handle).used
+                sample["per_gpu"].append({"id": gid, "util": util, "mem": mem})
+            gpu_metrics_list.append(sample)
+            time.sleep(0.5)
+    except Exception as e:
+        print(f"Monitor Thread Error: {e}")
+    finally:
+        mxSmlExShutdown()
+
+def task_producer(all_paths, batch_size, shared_task_q):
+    """
+    生产者线程：将路径列表切片并塞入任务队列。
+    """
+    for i in range(0, len(all_paths), batch_size):
+        batch = all_paths[i : i + batch_size]
+        # 任务元组：(TAG, 唯一ID, 路径列表)
+        shared_task_q.put(("RUN", uuid.uuid4().hex, batch))
+
+# ======================
+# 4. 主程序 (统计与协调)
+# ======================
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Multi-GPU Pipeline Parallel Benchmark")
+    parser.add_argument("--input_dirs", type=str, default="/external/ai2/models/squad3/model/dataset/omni1_5/pdfs")
+    parser.add_argument("-b", "--batch_size", type=int, default=0,
+                        help="Files per task. <=0 means auto (ceil(total/num_workers), one big batch per worker). "
+                             "A small value (e.g. 4) creates many more tasks than workers so workers pull "
+                             "dynamically, desync their pipeline phases, and feed the VL server a smoother stream.")
+    parser.add_argument("-o", "--output_path", type=str, default="benchmark_results.json")
+    parser.add_argument("--device", type=str, default="metax_gpu")
+    parser.add_argument("--paddlex_config_path", type=str, default="PaddleOCR-VL-1_5_fastdeploy.yaml")
+    parser.add_argument("--gpu_ids", type=int, nargs="+", default=[0])
+    parser.add_argument("--process_per_gpu", type=int, default=3, help="每个 GPU 启动的并发进程数")
+    args = parser.parse_args()
+
+    # 扫描输入路径 (支持单路径及递归匹配)
+    all_input_paths = []
+    input_dirs = [args.input_dirs] if isinstance(args.input_dirs, str) else args.input_dirs
+    for d in input_dirs:
+        # 只选取 PDF 文件，避免目录或其它杂质干扰 pipeline
+        all_input_paths += glob.glob(os.path.join(d, "*.pdf"))
+    all_input_paths.sort()
+    print(f"Found {len(all_input_paths)} PDF files: {all_input_paths}")
+    if not all_input_paths:
+        print(f"Error: No PDF files found in {args.input_dirs}")
+        sys.exit(1)
+
+    total_files = len(all_input_paths)
+    num_workers = len(args.gpu_ids) * args.process_per_gpu
+    # batch_size<=0 => auto (one big batch per worker, original behavior).
+    # A small explicit batch_size decouples task granularity from worker count: many more
+    # tasks than workers => dynamic pull-based load balancing + desynced pipeline phases =>
+    # the VL server receives a smoother, more continuous request stream (less sawtooth).
+    if args.batch_size <= 0:
+        # 使用 (a + b - 1) // b 这种技巧来实现向上取整
+        args.batch_size = (total_files + num_workers - 1) // num_workers
+    args.batch_size = max(1, min(args.batch_size, total_files))
+    total_batches = (total_files + args.batch_size - 1) // args.batch_size
+
+    # 启动并行环境
+    mp_ctx = mp.get_context("spawn")
+    # 设置背压保护：限制任务队列长度，防止内存因路径列表积压过多
+    total_workers = len(args.gpu_ids) * args.process_per_gpu
+    shared_task_q = mp_ctx.Queue(maxsize=num_workers * 2) 
+    result_q = mp_ctx.Queue()
+    
+    workers = []
+    print(f"[System] Initializing {total_workers} parallel pipelines on GPU(s) {args.gpu_ids}...")
+    
+    for i in range(total_workers):
+        gid = args.gpu_ids[i % len(args.gpu_ids)]
+        p = mp_ctx.Process(
+            target=worker_loop,
+            args=(i, gid, args.paddlex_config_path, args.device, shared_task_q, result_q)
+        )
+        p.start()
+        workers.append(p)
+
+    # 等待所有 Worker 初始化
+    ready_count = 0
+    while ready_count < total_workers:
+        msg = result_q.get()
+        if msg[0] == "READY": 
+            ready_count += 1
+        elif msg[0] == "ERR":
+            print(f"[FATAL] Worker {msg[1]} failed: {msg[2]}")
+            for p in workers: p.terminate()
+            sys.exit(2)
+
+    # 启动性能监控与生产者
+    gpu_metrics_list = []
+    stop_monitor = Event()
+    monitor_thread = Thread(target=monitor_device, args=(args.gpu_ids, gpu_metrics_list, stop_monitor), daemon=True)
+    monitor_thread.start()
+
+    producer = Thread(target=task_producer, args=(all_input_paths, args.batch_size, shared_task_q), daemon=True)
+    producer.start()
+
+    # 结果收集与汇总
+    task_info_list = []
+    start_time = get_curr_time()
+    
+    print(f"[Processing] {total_files} files | {total_batches} batches | batch_size={args.batch_size}")
+    
+
+    with open("generated_markdown.md", "w", encoding="utf-8") as f:
+        pbar = tqdm(total=total_batches, desc="Inference Progress")
+        batches_done = 0
+        while batches_done < total_batches:
+            msg = result_q.get()
+            if msg[0] == "DONE":
+                info = msg[1]
+                if info["successful"]:
+                    f.write(info["markdown"])
+                    f.write("\n\n")
+                    task_info_list.append(info)
+                else:
+                    pbar.write(f"\n[Warning] Task {info['id']} failed: {info.get('error')} | traceback: {info.get('traceback', '无栈信息')}")
+                
+                batches_done += 1
+                pbar.update(1)
+        pbar.close()
+
+    end_time = get_curr_time()
+
+    # 清理现场
+    stop_monitor.set()
+    # 放入哨兵信号，通知 Worker 退出
+    for _ in range(total_workers): shared_task_q.put(None)
+    for w in workers: w.join(timeout=5)
+    monitor_thread.join(timeout=2)
+
+    # ======================
+    # 5. Summary & 统计输出
+    # ======================
+    duration = end_time - start_time
+    throughput_file = total_files / duration
+    
+    latencies = [info["end_time"] - info["start_time"] for info in task_info_list if info["successful"]]
+    avg_latency_batch = sum(latencies) / len(latencies) if latencies else 0
+
+    print("\n" + "="*50)
+    print(f"{'BENCHMARK SUMMARY':^50}")
+    print("="*50)
+    print(f"Total Files: {total_files}")
+    print(f"Throughput: {throughput_file:.4f} files/sec")
+    print(f"Total Execution Time: {duration:.2f} sec")
+    print(f"Avg Batch Latency: {avg_latency_batch:.4f} sec")
+
+    successful_batches = len(task_info_list)
+    if successful_batches > 0:
+        processed_pages = sum(info.get("processed_pages", 0) for info in task_info_list)
+        throughput_page = processed_pages / duration
+        print(f"Total Processed Pages: {processed_pages}")
+        print(f"Page Throughput: {throughput_page:.4f} pages/sec")
+        
+        generated_tokens = sum(info.get("generated_tokens", 0) for info in task_info_list)
+        throughput_token = generated_tokens / duration
+        print(f"Total Generated Tokens: {generated_tokens}")
+        print(f"Token Throughput: {throughput_token:.1f} tokens/sec")
+    
+    if gpu_metrics_list:
+        # 计算系统级的平均和峰值
+        all_utils = [sum(g['util'] for g in sample['per_gpu'])/len(args.gpu_ids) for sample in gpu_metrics_list]
+        all_mems = [sum(g['mem'] for g in sample['per_gpu']) for sample in gpu_metrics_list]
+        
+        print("-" * 50)
+        print(f"System GPU Utilization (%): Peak={max(all_utils):.1f}, Avg={sum(all_utils)/len(all_utils):.1f}")
+        print(f"System GPU Memory (MB): Peak={max(all_mems)/1024**2:.1f}, Avg={sum(all_mems)/len(gpu_metrics_list)/1024**2:.1f}")
+    print("="*50)
+
+    # 导出 JSON 报告
+    final_report = {
+        "config": vars(args),
+        "metrics": {
+            "total_files": total_files,
+            "throughput_file": throughput_file,
+            "avg_latency_batch": avg_latency_batch,
+            "processed_pages": processed_pages if successful_batches > 0 else 0,
+            "throughput_page": throughput_page if successful_batches > 0 else 0,
+            "generated_tokens": generated_tokens if successful_batches > 0 else 0,
+            "throughput_token": throughput_token if successful_batches > 0 else 0,
+        },
+        "gpu_history": gpu_metrics_list
+    }
+    with open(args.output_path, "w", encoding="utf-8") as f:
+        json.dump(final_report, f, ensure_ascii=False, indent=2)
+    print(f"\n[Done] Results saved to {args.output_path}")
+
diff --git a/custom_ops/metax_ops/apply_rope_qkv.cu b/custom_ops/metax_ops/apply_rope_qkv.cu
index 3c7679e077e..50b1cdb9160 100644
--- a/custom_ops/metax_ops/apply_rope_qkv.cu
+++ b/custom_ops/metax_ops/apply_rope_qkv.cu
@@ -50,6 +50,7 @@ struct Converter<__nv_bfloat16> {
 
 struct ApplyRopeQKVParams {
   int head_dim;
+  int half_dim;  // head_dim / 2, used by rotate-half convention
   int token_stride;
   int head_stride;
   int q_stride;
@@ -61,37 +62,64 @@ struct ApplyRopeQKVParams {
   int kv_head_num;
 };
 
+// Rotate-half convention (used by SigLIP vision encoder and most HF models):
+//   for i < half_dim:  out[i] = x[i] * cos[i] - x[i + half_dim] * sin[i]
+//   for i >= half_dim: out[i] = x[i] * cos[i] + x[i - half_dim] * sin[i]
+// Equivalent to `x * cos + concat([-x[half:], x[:half]]) * sin`.
+// Each Vec4 at head_dim_idx loads its 4-element main chunk AND a paired Vec4
+// from the other half, then applies the rotate-half formula element-wise.
 template <typename T>
-__device__ __forceinline__ void RotateQKVec4(const T* qkv_ptr,
-                                             const T* rot_cos_ptr,
-                                             const T* rot_sin_ptr,
-                                             const int load_idx,
-                                             const int store_idx,
-                                             const int rot_base_idx,
-                                             T* out) {
+__device__ __forceinline__ void RotateQKVec4HalfStyle(const T* qkv_ptr,
+                                                      const T* rot_cos_ptr,
+                                                      const T* rot_sin_ptr,
+                                                      const int load_idx,
+                                                      const int store_idx,
+                                                      const int rot_base_idx,
+                                                      const int head_dim_idx,
+                                                      const int half_dim,
+                                                      T* out) {
   using VecT = AlignedVector<T, 4>;
 
   VecT qk_vec;
   Load(qkv_ptr + load_idx, &qk_vec);
-  VecT rot_half_vec = {-qk_vec[1], qk_vec[0], -qk_vec[3], qk_vec[2]};
+  // Load the paired Vec4 from the other half of head_dim for rotate_half.
+  // pair_load = load_idx + half_dim if first half, load_idx - half_dim if second.
+  const int pair_load =
+      load_idx + ((head_dim_idx < half_dim) ? half_dim : -half_dim);
+  VecT pair_vec;
+  Load(qkv_ptr + pair_load, &pair_vec);
+
   VecT cos_vec, sin_vec;
   Load(rot_cos_ptr + rot_base_idx, &cos_vec);
   Load(rot_sin_ptr + rot_base_idx, &sin_vec);
+
+  if (head_dim_idx < half_dim) {
+    // first half: out = x * cos - pair * sin
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+      *(out + store_idx + i) =
+          qk_vec[i] * cos_vec[i] - pair_vec[i] * sin_vec[i];
+    }
+  } else {
+    // second half: out = x * cos + pair * sin
 #pragma unroll
-  for (int i = 0; i < 4; ++i) {
-    *(out + store_idx + i) =
-        qk_vec[i] * cos_vec[i] + rot_half_vec[i] * sin_vec[i];
+    for (int i = 0; i < 4; ++i) {
+      *(out + store_idx + i) =
+          qk_vec[i] * cos_vec[i] + pair_vec[i] * sin_vec[i];
+    }
   }
 }
 
 template <typename T>
-__device__ __forceinline__ void RotateQKVec4(const T* qkv_ptr,
-                                             const float* rot_cos_ptr,
-                                             const float* rot_sin_ptr,
-                                             const int load_idx,
-                                             const int store_idx,
-                                             const int rot_base_idx,
-                                             T* out) {
+__device__ __forceinline__ void RotateQKVec4HalfStyle(const T* qkv_ptr,
+                                                      const float* rot_cos_ptr,
+                                                      const float* rot_sin_ptr,
+                                                      const int load_idx,
+                                                      const int store_idx,
+                                                      const int rot_base_idx,
+                                                      const int head_dim_idx,
+                                                      const int half_dim,
+                                                      T* out) {
   using VecT = AlignedVector<T, 4>;
   using VecF = AlignedVector<float, 4>;
   auto to_float = [] __device__(T val) -> float {
@@ -103,17 +131,27 @@ __device__ __forceinline__ void RotateQKVec4(const T* qkv_ptr,
 
   VecT qk_vec;
   Load(qkv_ptr + load_idx, &qk_vec);
-  VecF rot_half_vec = {-to_float(qk_vec[1]),
-                       to_float(qk_vec[0]),
-                       -to_float(qk_vec[3]),
-                       to_float(qk_vec[2])};
+  const int pair_load =
+      load_idx + ((head_dim_idx < half_dim) ? half_dim : -half_dim);
+  VecT pair_vec;
+  Load(qkv_ptr + pair_load, &pair_vec);
+
   VecF cos_vec, sin_vec;
   Load(rot_cos_ptr + rot_base_idx, &cos_vec);
   Load(rot_sin_ptr + rot_base_idx, &sin_vec);
+
+  if (head_dim_idx < half_dim) {
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+      *(out + store_idx + i) = from_float(to_float(qk_vec[i]) * cos_vec[i] -
+                                          to_float(pair_vec[i]) * sin_vec[i]);
+    }
+  } else {
 #pragma unroll
-  for (int i = 0; i < 4; ++i) {
-    *(out + store_idx + i) = from_float(to_float(qk_vec[i]) * cos_vec[i] +
-                                        rot_half_vec[i] * sin_vec[i]);
+    for (int i = 0; i < 4; ++i) {
+      *(out + store_idx + i) = from_float(to_float(qk_vec[i]) * cos_vec[i] +
+                                          to_float(pair_vec[i]) * sin_vec[i]);
+    }
   }
 }
 
@@ -148,7 +186,8 @@ __global__ void DispatchApplyRopeQKVVec4Kernel(const T* qkv,
                head_dim_idx;
     store_idx =
         token_idx * param.q_stride + head_idx * param.head_dim + head_dim_idx;
-    RotateQKVec4(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx, q_out);
+    RotateQKVec4HalfStyle(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx,
+                          head_dim_idx, param.half_dim, q_out);
   }
 
   if (head_idx < param.kv_head_num && head_dim_idx < param.head_dim) {  // kv
@@ -157,7 +196,8 @@ __global__ void DispatchApplyRopeQKVVec4Kernel(const T* qkv,
                head_dim_idx;
     store_idx =
         token_idx * param.kv_stride + head_idx * param.head_dim + head_dim_idx;
-    RotateQKVec4(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx, k_out);
+    RotateQKVec4HalfStyle(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx,
+                          head_dim_idx, param.half_dim, k_out);
     load_idx = token_idx * param.token_stride +
                (head_idx + param.v_head_offset) * param.head_stride +
                head_dim_idx;
@@ -196,6 +236,7 @@ void ApplyRopeQKVKernel(const paddle::Tensor& qkv,
 
   ApplyRopeQKVParams param;
   param.head_dim = head_dim;
+  param.half_dim = head_dim / 2;
   param.token_stride = all_num_head * head_dim;
   param.head_stride = head_dim;
   param.q_stride = q_head_num * head_dim;
diff --git a/fastdeploy/input/paddleocr_vl_processor/image_processor.py b/fastdeploy/input/paddleocr_vl_processor/image_processor.py
index 8e333d5bf96..dd365234ab3 100644
--- a/fastdeploy/input/paddleocr_vl_processor/image_processor.py
+++ b/fastdeploy/input/paddleocr_vl_processor/image_processor.py
@@ -99,9 +99,9 @@ def smart_resize(
         height = round((height * factor) / width)
         width = factor
 
-    if max(height, width) / min(height, width) > 200:
+    if max(height, width) / min(height, width) > 300:
         raise ValueError(
-            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+            f"absolute aspect ratio must be smaller than 300, got {max(height, width) / min(height, width)}"
         )
     h_bar = round(height / factor) * factor
     w_bar = round(width / factor) * factor
diff --git a/fastdeploy/model_executor/models/paddleocr_vl/siglip.py b/fastdeploy/model_executor/models/paddleocr_vl/siglip.py
index 82d3eb09e95..685bcc75440 100644
--- a/fastdeploy/model_executor/models/paddleocr_vl/siglip.py
+++ b/fastdeploy/model_executor/models/paddleocr_vl/siglip.py
@@ -26,7 +26,7 @@
 from fastdeploy.model_executor.utils import h2d_copy, slice_fn
 
 from .config import PaddleOCRVisionConfig
-from .siglip_ops import get_activation_fn, neox_rope_embedding
+from .siglip_ops import _siglip_activation, neox_rope_embedding_eager
 
 
 class SiglipAttention(nn.Layer):
@@ -125,7 +125,7 @@ def forward(
     ):
         B, seq_length, D = hidden_states.shape
         qkv = self.qkv_proj(hidden_states)
-        q, k, v = neox_rope_embedding(qkv, cos_emb, sin_emb, self.num_heads, self.head_dim)
+        q, k, v = neox_rope_embedding_eager(qkv, cos_emb, sin_emb, self.num_heads, self.head_dim)
         attn_output = self.flash_attn_func(
             q,
             k,
@@ -303,7 +303,7 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
 
     def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
         hidden_states = self.fc1(hidden_states)
-        hidden_states = get_activation_fn(self.config.hidden_act)(hidden_states[0])
+        hidden_states = _siglip_activation(hidden_states[0])
         hidden_states = self.fc2(hidden_states)
         return hidden_states
 
diff --git a/fastdeploy/model_executor/models/paddleocr_vl/siglip_ops.py b/fastdeploy/model_executor/models/paddleocr_vl/siglip_ops.py
index 33cd502a5ea..4dc449cd557 100644
--- a/fastdeploy/model_executor/models/paddleocr_vl/siglip_ops.py
+++ b/fastdeploy/model_executor/models/paddleocr_vl/siglip_ops.py
@@ -16,6 +16,8 @@
 
 from typing import List
 
+import os
+
 import paddle
 from paddleformers.transformers.activations import ACT2FN
 
@@ -23,6 +25,22 @@
 
 if current_platform.is_cuda():
     from fastdeploy.model_executor.ops.gpu import fused_neox_rope_embedding, gelu_tanh
+elif current_platform.is_maca():
+    # Metax C500 (SM 80) has gelu_tanh fused op available, but not fused_neox_rope_embedding.
+    # Enable gelu_tanh to avoid Python fallback in SigLIP MLP (27 layers × 1 GELU each).
+    from fastdeploy.model_executor.ops.gpu import gelu_tanh
+
+    # Fused RoPE kernel (rotate-half convention) compiled from apply_rope_qkv.cu.
+    # Replaces the Python fallback in SigLIP vision encoder (27 layers × 1 RoPE each,
+    # measured at 27.5% of vision encoder time — see optimization_log.md §0.1.1).
+    _ROPE_SO = os.path.join(os.path.dirname(__file__), "apply_rope_qkv_pd_.so")
+    _MACA_FUSED_ROPE_OK = False
+    if os.path.exists(_ROPE_SO):
+        try:
+            paddle.utils.cpp_extension.load_op_meta_info_and_register_op(_ROPE_SO)
+            _MACA_FUSED_ROPE_OK = True
+        except Exception:
+            pass
 
 
 def rotate_half(x):
@@ -59,6 +77,30 @@ def native_neox_rope_embedding(qkv, cos, sin, num_heads):
     return q, k, v
 
 
+def maca_fused_neox_rope_embedding(qkv, cos, sin, num_heads, head_dim):
+    """Metax fused RoPE kernel — rotate-half convention.
+    Input: qkv [B, seq, 3*num_heads*head_dim], cos/sin [seq, 1, head_dim]
+    Output: q, k, v each [seq, num_heads, head_dim]
+    """
+    B, seq_length, D = qkv.shape
+    if seq_length == -1:
+        _, seq_length, _ = paddle.shape(qkv)
+    # Flatten batch+seq → tokens (kernel expects 2D qkv)
+    qkv_flat = qkv.reshape([-1, D])
+    q_out, k_out, v_out = paddle._C_ops._run_custom_op(
+        "apply_rope_qkv",
+        qkv_flat,
+        cos,
+        sin,
+        num_heads,
+        num_heads,
+        head_dim,
+    )
+    return q_out, k_out, v_out
+
+
+
+
 jit_unified_marker = paddle.jit.marker.unified if hasattr(paddle.jit.marker, "unified") else lambda fn: fn
 
 
@@ -68,6 +110,8 @@ def neox_rope_embedding(
 ) -> List[paddle.Tensor]:
     if current_platform.is_cuda() and paddle.in_dynamic_mode():
         return fused_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads, head_dim)
+    elif current_platform.is_maca() and _MACA_FUSED_ROPE_OK and paddle.in_dynamic_mode():
+        return maca_fused_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads, head_dim)
     else:
         return native_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads)
 
@@ -75,9 +119,37 @@ def neox_rope_embedding(
 @jit_unified_marker
 def get_activation_fn(hidden_act: str):
     if hidden_act == "gelu_pytorch_tanh":
-        if current_platform.is_cuda() and paddle.in_dynamic_mode():
+        if (current_platform.is_cuda() or current_platform.is_maca()) and paddle.in_dynamic_mode():
             return gelu_tanh
         else:
             return ACT2FN["gelu_new"]
     else:
         return ACT2FN[hidden_act]
+
+
+# ── eager-mode dispatch cache ─────────────────────────────────────
+# Avoid repeated platform / dynamic-mode checks on every encoder-layer
+# call.  Resolved once on first invocation, then the fast path is a
+# single boolean branch (no function calls, no attribute lookups).
+_USE_MACA_FUSED_ROPE = (
+    current_platform.is_maca() and _MACA_FUSED_ROPE_OK and paddle.in_dynamic_mode()
+)
+_USE_CUDA_FUSED_ROPE = (
+    current_platform.is_cuda() and paddle.in_dynamic_mode()
+)
+
+
+def neox_rope_embedding_eager(
+    qkv, cos_emb, sin_emb, num_heads, head_dim
+):
+    """Hot-path RoPE dispatch — no platform checks, no decorator overhead."""
+    if _USE_MACA_FUSED_ROPE:
+        return maca_fused_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads, head_dim)
+    elif _USE_CUDA_FUSED_ROPE:
+        return fused_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads, head_dim)
+    else:
+        return native_neox_rope_embedding(qkv, cos_emb, sin_emb, num_heads)
+
+
+# Pre-resolved activation for SigLIP MLP (gelu_pytorch_tanh).
+_siglip_activation = get_activation_fn("gelu_pytorch_tanh")
diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py
index 77944b3a2cf..a6a0e662c0d 100644
--- a/fastdeploy/worker/metax_model_runner.py
+++ b/fastdeploy/worker/metax_model_runner.py
@@ -1705,7 +1705,7 @@ def _dummy_pooler_run(
             skip_save_output=True,
             async_output_queue=self.async_output_queue,
             think_end_id=self.model_config.think_end_id,
-            line_break_id=self.model_config.line_break_id,
+            # line_break_id=self.model_config.line_break_id,
         )
         self.exist_prefill_flag = False
         return pooler_output
@@ -1810,7 +1810,7 @@ def _dummy_sampler_run(
             skip_save_output=True,
             async_output_queue=self.async_output_queue,
             think_end_id=self.model_config.think_end_id,
-            line_break_id=self.model_config.line_break_id,
+            # line_break_id=self.model_config.line_break_id,
             enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0,
         )
         self.exist_prefill_flag = False
@@ -2457,7 +2457,7 @@ def _postprocess(
                 skip_save_output=skip_save_output,
                 async_output_queue=self.async_output_queue,
                 think_end_id=self.model_config.think_end_id,
-                line_break_id=self.model_config.line_break_id,
+                # line_break_id=self.model_config.line_break_id,
                 enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0,
             )
 
diff --git a/scripts/build_rope_kernel.sh b/scripts/build_rope_kernel.sh
new file mode 100755
index 00000000000..4f41aa154b5
--- /dev/null
+++ b/scripts/build_rope_kernel.sh
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
+# 独立编译 RoPE kernel，不影响系统环境
+# 用法: bash scripts/build_rope_kernel.sh
+
+set -e
+
+echo "=========================================="
+echo "RoPE Kernel 编译脚本 (Metax MACA)"
+echo "=========================================="
+
+# 保存原始环境变量
+ORIG_PATH=$PATH
+ORIG_LD_LIBRARY_PATH=$LD_LIBRARY_PATH
+
+# 设置 Metax 编译环境 (仅在当前 shell)
+export MACA_PATH=/opt/maca
+
+# 初始化 cu-bridge (如果不存在)
+if [ ! -d ${HOME}/cu-bridge ]; then
+    echo "[INFO] 初始化 cu-bridge..."
+    ${MACA_PATH}/tools/cu-bridge/tools/pre_make
+fi
+
+# 设置编译环境变量
+export CUCC_PATH=/opt/maca/tools/cu-bridge
+export CUCC_CMAKE_ENTRY=2
+export CUDA_PATH=${HOME}/cu-bridge/CUDA_DIR
+export PATH=${CUDA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${ORIG_PATH}
+export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${ORIG_LD_LIBRARY_PATH}
+
+echo "[INFO] 编译环境已设置"
+echo "  CUDA_PATH: $CUDA_PATH"
+echo "  CUCC_PATH: $CUCC_PATH"
+
+# 源文件和输出路径
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+SRC_FILE="${PROJECT_ROOT}/custom_ops/metax_ops/apply_rope_qkv.cu"
+OUTPUT_FILE="${PROJECT_ROOT}/fastdeploy/model_executor/models/paddleocr_vl/apply_rope_qkv_pd_.so"
+
+echo "[INFO] 源文件: $SRC_FILE"
+echo "[INFO] 输出文件: $OUTPUT_FILE"
+
+# 备份原有的 .so (如果存在)
+if [ -f "$OUTPUT_FILE" ]; then
+    BACKUP_FILE="${OUTPUT_FILE}.backup.$(date +%Y%m%d_%H%M%S)"
+    echo "[INFO] 备份原有 .so: $BACKUP_FILE"
+    cp "$OUTPUT_FILE" "$BACKUP_FILE"
+fi
+
+# 使用 paddle 的 cpp_extension 编译
+echo "[INFO] 开始编译..."
+cd "$PROJECT_ROOT"
+
+# 设置 Metax 库的链接标志
+export LDFLAGS="-L/opt/maca/lib -lruntime_cu -lmcruntime -lmctlassEx -lmccompiler"
+
+python -c "
+import os
+import sys
+import paddle
+from paddle.utils.cpp_extension import load
+
+# 设置 Metax 环境变量
+os.environ['PADDLE_CUSTOM_DEVICE'] = 'metax_gpu'
+
+source_files = ['${SRC_FILE}']
+extra_include = ['${PROJECT_ROOT}/custom_ops/gpu_ops']
+
+try:
+    op = load(
+        name='apply_rope_qkv_pd',
+        sources=source_files,
+        extra_include_paths=extra_include + ['${PROJECT_ROOT}/custom_ops/third_party/nlohmann_json/include'],
+        extra_cxx_cflags=['-DPADDLE_WITH_CUSTOM_DEVICE_METAX_GPU', '-DPADDLE_DEV'],
+        extra_cuda_cflags=['-DPADDLE_WITH_CUSTOM_DEVICE_METAX_GPU', '-DPADDLE_DEV'],
+        extra_ldflags=['-L/opt/maca/lib', '-lruntime_cu', '-lmcruntime', '-lmctlassEx', '-lmccompiler'],
+        verbose=False
+    )
+    print('[SUCCESS] 编译成功')
+    print(f'编译产物: {op}')
+except Exception as e:
+    print(f'[ERROR] 编译失败: {e}')
+    sys.exit(1)
+"
+
+# 找到编译产物并复制到目标位置
+BUILD_DIR="${HOME}/.cache/paddle_extensions/apply_rope_qkv_pd"
+
+SO_FILE=$(find "$BUILD_DIR" -name "apply_rope_qkv_pd*.so" -type f 2>/dev/null | head -1)
+
+if [ -z "$SO_FILE" ]; then
+    echo "[ERROR] 找不到编译产物 .so 文件"
+    exit 1
+fi
+
+echo "[INFO] 找到编译产物: $SO_FILE"
+echo "[INFO] 复制到: $OUTPUT_FILE"
+cp "$SO_FILE" "$OUTPUT_FILE"
+
+# 验证
+if [ -f "$OUTPUT_FILE" ]; then
+    echo "[SUCCESS] RoPE kernel 编译完成"
+    ls -lh "$OUTPUT_FILE"
+else
+    echo "[ERROR] 复制失败"
+    exit 1
+fi
+
+# 恢复环境变量 (虽然脚本结束会自动恢复，但显式恢复更清晰)
+export PATH=$ORIG_PATH
+export LD_LIBRARY_PATH=$ORIG_LD_LIBRARY_PATH
+
+echo "=========================================="
+echo "编译完成！"
+echo "=========================================="
diff --git a/scripts/envSetup.sh b/scripts/envSetup.sh
new file mode 100755
index 00000000000..9d75fa34f1c
--- /dev/null
+++ b/scripts/envSetup.sh
@@ -0,0 +1,66 @@
+export MACA_PATH=/opt/maca
+
+if [ ! -d ${HOME}/cu-bridge ]; then
+  `${MACA_PATH}/tools/cu-bridge/tools/pre_make`
+fi
+
+export CUCC_PATH=/opt/maca/tools/cu-bridge
+export CUCC_CMAKE_ENTRY=2
+export CUDA_PATH=${HOME}/cu-bridge/CUDA_DIR
+export PATH=${CUDA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${PATH}
+export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:$LD_LIBRARY_PATH
+export MACA_VISIBLE_DEVICES="0"
+export PADDLE_XCCL_BACKEND=metax_gpu
+export FLAGS_weight_only_linear_arch=80
+export FD_MOE_BACKEND=cutlass
+export ENABLE_V1_KVCACHE_SCHEDULER=1
+export FD_ENC_DEC_BLOCK_NUM=2
+export FD_SAMPLING_CLASS="rejection"   # 受编译器升级到 llvm19 影响，top_p_sampling 算子会出现阻塞情况
+
+export PYTHONPATH="/data/FastDeploy:${PYTHONPATH:-}"
+if [ ! -f /tmp/shm_redirect.so ]; then
+    cat > /tmp/shm_redirect.c << 'CEOF'
+#define _GNU_SOURCE
+#include <dlfcn.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <errno.h>
+static const char* REDIRECT_DIR = "/tmp/shm";
+static char* redirect_path(const char* name) {
+    static __thread char buf[512];
+    if (name[0] == '/') name++;
+    snprintf(buf, sizeof(buf), "%s/%s", REDIRECT_DIR, name);
+    return buf;
+}
+int shm_open(const char *name, int oflag, mode_t mode) {
+    static int created = 0;
+    if (!created) { mkdir(REDIRECT_DIR, 01777); created = 1; }
+    int fd = open(redirect_path(name), oflag, mode);
+    if (fd >= 0 && (oflag & O_CREAT)) { fchmod(fd, mode); }
+    return fd;
+}
+int shm_unlink(const char *name) { return unlink(redirect_path(name)); }
+CEOF
+    gcc -shared -fPIC -o /tmp/shm_redirect.so /tmp/shm_redirect.c -ldl
+fi
+mkdir -p /tmp/shm && chmod 1777 /tmp/shm
+
+export LD_PRELOAD="/tmp/shm_redirect.so${LD_PRELOAD:+:$LD_PRELOAD}"
+
+# Clean stale shared memory from prior crashed runs (both the real /dev/shm and our redirect dir)
+rm -f /dev/shm/paddle_* /dev/shm/*signal* /dev/shm/key_caches_* /dev/shm/value_caches_* \
+      /dev/shm/__KMP_REGISTERED_LIB_* /dev/shm/sem.mp-* /dev/shm/*.8302 /dev/shm/8300.* \
+      /dev/shm/router_* /dev/shm/fmq_* /dev/shm/triton_* 2>/dev/null
+rm -f /tmp/shm/paddle_* /tmp/shm/*signal* /tmp/shm/key_caches_* /tmp/shm/value_caches_* \
+      /tmp/shm/__KMP_REGISTERED_LIB_* /tmp/shm/sem.mp-* /tmp/shm/*.8302 /tmp/shm/8300.* \
+      /tmp/shm/router_* /tmp/shm/fmq_* /tmp/shm/triton_* 2>/dev/null
+
+# Bypass corp HTTP proxy for localhost so benchmark client can reach FastDeploy server directly
+export NO_PROXY="127.0.0.1,localhost,0.0.0.0"
+export no_proxy="$NO_PROXY"
diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh
new file mode 100755
index 00000000000..b5ab24956a2
--- /dev/null
+++ b/scripts/run_benchmark.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+
+set -euo pipefail
+
+pkill -9 -f "ocr_benchmark.py" 2>/dev/null || true
+sleep 2
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/envSetup.sh"
+
+cd /data/FastDeploy/benchmarks/paddleocr_vl
+
+mkdir -p output
+
+python3 ocr_benchmark.py \
+    --input_dirs /data/OmniDocBench_v1_5/images_128_pdf \
+    --paddlex_config_path PaddleOCR-VL-1_5_fastdeploy.yaml \
+    --device metax_gpu \
+    --gpu_ids 0 \
+    --process_per_gpu 5 \
+    --batch_size 4 \
+    -o output/benchmark_result.json
diff --git a/scripts/run_server.sh b/scripts/run_server.sh
new file mode 100755
index 00000000000..e24ca1d2880
--- /dev/null
+++ b/scripts/run_server.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+source "${SCRIPT_DIR}/envSetup.sh"
+
+MODEL="/mnt/moark-models/PaddleOCR-VL-1.5"
+MACA_VISIBLE_DEVICES="${MACA_VISIBLE_DEVICES:-0}" \
+python3 -m fastdeploy.entrypoints.openai.api_server \
+    --model "$MODEL" \
+    --max-model-len 16384 \
+    --max-num-batched-tokens 16384 \
+    --gpu-memory-utilization 0.4 \
+    --max-num-seqs 256 \
+    --graph-optimization-config '{"use_cudagraph":true,"graph_opt_level":0}' \
+    --workers 6 \
+    --max-concurrency 4096 \
+    --port 8300 \
+    --metrics-port 8301 \
+    --engine-worker-queue-port 8302