@@ -2296,6 +2296,9 @@ __global__ void merge_multi_chunks_decoder_kernel(
   const int bid = blockIdx.x, hid = blockIdx.y;
   __shared__ T smem[bdy * HEAD_DIM];
   __shared__ float md_smem[bdy * 2];
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaGridDependencySynchronize();
+#endif
   const int start_token_idx = cu_seqlens_q[bid];
   const int seq_len_q = seq_lens_q[bid];
   if (seq_len_q == 0) return;
@@ -2332,6 +2335,10 @@ __global__ void merge_multi_chunks_decoder_kernel(
   } else if constexpr (std::is_same<T, __nv_bfloat16>::value) {
     m = -3.0e+30f;
   }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaGridDependencySynchronize();
+#endif
+
 #pragma unroll 2
   for (int i = ty; i < num_chunks_this_seq; i += bdy) {
     uint32_t offset = (bid * num_chunks + i) * num_heads + hid;
@@ -2397,6 +2404,9 @@ __global__ void merge_multi_chunks_decoder_kernel(
         out_vec,
         &out[(start_token_idx * num_heads + hid) * head_dim + vid * vec_size]);
   }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaTriggerProgrammaticLaunchCompletion();
+#endif
 }

 template <typename T,
@@ -2433,6 +2443,9 @@ __global__ void merge_multi_chunks_v2_kernel(
   const int hid = blockIdx.y;
   __shared__ T smem[bdy * HEAD_DIM];
   __shared__ float md_smem[bdy * 2];
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaGridDependencySynchronize();
+#endif
   for (int qid = blockIdx.x; qid < token_num; qid += gridDim.x) {
     const uint32_t bid = batch_id_per_token[qid];
     if (bid == -1) {
@@ -2569,4 +2582,7 @@ __global__ void merge_multi_chunks_v2_kernel(
     }
     __syncthreads();
   }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  cudaTriggerProgrammaticLaunchCompletion();
+#endif
 }
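
These device-side hints only change scheduling when the dependent kernel opts in to CUDA's Programmatic Dependent Launch (PDL), available on sm_90 (Hopper) with CUDA 11.8+; otherwise they are effectively no-ops and ordinary stream ordering applies. The sketch below is not from this repo — the producer/consumer kernels, buffer names, and sizes are illustrative — but it shows the host-side `cudaLaunchAttributeProgrammaticStreamSerialization` setup that pairs with the `cudaGridDependencySynchronize()` / `cudaTriggerProgrammaticLaunchCompletion()` calls added in this patch.

```cuda
#include <cuda_runtime.h>

// Illustrative producer: fills a buffer, then signals that dependent
// grids may begin launching before all of its blocks have exited.
__global__ void producer_kernel(float* buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] = 2.0f * i;
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  cudaTriggerProgrammaticLaunchCompletion();
#endif
}

// Illustrative consumer: waits until the producer grid's memory
// operations are visible before reading the buffer, mirroring the
// cudaGridDependencySynchronize() placement in the merge kernels.
__global__ void consumer_kernel(const float* buf, float* out, int n) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  cudaGridDependencySynchronize();
#endif
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = buf[i] + 1.0f;
}

int main() {
  const int n = 1 << 20, block = 256, grid = (n + block - 1) / block;
  float *buf = nullptr, *out = nullptr;
  cudaMalloc(&buf, n * sizeof(float));
  cudaMalloc(&out, n * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  producer_kernel<<<grid, block, 0, stream>>>(buf, n);

  // The consumer opts in to PDL: its launch may overlap the tail of the
  // producer, and cudaGridDependencySynchronize() supplies the ordering.
  cudaLaunchAttribute attr{};
  attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attr.val.programmaticStreamSerialization = 1;

  cudaLaunchConfig_t cfg{};
  cfg.gridDim = grid;
  cfg.blockDim = block;
  cfg.stream = stream;
  cfg.attrs = &attr;
  cfg.numAttrs = 1;
  cudaLaunchKernelEx(&cfg, consumer_kernel,
                     static_cast<const float*>(buf), out, n);

  cudaStreamSynchronize(stream);
  cudaFree(buf);
  cudaFree(out);
  cudaStreamDestroy(stream);
  return 0;
}
```

In the patch itself the merge kernels play both roles: they synchronize on the grids that produced the per-chunk partial results before combining them, and trigger completion once their outputs are stored so a following dependent kernel can start its blocks early.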