diff --git a/fastdeploy/model_executor/layers/quantization/fp8_utils.py b/fastdeploy/model_executor/layers/quantization/fp8_utils.py
index 65d30d4004d..52cc1665386 100644
--- a/fastdeploy/model_executor/layers/quantization/fp8_utils.py
+++ b/fastdeploy/model_executor/layers/quantization/fp8_utils.py
@@ -80,8 +80,10 @@ def load_deep_gemm():
             logger.info("Detected sm100, use PFCC DeepGEMM")
         else:
-            logger.info("use FastDeploy DeepGEMM")
-            import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
+            paddle.enable_compat(scope={"deep_gemm"})
+            import deep_gemm as deep_gemm
+
+            logger.info("Use PFCC DeepGEMM")
     else:
         deep_gemm = None
     return deep_gemm
diff --git a/requirements.txt b/requirements.txt
index 66ed714045b..ff1ae91aceb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -49,4 +49,5 @@ py-cpuinfo
 flashinfer-python-paddle
 flash_mask @ https://paddle-qa.bj.bcebos.com/ernie/flash_mask-4.0.post20260128-py3-none-any.whl
 arctic_inference @ https://paddle-qa.bj.bcebos.com/ernie/arctic_inference-0.1.3-cp310-cp310-linux_x86_64.whl
+deep-gemm-cpp @ https://paddle-qa.bj.bcebos.com/ernie/deep_gemm_cpp-2.2.0%2Blocal-cp310-cp310-linux_x86_64.whl
 transformers>=4.55.1,<5.0.0
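
Note for reviewers: a minimal standalone smoke test, sketching how the new import path could be verified in isolation. The `paddle.enable_compat(scope={"deep_gemm"})` call and the `deep_gemm` module name are taken verbatim from the patch above; whether `enable_compat` is required outside of FastDeploy's `load_deep_gemm()` is an assumption, as is the expectation that the wheel installs a top-level `deep_gemm` package.

```python
# Hedged sketch, not part of the patch: smoke-test the new deep-gemm-cpp
# dependency. Assumes the wheel pinned in requirements.txt is installed
# and that paddle.enable_compat accepts a `scope` set, as the patched
# fp8_utils.py uses it.
import paddle

# Expose the external PFCC package under Paddle's compat scope, mirroring
# the patched branch in load_deep_gemm().
paddle.enable_compat(scope={"deep_gemm"})

import deep_gemm

# Confirm the import resolved to the external wheel rather than the old
# in-tree fastdeploy.model_executor.ops.gpu.deep_gemm module.
print(deep_gemm.__file__)
```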