diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index c8aa6d0ff34..68695acec32 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -225,10 +225,10 @@ def _validate_split_kv_size(value: int) -> int: "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")), # File path for file storage backend "FILE_BACKEND_STORAGE_DIR": lambda: str(os.getenv("FILE_BACKEND_STORAGE_DIR", "/tmp/fastdeploy")), - # Custom all-reduce max buffer size in MB (default 64MB). + # Custom all-reduce max buffer size in MB (default 8MB). # Increase this to avoid NCCL fallback for large tensors in deterministic mode. # E.g. FD_CUSTOM_AR_MAX_SIZE_MB=128 for 128MB. - "FD_CUSTOM_AR_MAX_SIZE_MB": lambda: int(os.getenv("FD_CUSTOM_AR_MAX_SIZE_MB", "64")), + "FD_CUSTOM_AR_MAX_SIZE_MB": lambda: int(os.getenv("FD_CUSTOM_AR_MAX_SIZE_MB", "8")), # Enable deterministic inference mode for chunked prefill alignment "FD_DETERMINISTIC_MODE": lambda: bool(int(os.getenv("FD_DETERMINISTIC_MODE", "0"))), # Split KV block size for deterministic alignment (must be power of 2 and > 0, default 16) diff --git a/tests/e2e/4cards_cases/test_determinism_long.py b/tests/e2e/4cards_cases/test_determinism_long.py index 94104636e9a..fa92c41e15b 100644 --- a/tests/e2e/4cards_cases/test_determinism_long.py +++ b/tests/e2e/4cards_cases/test_determinism_long.py @@ -143,7 +143,7 @@ def _module_env(): { "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", "0,1,2,3"), "FD_DETERMINISTIC_MODE": "1", - "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "57"), + "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "64"), "FLAGS_max_partition_size": _CHUNK_SIZE_FOR_TEST, } ):