From ce77e2e9713602215acf42072495e2df3b93ce23 Mon Sep 17 00:00:00 2001
From: Jiale Lin <63439129+ljluestc@users.noreply.github.com>
Date: Sat, 21 Mar 2026 22:58:24 -0700
Subject: [PATCH] fix: auto-disable mmap when all layers offloaded to GPU
 (#1964)

When n_gpu_layers=-1, the entire model file stays memory-mapped in RAM
(via mmap) even after all weights are copied to VRAM. This causes
unexpectedly high host RAM usage that is not released until the process
exits.

This fix automatically disables mmap when all layers are offloaded to
GPU and GPU offload is supported. With mmap disabled, llama.cpp uses a
temporary read buffer that is freed after GPU upload, significantly
reducing host RAM consumption.

Note: under full offload (n_gpu_layers=-1) with GPU offload supported,
mmap is always disabled; an explicit use_mmap=True cannot re-enable it
because the default is also True. To keep the model memory-mapped,
request partial offload with a non-negative n_gpu_layers.
---
 llama_cpp/llama.py             | 17 ++++++++++++++++
 llama_cpp/server/settings.py   |  3 ++-
 tests/test_mmap_gpu_offload.py | 75 ++++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_mmap_gpu_offload.py

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 71d94ebd8..c50ffba1c 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -242,6 +242,23 @@ def __init__(
         )  # keep a reference to the array so it is not gc'd
         self.model_params.tensor_split = self._c_tensor_split
         self.model_params.vocab_only = vocab_only
+
+        # When all layers are offloaded to GPU (n_gpu_layers == -1), disable mmap
+        # to prevent the memory-mapped model file from staying resident in RAM.
+        # With mmap enabled, the entire model file remains in the page cache even
+        # after weights are copied to VRAM. Disabling mmap causes llama.cpp to use
+        # a temporary read buffer that is freed after GPU upload.
+        # See: https://github.com/abetlen/llama-cpp-python/issues/1964
+        if n_gpu_layers == -1 and use_mmap and llama_cpp.llama_supports_gpu_offload():
+            if self.verbose:
+                print(
+                    "Disabling mmap because all layers are offloaded to GPU "
+                    "(n_gpu_layers=-1); this avoids keeping the memory-mapped "
+                    "model file resident in host RAM after upload to VRAM.",
+                    file=sys.stderr,
+                )
+            use_mmap = False
+
         self.model_params.use_mmap = use_mmap if lora_path is None else False
         self.model_params.use_mlock = use_mlock
 
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 13c951241..f6dc85786 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -48,7 +48,8 @@ class ModelSettings(BaseSettings):
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_supports_mmap(),
-        description="Use mmap.",
+        description="Use mmap. When n_gpu_layers is -1 (full GPU offload) and GPU offload "
+        "is supported, mmap is automatically disabled to reduce host RAM usage.",
     )
     use_mlock: bool = Field(
         default=llama_cpp.llama_supports_mlock(),
diff --git a/tests/test_mmap_gpu_offload.py b/tests/test_mmap_gpu_offload.py
new file mode 100644
index 000000000..fa725771c
--- /dev/null
+++ b/tests/test_mmap_gpu_offload.py
@@ -0,0 +1,75 @@
+"""Tests for automatic mmap disabling when all layers are offloaded to GPU.
+
+See: https://github.com/abetlen/llama-cpp-python/issues/1964
+"""
+
+import sys
+from unittest.mock import MagicMock
+from dataclasses import dataclass
+
+# Stub the native C library so tests can run without compiling llama.cpp
+_mock_llama_cpp = MagicMock()
+_mock_llama_cpp.llama_log_callback = lambda f: f
+_mock_llama_cpp.llama_log_set = MagicMock()
+sys.modules.setdefault("llama_cpp.llama_cpp", _mock_llama_cpp)
+
+_mock_llama = MagicMock()
+_mock_llama.StoppingCriteriaList = list
+_mock_llama.LogitsProcessorList = list
+_mock_llama.LlamaGrammar = MagicMock
+sys.modules.setdefault("llama_cpp.llama", _mock_llama)
+
+
+@dataclass
+class MockModelParams:
+    """Mimics the relevant fields of llama_model_params for testing."""
+    n_gpu_layers: int = 0
+    use_mmap: bool = True
+
+
+def _apply_mmap_logic(n_gpu_layers: int, use_mmap: bool, gpu_offload_supported: bool) -> bool:
+    """Replicate the mmap auto-disable logic from Llama.__init__."""
+    if n_gpu_layers == -1 and use_mmap and gpu_offload_supported:
+        return False
+    return use_mmap
+
+
+def test_mmap_disabled_when_all_layers_offloaded():
+    """When n_gpu_layers=-1 and GPU offload is supported, use_mmap should be set to False."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_kept_when_partial_offload():
+    """When n_gpu_layers is not -1, use_mmap should remain True."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_kept_when_no_gpu_support():
+    """When GPU offload is not supported, use_mmap should remain True even with n_gpu_layers=-1."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=False)
+    assert result is True
+
+
+def test_mmap_kept_when_zero_gpu_layers():
+    """When n_gpu_layers=0, use_mmap should remain True (CPU-only inference)."""
+    result = _apply_mmap_logic(n_gpu_layers=0, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_respects_explicit_false():
+    """When user explicitly sets use_mmap=False, it should stay False regardless."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=False, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_disabled_applies_to_params():
+    """Verify the logic correctly updates a MockModelParams object."""
+    params = MockModelParams(n_gpu_layers=-1, use_mmap=True)
+    params.use_mmap = _apply_mmap_logic(
+        n_gpu_layers=params.n_gpu_layers,
+        use_mmap=params.use_mmap,
+        gpu_offload_supported=True,
+    )
+    assert params.use_mmap is False