Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions pkg/inference/backends/llamacpp/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package llamacpp

import "regexp"

// llamaCppErrorPatterns maps known llama.cpp stderr signatures to
// user-friendly replacement messages. Entries are evaluated in declaration
// order and the first matching pattern determines the result.
var llamaCppErrorPatterns = []struct {
	pattern *regexp.Regexp
	message string
}{
	// Metal buffer allocation failure
	// https://github.com/ggml-org/llama.cpp/blob/ecd99d6a9acbc436bad085783bcd5d0b9ae9e9e9/ggml/src/ggml-metal/ggml-metal-device.m#L1498
	{
		pattern: regexp.MustCompile(`failed to allocate buffer, size = .*MiB`),
		message: "not enough GPU memory to load the model (Metal)",
	},
	// CUDA out of memory
	// https://github.com/ggml-org/llama.cpp/blob/ecd99d6a9acbc436bad085783bcd5d0b9ae9e9e9/ggml/src/ggml-cuda/ggml-cuda.cu#L710
	{
		pattern: regexp.MustCompile(`cudaMalloc failed: out of memory`),
		message: "not enough GPU memory to load the model (CUDA)",
	},
	// Generic model loading failure
	// https://github.com/ggml-org/llama.cpp/blob/ecd99d6a9acbc436bad085783bcd5d0b9ae9e9e9/tools/server/server.cpp#L254
	{
		pattern: regexp.MustCompile(`exiting due to model loading error`),
		message: "failed to load model",
	},
}

// ExtractLlamaCppError attempts to extract a meaningful error message from llama.cpp output.
// It scans the output against the known patterns above and, on the first hit,
// returns the corresponding user-friendly message. When no pattern matches,
// the original output is returned unchanged so no information is lost.
func ExtractLlamaCppError(output string) string {
	for i := range llamaCppErrorPatterns {
		if llamaCppErrorPatterns[i].pattern.MatchString(output) {
			return llamaCppErrorPatterns[i].message
		}
	}
	return output
}
39 changes: 39 additions & 0 deletions pkg/inference/backends/llamacpp/errors_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package llamacpp

import (
"testing"
)

// TestExtractLlamaCppError verifies that each known llama.cpp failure
// signature is translated into its user-friendly message.
func TestExtractLlamaCppError(t *testing.T) {
	cases := []struct {
		name string
		raw  string
		want string
	}{
		{
			name: "Metal buffer allocation failure",
			raw:  "ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
			want: "not enough GPU memory to load the model (Metal)",
		},
		{
			name: "cudaMalloc OOM",
			raw:  "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 12.50 MiB on device 1: cudaMalloc failed: out of memory",
			want: "not enough GPU memory to load the model (CUDA)",
		},
		{
			name: "loading error",
			raw: `common_init_from_params: failed to load model '/models/model.gguf'
main: exiting due to model loading error`,
			want: "failed to load model",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := ExtractLlamaCppError(tc.raw); got != tc.want {
				t.Errorf("ExtractLlamaCppError() = %q, want %q", got, tc.want)
			}
		})
	}
}
17 changes: 9 additions & 8 deletions pkg/inference/backends/llamacpp/llamacpp.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,14 +171,15 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode
}

return backends.RunBackend(ctx, backends.RunnerConfig{
BackendName: "llama.cpp",
Socket: socket,
BinaryPath: filepath.Join(binPath, "com.docker.llama-server"),
SandboxPath: binPath,
SandboxConfig: sandbox.ConfigurationLlamaCpp,
Args: args,
Logger: l.log,
ServerLogWriter: logging.NewWriter(l.serverLog),
BackendName: "llama.cpp",
Socket: socket,
BinaryPath: filepath.Join(binPath, "com.docker.llama-server"),
SandboxPath: binPath,
SandboxConfig: sandbox.ConfigurationLlamaCpp,
Args: args,
Logger: l.log,
ServerLogWriter: logging.NewWriter(l.serverLog),
ErrorTransformer: ExtractLlamaCppError,
})
}

Expand Down