diff --git a/pkg/inference/backends/llamacpp/errors.go b/pkg/inference/backends/llamacpp/errors.go
new file mode 100644
index 00000000..b749336e
--- /dev/null
+++ b/pkg/inference/backends/llamacpp/errors.go
@@ -0,0 +1,32 @@
+package llamacpp
+
+import "regexp"
+
+// llamaCppErrorPatterns contains regex patterns to extract meaningful error messages
+// from llama.cpp stderr output. The patterns are tried in order, and the first match wins.
+var llamaCppErrorPatterns = []struct {
+	pattern *regexp.Regexp
+	message string
+}{
+	// Metal buffer allocation failure
+	// https://github.com/ggml-org/llama.cpp/blob/ecd99d6a9acbc436bad085783bcd5d0b9ae9e9e9/ggml/src/ggml-metal/ggml-metal-device.m#L1498
+	{regexp.MustCompile(`failed to allocate buffer, size = .*MiB`), "not enough GPU memory to load the model (Metal)"},
+	// CUDA out of memory
+	// https://github.com/ggml-org/llama.cpp/blob/ecd99d6a9acbc436bad085783bcd5d0b9ae9e9e9/ggml/src/ggml-cuda/ggml-cuda.cu#L710
+	{regexp.MustCompile(`cudaMalloc failed: out of memory`), "not enough GPU memory to load the model (CUDA)"},
+	// Generic model loading failure
+	// https://github.com/ggml-org/llama.cpp/blob/ecd99d6a9acbc436bad085783bcd5d0b9ae9e9e9/tools/server/server.cpp#L254
+	{regexp.MustCompile(`exiting due to model loading error`), "failed to load model"},
+}
+
+// ExtractLlamaCppError attempts to extract a meaningful error message from llama.cpp output.
+// It looks for common error patterns and returns a cleaner, more user-friendly message.
+// If no recognizable pattern is found, it returns the full output.
+func ExtractLlamaCppError(output string) string {
+	for _, entry := range llamaCppErrorPatterns {
+		if entry.pattern.MatchString(output) {
+			return entry.message
+		}
+	}
+	return output
+}
diff --git a/pkg/inference/backends/llamacpp/errors_test.go b/pkg/inference/backends/llamacpp/errors_test.go
new file mode 100644
index 00000000..859d333c
--- /dev/null
+++ b/pkg/inference/backends/llamacpp/errors_test.go
@@ -0,0 +1,39 @@
+package llamacpp
+
+import (
+	"testing"
+)
+
+func TestExtractLlamaCppError(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "Metal buffer allocation failure",
+			input:    "ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
+			expected: "not enough GPU memory to load the model (Metal)",
+		},
+		{
+			name:     "cudaMalloc OOM",
+			input:    "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 12.50 MiB on device 1: cudaMalloc failed: out of memory",
+			expected: "not enough GPU memory to load the model (CUDA)",
+		},
+		{
+			name: "loading error",
+			input: `common_init_from_params: failed to load model '/models/model.gguf'
+main: exiting due to model loading error`,
+			expected: "failed to load model",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := ExtractLlamaCppError(tt.input)
+			if result != tt.expected {
+				t.Errorf("ExtractLlamaCppError() = %q, want %q", result, tt.expected)
+			}
+		})
+	}
+}
diff --git a/pkg/inference/backends/llamacpp/llamacpp.go b/pkg/inference/backends/llamacpp/llamacpp.go
index 848d841a..62cfbd63 100644
--- a/pkg/inference/backends/llamacpp/llamacpp.go
+++ b/pkg/inference/backends/llamacpp/llamacpp.go
@@ -171,14 +171,15 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode
 	}
 
 	return backends.RunBackend(ctx, backends.RunnerConfig{
-		BackendName:     "llama.cpp",
-		Socket:          socket,
-		BinaryPath:      filepath.Join(binPath, "com.docker.llama-server"),
-		SandboxPath:     binPath,
-		SandboxConfig:   sandbox.ConfigurationLlamaCpp,
-		Args:            args,
-		Logger:          l.log,
-		ServerLogWriter: logging.NewWriter(l.serverLog),
+		BackendName:      "llama.cpp",
+		Socket:           socket,
+		BinaryPath:       filepath.Join(binPath, "com.docker.llama-server"),
+		SandboxPath:      binPath,
+		SandboxConfig:    sandbox.ConfigurationLlamaCpp,
+		Args:             args,
+		Logger:           l.log,
+		ServerLogWriter:  logging.NewWriter(l.serverLog),
+		ErrorTransformer: ExtractLlamaCppError,
 	})
 }
 