Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions pkg/inference/backends/llamacpp/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package llamacpp

import "regexp"

// llamaCppErrorPatterns maps known llama.cpp stderr signatures to
// user-friendly replacement messages. Entries are evaluated in declaration
// order and the first matching pattern determines the result.
var llamaCppErrorPatterns = []struct {
	pattern *regexp.Regexp
	message string
}{
	// Metal buffer allocation failure
	// https://github.com/ggml-org/llama.cpp/blob/ecd99d6a9acbc436bad085783bcd5d0b9ae9e9e9/ggml/src/ggml-metal/ggml-metal-device.m#L1498
	{
		pattern: regexp.MustCompile(`failed to allocate buffer, size = .*MiB`),
		message: "not enough GPU memory to load the model (Metal)",
	},
	// CUDA out of memory
	// https://github.com/ggml-org/llama.cpp/blob/ecd99d6a9acbc436bad085783bcd5d0b9ae9e9e9/ggml/src/ggml-cuda/ggml-cuda.cu#L710
	{
		pattern: regexp.MustCompile(`cudaMalloc failed: out of memory`),
		message: "not enough GPU memory to load the model (CUDA)",
	},
	// Generic model loading failure
	// https://github.com/ggml-org/llama.cpp/blob/ecd99d6a9acbc436bad085783bcd5d0b9ae9e9e9/tools/server/server.cpp#L254
	{
		pattern: regexp.MustCompile(`exiting due to model loading error`),
		message: "failed to load model",
	},
}

// ExtractLlamaCppError attempts to extract a meaningful error message from llama.cpp output.
// It scans the output against the known patterns above and, on the first hit,
// returns the corresponding user-friendly message. When no pattern matches,
// the original output is returned unchanged so no information is lost.
func ExtractLlamaCppError(output string) string {
	for i := range llamaCppErrorPatterns {
		if llamaCppErrorPatterns[i].pattern.MatchString(output) {
			return llamaCppErrorPatterns[i].message
		}
	}
	return output
}
39 changes: 39 additions & 0 deletions pkg/inference/backends/llamacpp/errors_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package llamacpp

import (
"testing"
)

// TestExtractLlamaCppError verifies that each known llama.cpp failure
// signature is translated into its user-friendly message.
func TestExtractLlamaCppError(t *testing.T) {
	cases := []struct {
		name string
		raw  string
		want string
	}{
		{
			name: "Metal buffer allocation failure",
			raw:  "ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
			want: "not enough GPU memory to load the model (Metal)",
		},
		{
			name: "cudaMalloc OOM",
			raw:  "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 12.50 MiB on device 1: cudaMalloc failed: out of memory",
			want: "not enough GPU memory to load the model (CUDA)",
		},
		{
			name: "loading error",
			raw: `common_init_from_params: failed to load model '/models/model.gguf'
main: exiting due to model loading error`,
			want: "failed to load model",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := ExtractLlamaCppError(tc.raw); got != tc.want {
				t.Errorf("ExtractLlamaCppError() = %q, want %q", got, tc.want)
			}
		})
	}
}
17 changes: 9 additions & 8 deletions pkg/inference/backends/llamacpp/llamacpp.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,14 +171,15 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode
}

return backends.RunBackend(ctx, backends.RunnerConfig{
BackendName: "llama.cpp",
Socket: socket,
BinaryPath: filepath.Join(binPath, "com.docker.llama-server"),
SandboxPath: binPath,
SandboxConfig: sandbox.ConfigurationLlamaCpp,
Args: args,
Logger: l.log,
ServerLogWriter: logging.NewWriter(l.serverLog),
BackendName: "llama.cpp",
Socket: socket,
BinaryPath: filepath.Join(binPath, "com.docker.llama-server"),
SandboxPath: binPath,
SandboxConfig: sandbox.ConfigurationLlamaCpp,
Args: args,
Logger: l.log,
ServerLogWriter: logging.NewWriter(l.serverLog),
ErrorTransformer: ExtractLlamaCppError,
})
}

Expand Down