LocalAI version:
LocalAI v4.1.3 (fdc9f7b)
Environment, CPU architecture, OS, and Version:
Linux compute 6.19.11-arch1-1 #1 SMP PREEMPT_DYNAMIC Thu, 02 Apr 2026 23:33:01 +0000 x86_64 GNU/Linux
Describe the bug
Gemma 4 tool responses are not assigned to chat completion response, however it can be seen within the backend traces. Additionally, after the initial attempt, there are 5 additional retries to obtain the response.
As an aside: I think the API traces endpoint is disconnected — see the test results below.
Click to see screenshot of backend showing traces response
What the screenshot shows is that the backend traces record the tool call response correctly; however, the tool call is not found within the chat completion response itself.
Some initial diagnostics
The llm inference response, which originates within the core backend, must be getting some wires crossed somewhere; hence, the retries (const maxRetries = 5) found after the inference attempt.
To Reproduce
gemma4_bug_test.go
package pkg
import (
"bytes"
"encoding/json"
"io"
"net/http"
"testing"
"github.com/sashabaranov/go-openai"
"github.com/sashabaranov/go-openai/jsonschema"
)
// Model under test and the LocalAI endpoint locations used by the helpers below.
var (
	model = "gemma-4-e2b-it"

	endpoint_host             = "http://127.0.0.1:8080"
	endpoint_traces           = "/api/traces"
	endpoint_backend_traces   = "/api/backend-traces"
	endpoint_chat_completions = "/v1/chat/completions"
)
// completionRequest is a fixed chat request designed to trigger a tool call:
// it declares a strict "calculate" function and asks an arithmetic question
// the model should answer by invoking it.
// NOTE(review): the system-style prompt "You are a helpful robot." is sent
// with role "assistant", not "system" — confirm that is intended.
var completionRequest = openai.ChatCompletionRequest{
	Model: model,
	Messages: []openai.ChatCompletionMessage{
		{Content: "You are a helpful robot.", Role: "assistant"},
		{Content: "What is two plus two?", Role: "user"},
	},
	// "auto" lets the model decide whether to call the tool.
	ToolChoice: "auto",
	Tools: []openai.Tool{
		{
			Type: openai.ToolTypeFunction,
			Function: &openai.FunctionDefinition{
				Name:        "calculate",
				Description: "Performs a basic arithmetic operation on two numbers.",
				// Strict schema adherence: arguments must match the JSON schema exactly.
				Strict: true,
				Parameters: jsonschema.Definition{
					Type: jsonschema.Object,
					Properties: map[string]jsonschema.Definition{
						"number_a": {Type: jsonschema.Number},
						"number_b": {Type: jsonschema.Number},
						"operation": {
							Type: jsonschema.String,
							Enum: []string{"add", "subtract", "multiply", "divide"},
						},
					},
					// All three arguments are mandatory.
					Required: []string{"number_a", "number_b", "operation"},
				},
			},
		},
	},
}
// TestGemma4ToolCall clears both trace stores, fires a single tool-call chat
// completion, then dumps every recorded API and backend trace for inspection.
func TestGemma4ToolCall(t *testing.T) {
	clearTraces()
	clearBackendTraces()

	t.Logf("RESPONSE:%+v\n", postResponse(t))

	for _, logs := range [][]map[string]string{getTraces(t), getBackendTraces(t)} {
		t.Log("Count of logs", len(logs))
		for _, entry := range logs {
			t.Log(entry)
		}
	}
}
func poster(m, u string, d *bytes.Buffer) []byte {
req, err := http.NewRequest(m, u, d)
if err != nil {
panic(err)
}
req.Header.Set("accept", "application/json")
req.Header.Set("Content-Type", "application/json")
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return nil
}
respBody, err := io.ReadAll(resp.Body)
if err != nil {
panic(err)
}
defer resp.Body.Close()
return respBody
}
func clearTraces() {
m := "POST"
u := endpoint_host + endpoint_traces + "/clear"
d := bytes.NewBuffer([]byte{})
poster(m, u, d)
}
func clearBackendTraces() {
m := "POST"
u := endpoint_host + endpoint_backend_traces + "/clear"
d := bytes.NewBuffer([]byte{})
poster(m, u, d)
}
// postResponse marshals the shared completionRequest, POSTs it to the chat
// completions endpoint, and decodes the reply. Any marshal or unmarshal
// failure aborts the test (a nil transport result from poster surfaces here
// as an unmarshal error).
func postResponse(t *testing.T) openai.ChatCompletionResponse {
	payload, err := json.Marshal(completionRequest)
	if err != nil {
		t.Fatal(err)
	}
	var out openai.ChatCompletionResponse
	body := poster("POST", endpoint_host+endpoint_chat_completions, bytes.NewBuffer(payload))
	if err := json.Unmarshal(body, &out); err != nil {
		t.Fatal(err)
	}
	return out
}
func getter(t *testing.T, u string) []byte {
resp, err := http.Get(u)
if err != nil {
t.Fatal(err)
}
defer resp.Body.Close()
respBody, err := io.ReadAll(resp.Body)
if err != nil {
t.Fatal(err)
}
return respBody
}
// getTraces fetches and decodes the API trace log.
// BUGFIX: the original discarded the json.Unmarshal error, so any non-array
// response from /api/traces silently decoded to an empty slice — exactly the
// "Count of logs 0" symptom in the report. Fail loudly instead so a broken
// traces endpoint is visible.
func getTraces(t *testing.T) []map[string]string {
	logs := []map[string]string{}
	u := endpoint_host + endpoint_traces
	if err := json.Unmarshal(getter(t, u), &logs); err != nil {
		t.Fatalf("decoding %s response: %v", endpoint_traces, err)
	}
	return logs
}
// getBackendTraces fetches and decodes the backend trace log.
// BUGFIX: the original discarded the json.Unmarshal error, hiding a
// malformed response behind an empty slice; fail loudly instead, matching
// getTraces.
func getBackendTraces(t *testing.T) []map[string]string {
	logs := []map[string]string{}
	u := endpoint_host + endpoint_backend_traces
	if err := json.Unmarshal(getter(t, u), &logs); err != nil {
		t.Fatalf("decoding %s response: %v", endpoint_backend_traces, err)
	}
	return logs
}
Output:
go test gemma4_test.go -v
=== RUN TestGemma4ToolCall
gemma4_test.go:55: RESPONSE:{ID:5c027e82-4c74-43ec-b659-be50e6ee54b9 Object:chat.completion Created:1776088111 Model:gemma-4-e2b-it Choices:[{Index:0 Message:{Role:assistant Content: Refusal: MultiContent:[] Name: ReasoningContent: FunctionCall:<nil> ToolCalls:[] ToolCallID:} FinishReason:stop LogProbs:<nil> ContentFilterResults:{Hate:{Filtered:false Severity:} SelfHarm:{Filtered:false Severity:} Sexual:{Filtered:false Severity:} Violence:{Filtered:false Severity:} JailBreak:{Filtered:false Detected:false} Profanity:{Filtered:false Detected:false}}}] Usage:{PromptTokens:32 CompletionTokens:45 TotalTokens:77 PromptTokensDetails:<nil> CompletionTokensDetails:<nil>} SystemFingerprint: PromptFilterResults:[] ServiceTier: httpHeader:map[]}
gemma4_test.go:58: Count of logs 0
gemma4_test.go:58: Count of logs 6
gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
--- PASS: TestGemma4ToolCall (8.71s)
PASS
ok command-line-arguments 8.713s
Expected behavior
Tool calls should be present in the chat completion response.
It would be nice to have the api traces working again, too.
LocalAI version:
LocalAI v4.1.3 (fdc9f7b)
Environment, CPU architecture, OS, and Version:
Linux compute 6.19.11-arch1-1 #1 SMP PREEMPT_DYNAMIC Thu, 02 Apr 2026 23:33:01 +0000 x86_64 GNU/Linux
Describe the bug
Gemma 4 tool responses are not assigned to chat completion response, however it can be seen within the backend traces. Additionally, after the initial attempt, there are 5 additional retries to obtain the response.
Click to see screenshot of backend showing traces response
Some initial diagnostics
The llm inference response, which originates within the core backend, must be getting some wires crossed somewhere; hence, the retries (`const maxRetries = 5`) found after the inference attempt.
To Reproduce
gemma4_bug_test.go (see above)
Output: (see above)
Expected behavior
Tool calls should be present in the chat completion response.
It would be nice to have the api traces working again, too.