Skip to content

Gemma 4 Tool Response is not returned as expected #9334

@alixanderthegreat

Description

@alixanderthegreat

LocalAI version:
LocalAI v4.1.3 (fdc9f7b)

Environment, CPU architecture, OS, and Version:
Linux compute 6.19.11-arch1-1 #1 SMP PREEMPT_DYNAMIC Thu, 02 Apr 2026 23:33:01 +0000 x86_64 GNU/Linux

Describe the bug
Gemma 4 tool responses are not assigned to chat completion response, however it can be seen within the backend traces. Additionally, after the initial attempt, there are 5 additional retries to obtain the response.

As an aside: I think the API traces endpoint is disconnected — see the test results below.

Click to see screenshot of backend showing traces response Image
What the screenshot shows is that the backend traces record the tool call response correctly; however, it is not found within the chat completion response.

Some initial diagnostics

The llm inference response, which originates within the core backend, must be getting some wires crossed somewhere; hence, the retries (const maxRetries = 5) found after the inference attempt.

To Reproduce
gemma4_bug_test.go

package pkg

import (
	"bytes"
	"encoding/json"
	"io"
	"net/http"
	"testing"
	"time"

	"github.com/sashabaranov/go-openai"
	"github.com/sashabaranov/go-openai/jsonschema"
)

// model is the LocalAI model name the repro targets.
var model = "gemma-4-e2b-it"

// Endpoints of the locally running LocalAI instance used by the repro.
var (
	endpoint_host             = "http://127.0.0.1:8080"
	endpoint_traces           = "/api/traces"
	endpoint_backend_traces   = "/api/backend-traces"
	endpoint_chat_completions = "/v1/chat/completions"
)

// completionRequest is the single chat-completion request used to
// reproduce the bug: it offers one strict "calculate" tool with
// ToolChoice "auto", so a correct response should contain a tool call
// for "what is two plus two?".
var completionRequest = openai.ChatCompletionRequest{
	Model: model,
	Messages: []openai.ChatCompletionMessage{
		{Content: "You are a helpful robot.", Role: "assistant"},
		{Content: "What is two plus two?", Role: "user"},
	},
	ToolChoice: "auto",
	Tools: []openai.Tool{
		{
			Type: openai.ToolTypeFunction,
			Function: &openai.FunctionDefinition{
				Name:        "calculate",
				Description: "Performs a basic arithmetic operation on two numbers.",
				Strict:      true,
				Parameters: jsonschema.Definition{
					Type: jsonschema.Object,
					Properties: map[string]jsonschema.Definition{
						"number_a": {Type: jsonschema.Number},
						"number_b": {Type: jsonschema.Number},
						"operation": {
							Type: jsonschema.String,
							Enum: []string{"add", "subtract", "multiply", "divide"},
						},
					},
					Required: []string{"number_a", "number_b", "operation"},
				},
			},
		},
	},
}

// TestGemma4ToolCall reproduces the missing-tool-call bug: it clears both
// trace stores, fires one chat-completion request, then dumps the API and
// backend traces so the two can be compared against the response.
func TestGemma4ToolCall(t *testing.T) {
	clearTraces()
	clearBackendTraces()
	t.Logf("RESPONSE:%+v\n", postResponse(t))
	for _, logs := range [][]map[string]string{getTraces(t), getBackendTraces(t)} {
		t.Log("Count of logs", len(logs))
		for _, entry := range logs {
			t.Log(entry)
		}
	}
}
// poster sends an HTTP request with a JSON body to u and returns the raw
// response body. It panics on request-construction or read errors
// (acceptable in a test helper) and returns nil when the request itself
// fails, so best-effort callers (trace clearing) keep going.
func poster(m, u string, d *bytes.Buffer) []byte {
	req, err := http.NewRequest(m, u, d)
	if err != nil {
		panic(err)
	}
	req.Header.Set("accept", "application/json")
	req.Header.Set("Content-Type", "application/json")
	// A timeout ensures a hung server cannot stall the whole test run.
	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil
	}
	// Defer the close BEFORE reading: the original deferred it after
	// io.ReadAll, so a read error would panic before the defer was
	// registered and leak the response body.
	defer resp.Body.Close()
	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	return respBody
}
func clearTraces() {
	m := "POST"
	u := endpoint_host + endpoint_traces + "/clear"
	d := bytes.NewBuffer([]byte{})
	poster(m, u, d)
}
func clearBackendTraces() {
	m := "POST"
	u := endpoint_host + endpoint_backend_traces + "/clear"
	d := bytes.NewBuffer([]byte{})
	poster(m, u, d)
}
// postResponse marshals the shared completionRequest, POSTs it to the
// chat-completions endpoint, and decodes the reply into a typed response,
// failing the test on any marshal/unmarshal error.
func postResponse(t *testing.T) openai.ChatCompletionResponse {
	payload, err := json.Marshal(completionRequest)
	if err != nil {
		t.Fatal(err)
	}
	var response openai.ChatCompletionResponse
	body := poster(http.MethodPost, endpoint_host+endpoint_chat_completions, bytes.NewBuffer(payload))
	if err := json.Unmarshal(body, &response); err != nil {
		t.Fatal(err)
	}
	return response
}
// getter performs a GET against u and returns the full response body,
// failing the test on any transport or read error.
func getter(t *testing.T, u string) []byte {
	resp, err := http.Get(u)
	if err != nil {
		t.Fatal(err)
	}
	defer resp.Body.Close()
	body, readErr := io.ReadAll(resp.Body)
	if readErr != nil {
		t.Fatal(readErr)
	}
	return body
}
// getTraces fetches and decodes the API trace log. A decode failure is
// logged with the raw body instead of being silently ignored — the
// original discarded the json.Unmarshal error, which made a broken
// /api/traces endpoint indistinguishable from "0 logs".
func getTraces(t *testing.T) []map[string]string {
	logs := []map[string]string{}
	u := endpoint_host + endpoint_traces
	raw := getter(t, u)
	if err := json.Unmarshal(raw, &logs); err != nil {
		t.Logf("traces endpoint %s returned undecodable body %q: %v", u, raw, err)
	}
	return logs
}
// getBackendTraces fetches and decodes the backend trace log. A decode
// failure is logged with the raw body instead of being silently ignored —
// the original discarded the json.Unmarshal error, hiding endpoint faults
// behind an empty result.
func getBackendTraces(t *testing.T) []map[string]string {
	logs := []map[string]string{}
	u := endpoint_host + endpoint_backend_traces
	raw := getter(t, u)
	if err := json.Unmarshal(raw, &logs); err != nil {
		t.Logf("backend traces endpoint %s returned undecodable body %q: %v", u, raw, err)
	}
	return logs
}

Output:

go test gemma4_test.go -v
=== RUN   TestGemma4ToolCall
    gemma4_test.go:55: RESPONSE:{ID:5c027e82-4c74-43ec-b659-be50e6ee54b9 Object:chat.completion Created:1776088111 Model:gemma-4-e2b-it Choices:[{Index:0 Message:{Role:assistant Content: Refusal: MultiContent:[] Name: ReasoningContent: FunctionCall:<nil> ToolCalls:[] ToolCallID:} FinishReason:stop LogProbs:<nil> ContentFilterResults:{Hate:{Filtered:false Severity:} SelfHarm:{Filtered:false Severity:} Sexual:{Filtered:false Severity:} Violence:{Filtered:false Severity:} JailBreak:{Filtered:false Detected:false} Profanity:{Filtered:false Detected:false}}}] Usage:{PromptTokens:32 CompletionTokens:45 TotalTokens:77 PromptTokensDetails:<nil> CompletionTokensDetails:<nil>} SystemFingerprint: PromptFilterResults:[] ServiceTier: httpHeader:map[]}
    gemma4_test.go:58: Count of logs 0
    gemma4_test.go:58: Count of logs 6
    gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
    gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
    gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
    gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
    gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
    gemma4_test.go:60: map[backend:llama-cpp data: duration: model_name:gemma-4-e2b-it summary:What is two plus two? timestamp:2026-04-13T07:48:31.527688963-06:00 type:llm]
--- PASS: TestGemma4ToolCall (8.71s)
PASS
ok      command-line-arguments  8.713s

Expected behavior
To have tool calls in chat completion response? lol.

It would be nice to have the api traces working again, too.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions