From b17c41ac3c44cc8abd3922e7d46f7bf59925b797 Mon Sep 17 00:00:00 2001
From: Gabriel Cosi <contact@gabrielcosi.dev>
Date: Thu, 25 Jun 2026 11:34:06 +0200
Subject: [PATCH 1/6] Add OpenAI-compatible embedding client to vmcp

The vmcp optimizer only spoke the TEI embedding API. Add an "openai"
provider for OpenAI-compatible services (OpenAI, Azure, Bifrost,
LiteLLM), selected via optimizer.embeddingProvider with the model in
embeddingModel and the key read from OPENAI_API_KEY. Defaults to "tei",
so existing configs are unaffected.

Closes #5305

Signed-off-by: Gabriel Cosi <contact@gabrielcosi.dev>
---
 ...olhive.stacklok.dev_virtualmcpservers.yaml |  54 +++
 ...olhive.stacklok.dev_virtualmcpservers.yaml |  54 +++
 docs/operator/crd-api.md                      |   2 +
 .../virtual-mcps/vmcp_optimizer_openai.yaml   |  87 +++++
 pkg/vmcp/config/config.go                     |  25 ++
 .../internal/similarity/embedding_client.go   |  30 ++
 .../similarity/embedding_client_test.go       |  82 +++++
 .../internal/similarity/openai_client.go      | 171 ++++++++++
 .../internal/similarity/openai_client_test.go | 320 ++++++++++++++++++
 .../internal/similarity/tei_client.go         |  12 -
 pkg/vmcp/optimizer/internal/types/types.go    |  24 ++
 pkg/vmcp/optimizer/optimizer.go               |  39 +++
 pkg/vmcp/optimizer/optimizer_test.go          |  89 +++++
 13 files changed, 977 insertions(+), 12 deletions(-)
 create mode 100644 examples/operator/virtual-mcps/vmcp_optimizer_openai.yaml
 create mode 100644 pkg/vmcp/optimizer/internal/similarity/embedding_client.go
 create mode 100644 pkg/vmcp/optimizer/internal/similarity/embedding_client_test.go
 create mode 100644 pkg/vmcp/optimizer/internal/similarity/openai_client.go
 create mode 100644 pkg/vmcp/optimizer/internal/similarity/openai_client_test.go

diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
index 035a10a98b..ad33615c29 100644
--- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -1792,6 +1792,33 @@ spec:
                       instead of all backend tools directly. This reduces token usage by allowing
                       LLMs to discover relevant tools on demand rather than receiving all tool definitions.
                     properties:
+                      embeddingModel:
+                        description: |-
+                          EmbeddingModel is the model name requested from the embedding service
+                          (e.g. "text-embedding-3-small"). Required when EmbeddingProvider is
+                          "openai". Ignored for the "tei" provider, where the model is fixed by the
+                          running TEI container.
+
+                          The API key for an OpenAI-compatible service is not configured here: it is
+                          read from the OPENAI_API_KEY environment variable so the secret never
+                          lands in a CRD spec or ConfigMap. An empty key omits the Authorization
+                          header, which supports keyless in-cluster gateways.
+                        type: string
+                      embeddingProvider:
+                        default: tei
+                        description: |-
+                          EmbeddingProvider selects the wire protocol used to talk to the embedding
+                          service. "tei" speaks the HuggingFace Text Embeddings Inference API;
+                          "openai" speaks the OpenAI-compatible /embeddings API, which lets the
+                          optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
+                          LiteLLM. Defaults to "tei" when empty.
+
+                          The "openai" provider reads EmbeddingService directly and is not used with
+                          EmbeddingServerRef, which provisions a managed TEI server.
+                        enum:
+                        - tei
+                        - openai
+                        type: string
                       embeddingService:
                         description: |-
                           EmbeddingService is the full base URL of the embedding service endpoint
@@ -4880,6 +4907,33 @@ spec:
                       instead of all backend tools directly. This reduces token usage by allowing
                       LLMs to discover relevant tools on demand rather than receiving all tool definitions.
                     properties:
+                      embeddingModel:
+                        description: |-
+                          EmbeddingModel is the model name requested from the embedding service
+                          (e.g. "text-embedding-3-small"). Required when EmbeddingProvider is
+                          "openai". Ignored for the "tei" provider, where the model is fixed by the
+                          running TEI container.
+
+                          The API key for an OpenAI-compatible service is not configured here: it is
+                          read from the OPENAI_API_KEY environment variable so the secret never
+                          lands in a CRD spec or ConfigMap. An empty key omits the Authorization
+                          header, which supports keyless in-cluster gateways.
+                        type: string
+                      embeddingProvider:
+                        default: tei
+                        description: |-
+                          EmbeddingProvider selects the wire protocol used to talk to the embedding
+                          service. "tei" speaks the HuggingFace Text Embeddings Inference API;
+                          "openai" speaks the OpenAI-compatible /embeddings API, which lets the
+                          optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
+                          LiteLLM. Defaults to "tei" when empty.
+
+                          The "openai" provider reads EmbeddingService directly and is not used with
+                          EmbeddingServerRef, which provisions a managed TEI server.
+                        enum:
+                        - tei
+                        - openai
+                        type: string
                       embeddingService:
                         description: |-
                           EmbeddingService is the full base URL of the embedding service endpoint
diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
index d23ab42957..d4dd4b6203 100644
--- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -1795,6 +1795,33 @@ spec:
                       instead of all backend tools directly. This reduces token usage by allowing
                       LLMs to discover relevant tools on demand rather than receiving all tool definitions.
                     properties:
+                      embeddingModel:
+                        description: |-
+                          EmbeddingModel is the model name requested from the embedding service
+                          (e.g. "text-embedding-3-small"). Required when EmbeddingProvider is
+                          "openai". Ignored for the "tei" provider, where the model is fixed by the
+                          running TEI container.
+
+                          The API key for an OpenAI-compatible service is not configured here: it is
+                          read from the OPENAI_API_KEY environment variable so the secret never
+                          lands in a CRD spec or ConfigMap. An empty key omits the Authorization
+                          header, which supports keyless in-cluster gateways.
+                        type: string
+                      embeddingProvider:
+                        default: tei
+                        description: |-
+                          EmbeddingProvider selects the wire protocol used to talk to the embedding
+                          service. "tei" speaks the HuggingFace Text Embeddings Inference API;
+                          "openai" speaks the OpenAI-compatible /embeddings API, which lets the
+                          optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
+                          LiteLLM. Defaults to "tei" when empty.
+
+                          The "openai" provider reads EmbeddingService directly and is not used with
+                          EmbeddingServerRef, which provisions a managed TEI server.
+                        enum:
+                        - tei
+                        - openai
+                        type: string
                       embeddingService:
                         description: |-
                           EmbeddingService is the full base URL of the embedding service endpoint
@@ -4883,6 +4910,33 @@ spec:
                       instead of all backend tools directly. This reduces token usage by allowing
                       LLMs to discover relevant tools on demand rather than receiving all tool definitions.
                     properties:
+                      embeddingModel:
+                        description: |-
+                          EmbeddingModel is the model name requested from the embedding service
+                          (e.g. "text-embedding-3-small"). Required when EmbeddingProvider is
+                          "openai". Ignored for the "tei" provider, where the model is fixed by the
+                          running TEI container.
+
+                          The API key for an OpenAI-compatible service is not configured here: it is
+                          read from the OPENAI_API_KEY environment variable so the secret never
+                          lands in a CRD spec or ConfigMap. An empty key omits the Authorization
+                          header, which supports keyless in-cluster gateways.
+                        type: string
+                      embeddingProvider:
+                        default: tei
+                        description: |-
+                          EmbeddingProvider selects the wire protocol used to talk to the embedding
+                          service. "tei" speaks the HuggingFace Text Embeddings Inference API;
+                          "openai" speaks the OpenAI-compatible /embeddings API, which lets the
+                          optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
+                          LiteLLM. Defaults to "tei" when empty.
+
+                          The "openai" provider reads EmbeddingService directly and is not used with
+                          EmbeddingServerRef, which provisions a managed TEI server.
+                        enum:
+                        - tei
+                        - openai
+                        type: string
                       embeddingService:
                         description: |-
                           EmbeddingService is the full base URL of the embedding service endpoint
diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md
index a1cc135267..2e34a58d6d 100644
--- a/docs/operator/crd-api.md
+++ b/docs/operator/crd-api.md
@@ -502,6 +502,8 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `embeddingService` _string_ | EmbeddingService is the full base URL of the embedding service endpoint<br />(e.g., http://my-embedding.default.svc.cluster.local:8080) for semantic<br />tool discovery.<br />In a Kubernetes environment, it is more convenient to use the<br />VirtualMCPServerSpec.EmbeddingServerRef field instead of setting this<br />directly. EmbeddingServerRef references an EmbeddingServer CRD by name,<br />and the operator automatically resolves the referenced resource's<br />Status.URL to populate this field. This provides managed lifecycle<br />(the operator watches the EmbeddingServer for readiness and URL changes)<br />and avoids hardcoding service URLs in the config. If both<br />EmbeddingServerRef and this field are set, EmbeddingServerRef takes<br />precedence and this value is overridden with a warning. |  | Optional: \{\} <br /> |
 | `embeddingServiceTimeout` _[vmcp.config.Duration](#vmcpconfigduration)_ | EmbeddingServiceTimeout is the HTTP request timeout for calls to the embedding service.<br />Defaults to 30s if not specified. | 30s | Pattern: `^([0-9]+(\.[0-9]+)?(ns\|us\|µs\|ms\|s\|m\|h))+$` <br />Type: string <br />Optional: \{\} <br /> |
+| `embeddingProvider` _string_ | EmbeddingProvider selects the wire protocol used to talk to the embedding<br />service. "tei" speaks the HuggingFace Text Embeddings Inference API;<br />"openai" speaks the OpenAI-compatible /embeddings API, which lets the<br />optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or<br />LiteLLM. Defaults to "tei" when empty.<br />The "openai" provider reads EmbeddingService directly and is not used with<br />EmbeddingServerRef, which provisions a managed TEI server. | tei | Enum: [tei openai] <br />Optional: \{\} <br /> |
+| `embeddingModel` _string_ | EmbeddingModel is the model name requested from the embedding service<br />(e.g. "text-embedding-3-small"). Required when EmbeddingProvider is<br />"openai". Ignored for the "tei" provider, where the model is fixed by the<br />running TEI container.<br />The API key for an OpenAI-compatible service is not configured here: it is<br />read from the OPENAI_API_KEY environment variable so the secret never<br />lands in a CRD spec or ConfigMap. An empty key omits the Authorization<br />header, which supports keyless in-cluster gateways. |  | Optional: \{\} <br /> |
 | `maxToolsToReturn` _integer_ | MaxToolsToReturn is the maximum number of tool results returned by a search query.<br />Defaults to 8 if not specified or zero. |  | Maximum: 50 <br />Minimum: 1 <br />Optional: \{\} <br /> |
 | `hybridSearchSemanticRatio` _string_ | HybridSearchSemanticRatio controls the balance between semantic (meaning-based)<br />and keyword search results. 0.0 = all keyword, 1.0 = all semantic.<br />Defaults to "0.5" if not specified or empty.<br />Serialized as a string because CRDs do not support float types portably. |  | Pattern: `^([0-9]*[.])?[0-9]+$` <br />Optional: \{\} <br /> |
 | `semanticDistanceThreshold` _string_ | SemanticDistanceThreshold is the maximum distance for semantic search results.<br />Results exceeding this threshold are filtered out from semantic search.<br />This threshold does not apply to keyword search.<br />Range: 0 = identical, 2 = completely unrelated.<br />Defaults to "1.0" if not specified or empty.<br />Serialized as a string because CRDs do not support float types portably. |  | Pattern: `^([0-9]*[.])?[0-9]+$` <br />Optional: \{\} <br /> |
diff --git a/examples/operator/virtual-mcps/vmcp_optimizer_openai.yaml b/examples/operator/virtual-mcps/vmcp_optimizer_openai.yaml
new file mode 100644
index 0000000000..5e6436a3ae
--- /dev/null
+++ b/examples/operator/virtual-mcps/vmcp_optimizer_openai.yaml
@@ -0,0 +1,87 @@
+# Example: VirtualMCPServer optimizer using an OpenAI-compatible embedding API
+#
+# Instead of a managed TEI EmbeddingServer, this points the optimizer at an
+# external service that speaks the OpenAI /embeddings API — OpenAI, Azure
+# OpenAI, or a gateway such as Bifrost or LiteLLM. There is no EmbeddingServer
+# or embeddingServerRef: the endpoint is reached directly via embeddingService.
+#
+# The API key is read from the OPENAI_API_KEY environment variable so it never
+# lands in the CRD spec or the generated ConfigMap. Inject it into the vmcp
+# container from a Secret via podTemplateSpec (omit it for keyless gateways).
+#
+# Note: unlike the TEI backend, the OpenAI API does not silently truncate
+# over-long inputs; a tool description exceeding the model's context window
+# returns an error rather than being truncated.
+#
+# Usage:
+#   kubectl apply -f vmcp_optimizer_openai.yaml
+
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPGroup
+metadata:
+  name: optimizer-services
+  namespace: default
+spec:
+  description: Backend services for an OpenAI-embedding optimizer
+
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPServer
+metadata:
+  name: fetch
+  namespace: default
+spec:
+  groupRef:
+    name: optimizer-services
+  image: ghcr.io/stackloklabs/gofetch/server
+  transport: streamable-http
+  proxyPort: 8080
+  mcpPort: 8080
+
+---
+# Secret holding the embedding API key. Omit for keyless in-cluster gateways.
+apiVersion: v1
+kind: Secret
+metadata:
+  name: embedding-api-key
+  namespace: default
+type: Opaque
+stringData:
+  apiKey: "sk-replace-me"
+
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: VirtualMCPServer
+metadata:
+  name: optimizer-vmcp
+  namespace: default
+spec:
+  groupRef:
+    name: optimizer-services
+  config:
+    optimizer:
+      # Speak the OpenAI /embeddings API instead of TEI.
+      embeddingProvider: openai
+      # Base URL of the OpenAI-compatible service; "/embeddings" is appended.
+      embeddingService: http://bifrost.ai.svc.cluster.local:8080/v1
+      # Model requested from the service (required for the openai provider).
+      embeddingModel: text-embedding-3-small
+      embeddingServiceTimeout: 15s
+
+  incomingAuth:
+    type: anonymous
+  outgoingAuth:
+    source: discovered
+
+  # Inject the API key into the vmcp container as OPENAI_API_KEY.
+  podTemplateSpec:
+    spec:
+      containers:
+        - name: vmcp
+          env:
+            - name: OPENAI_API_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: embedding-api-key
+                  key: apiKey
diff --git a/pkg/vmcp/config/config.go b/pkg/vmcp/config/config.go
index de91d6cd49..033a6e0d0b 100644
--- a/pkg/vmcp/config/config.go
+++ b/pkg/vmcp/config/config.go
@@ -933,6 +933,31 @@ type OptimizerConfig struct {
 	// +optional
 	EmbeddingServiceTimeout Duration `json:"embeddingServiceTimeout,omitempty" yaml:"embeddingServiceTimeout,omitempty"`
 
+	// EmbeddingProvider selects the wire protocol used to talk to the embedding
+	// service. "tei" speaks the HuggingFace Text Embeddings Inference API;
+	// "openai" speaks the OpenAI-compatible /embeddings API, which lets the
+	// optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
+	// LiteLLM. Defaults to "tei" when empty.
+	//
+	// The "openai" provider reads EmbeddingService directly and is not used with
+	// EmbeddingServerRef, which provisions a managed TEI server.
+	// +kubebuilder:validation:Enum=tei;openai
+	// +kubebuilder:default="tei"
+	// +optional
+	EmbeddingProvider string `json:"embeddingProvider,omitempty" yaml:"embeddingProvider,omitempty"`
+
+	// EmbeddingModel is the model name requested from the embedding service
+	// (e.g. "text-embedding-3-small"). Required when EmbeddingProvider is
+	// "openai". Ignored for the "tei" provider, where the model is fixed by the
+	// running TEI container.
+	//
+	// The API key for an OpenAI-compatible service is not configured here: it is
+	// read from the OPENAI_API_KEY environment variable so the secret never
+	// lands in a CRD spec or ConfigMap. An empty key omits the Authorization
+	// header, which supports keyless in-cluster gateways.
+	// +optional
+	EmbeddingModel string `json:"embeddingModel,omitempty" yaml:"embeddingModel,omitempty"`
+
 	// MaxToolsToReturn is the maximum number of tool results returned by a search query.
 	// Defaults to 8 if not specified or zero.
 	// +kubebuilder:validation:Minimum=1
diff --git a/pkg/vmcp/optimizer/internal/similarity/embedding_client.go b/pkg/vmcp/optimizer/internal/similarity/embedding_client.go
new file mode 100644
index 0000000000..1d94d206de
--- /dev/null
+++ b/pkg/vmcp/optimizer/internal/similarity/embedding_client.go
@@ -0,0 +1,30 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package similarity
+
+import (
+	"fmt"
+
+	"github.com/stacklok/toolhive/pkg/vmcp/optimizer/internal/types"
+)
+
+// NewEmbeddingClient builds the EmbeddingClient for the backend named by
+// cfg.EmbeddingProvider, treating an empty provider as TEI. A nil cfg or an
+// empty EmbeddingService returns (nil, nil): no client and no error, meaning
+// semantic search will be disabled. Any other provider name is an error.
+func NewEmbeddingClient(cfg *types.OptimizerConfig) (types.EmbeddingClient, error) {
+	if cfg == nil || cfg.EmbeddingService == "" {
+		return nil, nil
+	}
+	endpoint, timeout := cfg.EmbeddingService, cfg.EmbeddingServiceTimeout
+	switch provider := cfg.EmbeddingProvider; provider {
+	case types.EmbeddingProviderOpenAI:
+		return newOpenAIClient(endpoint, cfg.EmbeddingModel, cfg.EmbeddingAPIKey, timeout)
+	case "", types.EmbeddingProviderTEI:
+		return newTEIClient(endpoint, timeout)
+	default:
+		return nil, fmt.Errorf("unsupported embedding provider %q (supported: %q, %q)",
+			provider, types.EmbeddingProviderTEI, types.EmbeddingProviderOpenAI)
+	}
+}
diff --git a/pkg/vmcp/optimizer/internal/similarity/embedding_client_test.go b/pkg/vmcp/optimizer/internal/similarity/embedding_client_test.go
new file mode 100644
index 0000000000..f38a5302be
--- /dev/null
+++ b/pkg/vmcp/optimizer/internal/similarity/embedding_client_test.go
@@ -0,0 +1,82 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package similarity
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/stacklok/toolhive/pkg/vmcp/optimizer/internal/types"
+)
+
+func TestNewEmbeddingClient(t *testing.T) {
+	t.Parallel()
+
+	// TEI selection queries the /info endpoint on construction, so the TEI cases
+	// need this stub: it answers infoPath and returns 404 for any other path.
+	teiInfo := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == infoPath {
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = w.Write([]byte(`{"max_client_batch_size": 16}`)) // write error ignored: test stub
+			return
+		}
+		w.WriteHeader(http.StatusNotFound)
+	}))
+	t.Cleanup(teiInfo.Close)
+
+	t.Run("nil config disables semantic search", func(t *testing.T) {
+		t.Parallel()
+		client, err := NewEmbeddingClient(nil)
+		require.NoError(t, err)
+		require.Nil(t, client)
+	})
+
+	t.Run("empty service disables semantic search", func(t *testing.T) {
+		t.Parallel()
+		client, err := NewEmbeddingClient(&types.OptimizerConfig{EmbeddingProvider: types.EmbeddingProviderOpenAI})
+		require.NoError(t, err)
+		require.Nil(t, client)
+	})
+
+	t.Run("empty provider defaults to TEI", func(t *testing.T) {
+		t.Parallel()
+		client, err := NewEmbeddingClient(&types.OptimizerConfig{EmbeddingService: teiInfo.URL})
+		require.NoError(t, err)
+		require.IsType(t, &teiClient{}, client)
+	})
+
+	t.Run("tei provider", func(t *testing.T) {
+		t.Parallel()
+		client, err := NewEmbeddingClient(&types.OptimizerConfig{
+			EmbeddingService:  teiInfo.URL,
+			EmbeddingProvider: types.EmbeddingProviderTEI,
+		})
+		require.NoError(t, err)
+		require.IsType(t, &teiClient{}, client)
+	})
+
+	t.Run("openai provider", func(t *testing.T) {
+		t.Parallel()
+		client, err := NewEmbeddingClient(&types.OptimizerConfig{
+			EmbeddingService:  "http://embeddings:8080/v1", // not contacted: the OpenAI constructor sends no request
+			EmbeddingProvider: types.EmbeddingProviderOpenAI,
+			EmbeddingModel:    "text-embedding-3-small",
+		})
+		require.NoError(t, err)
+		require.IsType(t, &openAIClient{}, client)
+	})
+
+	t.Run("unsupported provider returns error", func(t *testing.T) {
+		t.Parallel()
+		client, err := NewEmbeddingClient(&types.OptimizerConfig{
+			EmbeddingService:  "http://embeddings:8080",
+			EmbeddingProvider: "cohere",
+		})
+		require.ErrorContains(t, err, "unsupported embedding provider")
+		require.Nil(t, client)
+	})
+}
diff --git a/pkg/vmcp/optimizer/internal/similarity/openai_client.go b/pkg/vmcp/optimizer/internal/similarity/openai_client.go
new file mode 100644
index 0000000000..e3dd1b1561
--- /dev/null
+++ b/pkg/vmcp/optimizer/internal/similarity/openai_client.go
@@ -0,0 +1,171 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package similarity
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"time"
+)
+
+const (
+	embeddingsPath = "/embeddings" // appended to the client's base URL for every request
+
+	// openAIMaxBatchSize is the OpenAI cap on inputs per /embeddings request;
+	// compatible gateways generally honor the same limit.
+	openAIMaxBatchSize = 2048
+)
+
+// openAIClient implements types.EmbeddingClient against an OpenAI-compatible
+// /embeddings API (OpenAI, Azure OpenAI, or a gateway such as Bifrost or LiteLLM).
+type openAIClient struct {
+	baseURL      string       // embeddingsPath is appended to form the request URL
+	apiKey       string       // sent as a Bearer token; empty omits the header
+	model        string       // model name sent in every request body
+	httpClient   *http.Client // built with the configured request timeout
+	maxBatchSize int          // most inputs sent in a single request
+}
+
+// newOpenAIClient creates a client that POSTs to baseURL+"/embeddings" using the
+// given model. Trailing slashes on baseURL are dropped so a URL ending in "/"
+// does not yield "//embeddings". A non-empty apiKey is sent as a Bearer token;
+// an empty one omits the Authorization header. Zero timeout uses defaultTimeout.
+func newOpenAIClient(baseURL, model, apiKey string, timeout time.Duration) (*openAIClient, error) {
+	for len(baseURL) > 0 && baseURL[len(baseURL)-1] == '/' {
+		baseURL = baseURL[:len(baseURL)-1]
+	}
+	if baseURL == "" {
+		return nil, fmt.Errorf("OpenAI embedding base URL is required")
+	}
+	if model == "" {
+		return nil, fmt.Errorf("OpenAI embedding model is required")
+	}
+	if timeout == 0 {
+		timeout = defaultTimeout
+	}
+
+	slog.Debug("OpenAI embedding client created", "base_url", baseURL, "model", model, "timeout", timeout)
+	return &openAIClient{
+		baseURL:      baseURL,
+		apiKey:       apiKey,
+		model:        model,
+		httpClient:   &http.Client{Timeout: timeout},
+		maxBatchSize: openAIMaxBatchSize,
+	}, nil
+}
+
+type openAIEmbedRequest struct {
+	Model string   `json:"model"` // model name the client was created with
+	Input []string `json:"input"` // texts to embed in this request
+	// EncodingFormat pins the response to float arrays, since we decode into
+	// []float32; without it a compatible server may return base64.
+	EncodingFormat string `json:"encoding_format"`
+}
+
+type openAIEmbedResponse struct {
+	Data []openAIEmbedding `json:"data"` // expected to hold one entry per input, in any order
+}
+
+type openAIEmbedding struct {
+	Index     int       `json:"index"`     // position of the matching input in the request
+	Embedding []float32 `json:"embedding"` // vector for that input
+}
+
+// Embed embeds a single text by sending it through EmbedBatch as a one-item batch.
+func (c *openAIClient) Embed(ctx context.Context, text string) ([]float32, error) {
+	vectors, err := c.EmbedBatch(ctx, []string{text})
+	switch {
+	case err != nil:
+		return nil, err
+	case len(vectors) == 0:
+		return nil, fmt.Errorf("OpenAI returned empty response for single input")
+	}
+	return vectors[0], nil
+}
+
+// EmbedBatch embeds every text in order, splitting the input into requests of
+// at most maxBatchSize entries. An empty input yields (nil, nil).
+func (c *openAIClient) EmbedBatch(ctx context.Context, texts []string) ([][]float32, error) {
+	total := len(texts)
+	if total == 0 {
+		return nil, nil
+	}
+
+	vectors := make([][]float32, 0, total)
+	chunkCount := 0
+	for offset := 0; offset < total; offset += c.maxBatchSize {
+		limit := min(offset+c.maxBatchSize, total)
+		chunk, err := c.embedChunk(ctx, texts[offset:limit])
+		if err != nil {
+			return nil, err
+		}
+		vectors = append(vectors, chunk...)
+		chunkCount++
+	}
+
+	slog.Debug("OpenAI embedding batch completed",
+		"inputs", total, "chunks", chunkCount, "dimensions", len(vectors[0]))
+	return vectors, nil
+}
+
+// embedChunk sends one batch to the /embeddings endpoint and returns the
+// embeddings ordered to match texts; any malformed response is an error.
+func (c *openAIClient) embedChunk(ctx context.Context, texts []string) ([][]float32, error) {
+	bodyBytes, err := json.Marshal(openAIEmbedRequest{Model: c.model, Input: texts, EncodingFormat: "float"})
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal OpenAI request: %w", err)
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+embeddingsPath, bytes.NewReader(bodyBytes))
+	if err != nil {
+		return nil, fmt.Errorf("failed to create OpenAI request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	if c.apiKey != "" {
+		req.Header.Set("Authorization", "Bearer "+c.apiKey)
+	}
+
+	resp, err := c.httpClient.Do(req) // #nosec G704 -- URL is built from the configured embedding base URL
+	if err != nil {
+		return nil, fmt.Errorf("OpenAI request failed: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		return nil, fmt.Errorf("OpenAI returned status %d: %s", resp.StatusCode, string(body))
+	}
+	var embedResp openAIEmbedResponse
+	if err := json.NewDecoder(resp.Body).Decode(&embedResp); err != nil {
+		return nil, fmt.Errorf("failed to decode OpenAI response: %w", err)
+	}
+	if len(embedResp.Data) != len(texts) {
+		return nil, fmt.Errorf("OpenAI returned %d embeddings for %d inputs", len(embedResp.Data), len(texts))
+	}
+
+	// Place each embedding at its reported index (entries may arrive out of
+	// order) and reject repeats, which would otherwise leave another slot nil.
+	embeddings := make([][]float32, len(texts))
+	seen := make([]bool, len(texts))
+	for _, d := range embedResp.Data {
+		if d.Index < 0 || d.Index >= len(texts) {
+			return nil, fmt.Errorf("OpenAI returned out-of-range embedding index %d for %d inputs", d.Index, len(texts))
+		}
+		if seen[d.Index] {
+			return nil, fmt.Errorf("OpenAI returned duplicate embedding index %d for %d inputs", d.Index, len(texts))
+		}
+		seen[d.Index] = true
+		embeddings[d.Index] = d.Embedding
+	}
+	return embeddings, nil
+}
+
+// Close does nothing and always returns nil.
+func (*openAIClient) Close() error {
+	return nil
+}
diff --git a/pkg/vmcp/optimizer/internal/similarity/openai_client_test.go b/pkg/vmcp/optimizer/internal/similarity/openai_client_test.go
new file mode 100644
index 0000000000..af7da66bc1
--- /dev/null
+++ b/pkg/vmcp/optimizer/internal/similarity/openai_client_test.go
@@ -0,0 +1,320 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package similarity
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+func Test_newOpenAIClient(t *testing.T) {
+	t.Parallel()
+
+	t.Run("empty URL returns error", func(t *testing.T) {
+		t.Parallel()
+		client, err := newOpenAIClient("", "text-embedding-3-small", "key", 0)
+		require.ErrorContains(t, err, "OpenAI embedding base URL is required")
+		require.Nil(t, client)
+	})
+
+	t.Run("empty model returns error", func(t *testing.T) {
+		t.Parallel()
+		client, err := newOpenAIClient("http://embeddings:8080/v1", "", "key", 0)
+		require.ErrorContains(t, err, "OpenAI embedding model is required")
+		require.Nil(t, client)
+	})
+
+	t.Run("valid args create client with default batch size", func(t *testing.T) {
+		t.Parallel()
+		client, err := newOpenAIClient("http://embeddings:8080/v1", "text-embedding-3-small", "key", 0)
+		require.NoError(t, err)
+		require.NotNil(t, client)
+		require.Equal(t, openAIMaxBatchSize, client.maxBatchSize)
+		require.Equal(t, defaultTimeout, client.httpClient.Timeout)
+	})
+
+	t.Run("custom timeout", func(t *testing.T) {
+		t.Parallel()
+		client, err := newOpenAIClient("http://embeddings:8080/v1", "text-embedding-3-small", "key", 5*time.Second)
+		require.NoError(t, err)
+		require.NotNil(t, client)
+		require.Equal(t, 5*time.Second, client.httpClient.Timeout)
+	})
+}
+
+// TestOpenAIClient_Embed verifies the request sent for one text and the vector returned.
+func TestOpenAIClient_Embed(t *testing.T) {
+	t.Parallel()
+
+	want := []float32{0.1, 0.2, 0.3}
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		require.Equal(t, http.MethodPost, r.Method)
+		require.Equal(t, embeddingsPath, r.URL.Path)
+		require.Equal(t, "application/json", r.Header.Get("Content-Type"))
+		require.Equal(t, "Bearer test-key", r.Header.Get("Authorization"))
+
+		var body openAIEmbedRequest
+		require.NoError(t, json.NewDecoder(r.Body).Decode(&body))
+		require.Equal(t, "text-embedding-3-small", body.Model)
+		require.Equal(t, "float", body.EncodingFormat)
+		require.Len(t, body.Input, 1)
+		require.Equal(t, "hello world", body.Input[0])
+
+		writeOpenAIEmbeddings(t, w, [][]float32{want})
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	t.Cleanup(srv.Close)
+
+	got, err := newTestOpenAIClient(t, srv.URL, "test-key").Embed(context.Background(), "hello world")
+	require.NoError(t, err)
+	require.Equal(t, want, got)
+}
+
+func TestOpenAIClient_EmbedBatch(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name       string
+		texts      []string
+		handler    http.HandlerFunc
+		wantErr    string
+		wantLen    int
+		wantResult [][]float32
+	}{
+		{
+			name:  "empty input",
+			texts: nil,
+		},
+		{
+			name:  "single input",
+			texts: []string{"hello"},
+			handler: func(w http.ResponseWriter, _ *http.Request) {
+				writeOpenAIEmbeddings(t, w, [][]float32{{0.1, 0.2}})
+			},
+			wantLen:    1,
+			wantResult: [][]float32{{0.1, 0.2}},
+		},
+		{
+			name:  "multiple inputs",
+			texts: []string{"hello", "world"},
+			handler: func(w http.ResponseWriter, _ *http.Request) {
+				writeOpenAIEmbeddings(t, w, [][]float32{{0.1, 0.2}, {0.3, 0.4}})
+			},
+			wantLen:    2,
+			wantResult: [][]float32{{0.1, 0.2}, {0.3, 0.4}},
+		},
+		{
+			name:  "out-of-order data is reordered by index",
+			texts: []string{"hello", "world"},
+			handler: func(w http.ResponseWriter, _ *http.Request) {
+				w.Header().Set("Content-Type", "application/json")
+				_ = json.NewEncoder(w).Encode(openAIEmbedResponse{Data: []openAIEmbedding{
+					{Index: 1, Embedding: []float32{0.3, 0.4}},
+					{Index: 0, Embedding: []float32{0.1, 0.2}},
+				}})
+			},
+			wantLen:    2,
+			wantResult: [][]float32{{0.1, 0.2}, {0.3, 0.4}},
+		},
+		{
+			name:  "server error",
+			texts: []string{"hello"},
+			handler: func(w http.ResponseWriter, _ *http.Request) {
+				w.WriteHeader(http.StatusInternalServerError)
+				_, _ = w.Write([]byte("internal error"))
+			},
+			wantErr: "OpenAI returned status 500",
+		},
+		{
+			name:  "mismatched count",
+			texts: []string{"hello", "world"},
+			handler: func(w http.ResponseWriter, _ *http.Request) {
+				writeOpenAIEmbeddings(t, w, [][]float32{{0.1, 0.2}})
+			},
+			wantErr: "OpenAI returned 1 embeddings for 2 inputs",
+		},
+		{
+			name:  "out-of-range index",
+			texts: []string{"hello"},
+			handler: func(w http.ResponseWriter, _ *http.Request) {
+				w.Header().Set("Content-Type", "application/json")
+				_ = json.NewEncoder(w).Encode(openAIEmbedResponse{Data: []openAIEmbedding{
+					{Index: 5, Embedding: []float32{0.1, 0.2}},
+				}})
+			},
+			wantErr: "out-of-range embedding index 5",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			var srv *httptest.Server
+			if tt.handler != nil {
+				srv = httptest.NewServer(tt.handler)
+				t.Cleanup(srv.Close)
+			}
+
+			baseURL := "http://localhost:0"
+			if srv != nil {
+				baseURL = srv.URL
+			}
+
+			client := newTestOpenAIClient(t, baseURL, "test-key")
+
+			results, err := client.EmbedBatch(context.Background(), tt.texts)
+			if tt.wantErr != "" {
+				require.ErrorContains(t, err, tt.wantErr)
+				return
+			}
+
+			require.NoError(t, err)
+			if tt.wantLen > 0 {
+				require.Len(t, results, tt.wantLen)
+				require.Equal(t, tt.wantResult, results)
+			} else {
+				require.Nil(t, results)
+			}
+		})
+	}
+}
+
+// TestOpenAIClient_EmbedBatch_Chunking counts the requests EmbedBatch makes
+// for a given batch limit and checks that no request exceeds that limit.
+func TestOpenAIClient_EmbedBatch_Chunking(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name                                string
+		maxBatchSize, numInputs, wantChunks int
+	}{
+		{name: "inputs fit in single batch", maxBatchSize: 5, numInputs: 3, wantChunks: 1},
+		{name: "inputs exactly fill one batch", maxBatchSize: 4, numInputs: 4, wantChunks: 1},
+		{name: "inputs split into two batches", maxBatchSize: 3, numInputs: 5, wantChunks: 2},
+		{name: "inputs split into many batches", maxBatchSize: 2, numInputs: 7, wantChunks: 4},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			var requests int
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				var body openAIEmbedRequest
+				require.NoError(t, json.NewDecoder(r.Body).Decode(&body))
+				require.LessOrEqual(t, len(body.Input), tc.maxBatchSize,
+					"chunk size should not exceed maxBatchSize")
+				requests++
+
+				vecs := make([][]float32, 0, len(body.Input))
+				for i := range body.Input {
+					vecs = append(vecs, []float32{float32(i) * 0.1})
+				}
+				writeOpenAIEmbeddings(t, w, vecs)
+			}))
+			t.Cleanup(srv.Close)
+
+			texts := make([]string, 0, tc.numInputs)
+			for i := 0; i < tc.numInputs; i++ {
+				texts = append(texts, fmt.Sprintf("text-%d", i))
+			}
+
+			client := newTestOpenAIClientWithBatch(t, srv.URL, tc.maxBatchSize)
+			got, err := client.EmbedBatch(context.Background(), texts)
+			require.NoError(t, err)
+			require.Len(t, got, tc.numInputs)
+			require.Equal(t, tc.wantChunks, requests)
+		})
+	}
+}
+
+// TestOpenAIClient_EmbedBatch_ChunkErrorStopsEarly fails the second of three
+// chunk requests and checks that no third request is sent.
+func TestOpenAIClient_EmbedBatch_ChunkErrorStopsEarly(t *testing.T) {
+	t.Parallel()
+
+	var requests int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		requests++
+		if requests != 2 {
+			writeOpenAIEmbeddings(t, w, [][]float32{{0.1}, {0.2}})
+			return
+		}
+		w.WriteHeader(http.StatusInternalServerError)
+		_, _ = w.Write([]byte("server overloaded"))
+	}))
+	t.Cleanup(srv.Close)
+
+	// Six texts at a batch size of two make three chunks.
+	texts := []string{"text-0", "text-1", "text-2", "text-3", "text-4", "text-5"}
+	client := newTestOpenAIClientWithBatch(t, srv.URL, 2)
+
+	_, err := client.EmbedBatch(context.Background(), texts)
+	require.ErrorContains(t, err, "OpenAI returned status 500")
+	require.Equal(t, 2, requests, "should stop after the failing chunk")
+}
+
+// TestOpenAIClient_OmitsAuthHeaderWhenKeyless checks that a client with an
+// empty API key sends no Authorization header.
+func TestOpenAIClient_OmitsAuthHeaderWhenKeyless(t *testing.T) {
+	t.Parallel()
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		require.Empty(t, r.Header.Get("Authorization"))
+		writeOpenAIEmbeddings(t, w, [][]float32{{0.1}})
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	t.Cleanup(srv.Close)
+
+	_, err := newTestOpenAIClient(t, srv.URL, "").Embed(context.Background(), "hello")
+	require.NoError(t, err)
+}
+
+// TestOpenAIClient_Close checks that Close reports no error.
+func TestOpenAIClient_Close(t *testing.T) {
+	t.Parallel()
+
+	require.NoError(t, newTestOpenAIClient(t, "http://my-embedding:8080/v1", "key").Close())
+}
+
+// writeOpenAIEmbeddings writes embeddings to w as a JSON /embeddings response;
+// entry i carries Index i. An encoding failure fails the test via require.
+func writeOpenAIEmbeddings(t *testing.T, w http.ResponseWriter, embeddings [][]float32) {
+	t.Helper()
+	data := make([]openAIEmbedding, 0, len(embeddings))
+	for idx, vec := range embeddings {
+		data = append(data, openAIEmbedding{Index: idx, Embedding: vec})
+	}
+	w.Header().Set("Content-Type", "application/json")
+	require.NoError(t, json.NewEncoder(w).Encode(openAIEmbedResponse{Data: data}))
+}
+
+// newTestOpenAIClient returns a test client for baseURL that sends apiKey.
+// The batch size of 1000 keeps the requests in these tests to a single chunk.
+func newTestOpenAIClient(t *testing.T, baseURL, apiKey string) *openAIClient {
+	t.Helper()
+	c := newTestOpenAIClientWithBatch(t, baseURL, 1000)
+	c.apiKey = apiKey // replace the helper's fixed key
+	return c
+}
+
+// newTestOpenAIClientWithBatch builds an openAIClient directly, bypassing the
+// constructor, with the given batch limit and the fixed key "test-key".
+func newTestOpenAIClientWithBatch(t *testing.T, baseURL string, maxBatchSize int) *openAIClient {
+	t.Helper()
+	c := new(openAIClient)
+	c.baseURL = baseURL
+	c.apiKey = "test-key"
+	c.model = "text-embedding-3-small"
+	c.httpClient = &http.Client{Timeout: defaultTimeout}
+	c.maxBatchSize = maxBatchSize
+	return c
+}
diff --git a/pkg/vmcp/optimizer/internal/similarity/tei_client.go b/pkg/vmcp/optimizer/internal/similarity/tei_client.go
index fe58ba4fd1..035f3e0f9d 100644
--- a/pkg/vmcp/optimizer/internal/similarity/tei_client.go
+++ b/pkg/vmcp/optimizer/internal/similarity/tei_client.go
@@ -12,8 +12,6 @@ import (
 	"log/slog"
 	"net/http"
 	"time"
-
-	"github.com/stacklok/toolhive/pkg/vmcp/optimizer/internal/types"
 )
 
 const (
@@ -38,16 +36,6 @@ type teiClient struct {
 	maxBatchSize int
 }
 
-// NewEmbeddingClient creates an EmbeddingClient from the given optimizer
-// configuration. It returns (nil, nil) if cfg is nil or no embedding service
-// URL is configured, meaning semantic search will be disabled.
-func NewEmbeddingClient(cfg *types.OptimizerConfig) (types.EmbeddingClient, error) {
-	if cfg == nil || cfg.EmbeddingService == "" {
-		return nil, nil
-	}
-	return newTEIClient(cfg.EmbeddingService, cfg.EmbeddingServiceTimeout)
-}
-
 // newTEIClient creates a new TEI embedding client that calls the specified endpoint.
 // It queries the TEI /info endpoint to discover the server's maximum batch size.
 func newTEIClient(baseURL string, timeout time.Duration) (*teiClient, error) {
diff --git a/pkg/vmcp/optimizer/internal/types/types.go b/pkg/vmcp/optimizer/internal/types/types.go
index dac8beaa1e..1eb90f424a 100644
--- a/pkg/vmcp/optimizer/internal/types/types.go
+++ b/pkg/vmcp/optimizer/internal/types/types.go
@@ -37,6 +37,16 @@ type ToolStore interface {
 	Close() error
 }
 
+// Embedding provider identifiers name the wire protocol used to talk to the
+// embedding service. They match config.OptimizerConfig.EmbeddingProvider.
+const (
+	// EmbeddingProviderTEI selects the HuggingFace Text Embeddings Inference API.
+	EmbeddingProviderTEI = "tei"
+
+	// EmbeddingProviderOpenAI selects the OpenAI-compatible /embeddings API.
+	EmbeddingProviderOpenAI = "openai"
+)
+
 // EmbeddingClient generates vector embeddings from text.
 // Implementations may use local models, remote APIs, or deterministic fakes.
 // The dimensionality of embeddings can be inferred from the returned vectors.
@@ -70,6 +80,20 @@ type OptimizerConfig struct {
 	// Zero means use the default timeout (30s).
 	EmbeddingServiceTimeout time.Duration
 
+	// EmbeddingProvider selects the embedding backend wire protocol
+	// (EmbeddingProviderTEI or EmbeddingProviderOpenAI). Empty defaults to TEI.
+	EmbeddingProvider string
+
+	// EmbeddingModel is the model name requested from an OpenAI-compatible
+	// embedding service (e.g. "text-embedding-3-small"). Unused by the TEI
+	// provider, where the model is fixed by the running container.
+	EmbeddingModel string
+
+	// EmbeddingAPIKey is the bearer token sent to an OpenAI-compatible embedding
+	// service. Empty means no Authorization header is sent, which supports
+	// keyless in-cluster gateways. Never populated for the TEI provider.
+	EmbeddingAPIKey string
+
 	// MaxToolsToReturn limits the number of tools returned by FindTool.
 	MaxToolsToReturn *int
 
diff --git a/pkg/vmcp/optimizer/optimizer.go b/pkg/vmcp/optimizer/optimizer.go
index 2fe3f85587..7e41d0d0ed 100644
--- a/pkg/vmcp/optimizer/optimizer.go
+++ b/pkg/vmcp/optimizer/optimizer.go
@@ -16,6 +16,7 @@ import (
 	"context"
 	"fmt"
 	"log/slog"
+	"os"
 	"strconv"
 	"time"
 
@@ -29,6 +30,12 @@ import (
 	"github.com/stacklok/toolhive/pkg/vmcp/optimizer/internal/types"
 )
 
+// embeddingAPIKeyEnvVar names the environment variable that supplies the
+// bearer token for an OpenAI-compatible embedding service. The key comes from
+// the environment, not a config field, so it never lands in a CRD spec or ConfigMap.
+// #nosec G101 -- This is an environment variable name, not a hardcoded credential
+const embeddingAPIKeyEnvVar = "OPENAI_API_KEY"
+
 // Config defines configuration options for the Optimizer.
 // It is defined in the internal/types package and aliased here so that
 // external consumers continue to use optimizer.Config.
@@ -45,6 +52,12 @@ func GetAndValidateConfig(cfg *vmcpconfig.OptimizerConfig) (*Config, error) {
 	optCfg := &Config{
 		EmbeddingService:        cfg.EmbeddingService,
 		EmbeddingServiceTimeout: time.Duration(cfg.EmbeddingServiceTimeout),
+		EmbeddingProvider:       cfg.EmbeddingProvider,
+		EmbeddingModel:          cfg.EmbeddingModel,
+	}
+
+	if err := resolveEmbeddingProvider(optCfg); err != nil {
+		return nil, err
 	}
 
 	if cfg.MaxToolsToReturn != 0 {
@@ -85,6 +98,32 @@ func GetAndValidateConfig(cfg *vmcpconfig.OptimizerConfig) (*Config, error) {
 	return optCfg, nil
 }
 
+// resolveEmbeddingProvider normalizes and validates the embedding provider on
+// optCfg in place. An empty provider defaults to TEI so existing configs keep
+// working; the OpenAI provider requires a service and model and reads its API
+// key from the environment.
+func resolveEmbeddingProvider(optCfg *Config) error {
+	switch optCfg.EmbeddingProvider {
+	case "":
+		optCfg.EmbeddingProvider = types.EmbeddingProviderTEI
+	case types.EmbeddingProviderTEI:
+	case types.EmbeddingProviderOpenAI:
+		if optCfg.EmbeddingService == "" {
+			return fmt.Errorf("optimizer.embeddingService is required when optimizer.embeddingProvider is %q",
+				types.EmbeddingProviderOpenAI)
+		}
+		if optCfg.EmbeddingModel == "" {
+			return fmt.Errorf("optimizer.embeddingModel is required when optimizer.embeddingProvider is %q",
+				types.EmbeddingProviderOpenAI)
+		}
+		optCfg.EmbeddingAPIKey = os.Getenv(embeddingAPIKeyEnvVar)
+	default:
+		return fmt.Errorf("optimizer.embeddingProvider must be %q or %q, got %q",
+			types.EmbeddingProviderTEI, types.EmbeddingProviderOpenAI, optCfg.EmbeddingProvider)
+	}
+	return nil
+}
+
 // Optimizer defines the interface for intelligent tool discovery and invocation.
 //
 // The default implementation delegates search to a ToolStore (SQLite FTS5 with
diff --git a/pkg/vmcp/optimizer/optimizer_test.go b/pkg/vmcp/optimizer/optimizer_test.go
index c389c0e17f..bde54bcd67 100644
--- a/pkg/vmcp/optimizer/optimizer_test.go
+++ b/pkg/vmcp/optimizer/optimizer_test.go
@@ -18,6 +18,7 @@ import (
 
 	vmcpconfig "github.com/stacklok/toolhive/pkg/vmcp/config"
 	"github.com/stacklok/toolhive/pkg/vmcp/optimizer/internal/tokencounter"
+	"github.com/stacklok/toolhive/pkg/vmcp/optimizer/internal/types"
 	"github.com/stacklok/toolhive/pkg/vmcp/optimizer/internal/types/mocks"
 )
 
@@ -52,6 +53,54 @@ func TestGetAndValidateConfig(t *testing.T) {
 				EmbeddingService: "http://embeddings:8080",
 			},
 		},
+		{
+			name: "explicit tei provider",
+			cfg: &vmcpconfig.OptimizerConfig{
+				EmbeddingService:  "http://embeddings:8080",
+				EmbeddingProvider: types.EmbeddingProviderTEI,
+			},
+			expected: &Config{
+				EmbeddingService:  "http://embeddings:8080",
+				EmbeddingProvider: types.EmbeddingProviderTEI,
+			},
+		},
+		{
+			name: "openai provider with service and model",
+			cfg: &vmcpconfig.OptimizerConfig{
+				EmbeddingService:  "http://bifrost:8080/v1",
+				EmbeddingProvider: types.EmbeddingProviderOpenAI,
+				EmbeddingModel:    "text-embedding-3-small",
+			},
+			expected: &Config{
+				EmbeddingService:  "http://bifrost:8080/v1",
+				EmbeddingProvider: types.EmbeddingProviderOpenAI,
+				EmbeddingModel:    "text-embedding-3-small",
+			},
+		},
+		{
+			name: "error: openai provider without service",
+			cfg: &vmcpconfig.OptimizerConfig{
+				EmbeddingProvider: types.EmbeddingProviderOpenAI,
+				EmbeddingModel:    "text-embedding-3-small",
+			},
+			errContains: "optimizer.embeddingService is required",
+		},
+		{
+			name: "error: openai provider without model",
+			cfg: &vmcpconfig.OptimizerConfig{
+				EmbeddingService:  "http://bifrost:8080/v1",
+				EmbeddingProvider: types.EmbeddingProviderOpenAI,
+			},
+			errContains: "optimizer.embeddingModel is required",
+		},
+		{
+			name: "error: unknown provider",
+			cfg: &vmcpconfig.OptimizerConfig{
+				EmbeddingService:  "http://embeddings:8080",
+				EmbeddingProvider: "cohere",
+			},
+			errContains: "optimizer.embeddingProvider must be",
+		},
 		{
 			name: "all valid values are parsed",
 			cfg: &vmcpconfig.OptimizerConfig{
@@ -208,6 +257,13 @@ func TestGetAndValidateConfig(t *testing.T) {
 			require.NotNil(t, result)
 			assert.Equal(t, tt.expected.EmbeddingService, result.EmbeddingService)
 
+			wantProvider := tt.expected.EmbeddingProvider
+			if wantProvider == "" {
+				wantProvider = types.EmbeddingProviderTEI
+			}
+			assert.Equal(t, wantProvider, result.EmbeddingProvider)
+			assert.Equal(t, tt.expected.EmbeddingModel, result.EmbeddingModel)
+
 			if tt.expected.MaxToolsToReturn != nil {
 				require.NotNil(t, result.MaxToolsToReturn)
 				assert.Equal(t, *tt.expected.MaxToolsToReturn, *result.MaxToolsToReturn)
@@ -232,6 +288,39 @@ func TestGetAndValidateConfig(t *testing.T) {
 	}
 }
 
+// TestGetAndValidateConfig_OpenAIAPIKeyFromEnv checks that the API key comes from
+// the environment for the openai provider only. It uses t.Setenv, so no t.Parallel.
+func TestGetAndValidateConfig_OpenAIAPIKeyFromEnv(t *testing.T) {
+	// keyFor sets the env var to envValue, validates cfg, and returns the resolved key.
+	keyFor := func(t *testing.T, envValue string, cfg *vmcpconfig.OptimizerConfig) string {
+		t.Helper()
+		t.Setenv(embeddingAPIKeyEnvVar, envValue)
+		result, err := GetAndValidateConfig(cfg)
+		require.NoError(t, err)
+		return result.EmbeddingAPIKey
+	}
+	newOpenAICfg := func() *vmcpconfig.OptimizerConfig {
+		return &vmcpconfig.OptimizerConfig{
+			EmbeddingService:  "http://bifrost:8080/v1",
+			EmbeddingProvider: types.EmbeddingProviderOpenAI,
+			EmbeddingModel:    "text-embedding-3-small",
+		}
+	}
+
+	t.Run("key is read from the environment", func(t *testing.T) {
+		assert.Equal(t, "sk-test", keyFor(t, "sk-test", newOpenAICfg()))
+	})
+
+	t.Run("unset key yields a keyless client", func(t *testing.T) {
+		assert.Empty(t, keyFor(t, "", newOpenAICfg()))
+	})
+
+	t.Run("tei provider never reads the key", func(t *testing.T) {
+		teiCfg := &vmcpconfig.OptimizerConfig{EmbeddingService: "http://embeddings:8080"}
+		assert.Empty(t, keyFor(t, "sk-test", teiCfg))
+	})
+}
+
 // newMockStoreWithSubstringSearch returns a gomock MockToolStore configured with
 // DoAndReturn handlers that accumulate tools via UpsertTools and perform
 // case-insensitive substring matching on Search. Suitable for tests that need

From 787e08c3a24c36233093df7d3d7735f8e3b09c6d Mon Sep 17 00:00:00 2001
From: Gabriel Cosi <contact@gabrielcosi.dev>
Date: Fri, 26 Jun 2026 00:46:23 +0200
Subject: [PATCH 2/6] Reject embeddingServerRef with openai provider

Signed-off-by: Gabriel Cosi <contact@gabrielcosi.dev>
---
 .../api/v1beta1/virtualmcpserver_types.go     |  1 +
 .../virtualmcpserver_embedding_cel_test.go    | 56 +++++++++++++++++++
 ...olhive.stacklok.dev_virtualmcpservers.yaml | 22 ++++++--
 ...olhive.stacklok.dev_virtualmcpservers.yaml | 22 ++++++--
 docs/operator/crd-api.md                      |  2 +-
 pkg/vmcp/config/config.go                     |  5 +-
 6 files changed, 97 insertions(+), 11 deletions(-)
 create mode 100644 cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_embedding_cel_test.go

diff --git a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go
index 32e9c795f9..d2b37439a7 100644
--- a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go
+++ b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go
@@ -19,6 +19,7 @@ import (
 // +kubebuilder:validation:XValidation:rule="!has(self.config) || !has(self.config.rateLimiting) || (has(self.sessionStorage) && self.sessionStorage.provider == 'redis')",message="config.rateLimiting requires sessionStorage with provider 'redis'"
 // +kubebuilder:validation:XValidation:rule="!(has(self.config) && has(self.config.rateLimiting) && has(self.config.rateLimiting.perUser)) || (has(self.incomingAuth) && self.incomingAuth.type == 'oidc')",message="config.rateLimiting.perUser requires incomingAuth.type oidc"
 // +kubebuilder:validation:XValidation:rule="!has(self.config) || !has(self.config.rateLimiting) || !has(self.config.rateLimiting.tools) || self.config.rateLimiting.tools.all(t, !has(t.perUser)) || (has(self.incomingAuth) && self.incomingAuth.type == 'oidc')",message="per-tool perUser rate limiting requires incomingAuth.type oidc"
+// +kubebuilder:validation:XValidation:rule="!(has(self.embeddingServerRef) && has(self.config) && has(self.config.optimizer) && has(self.config.optimizer.embeddingProvider) && self.config.optimizer.embeddingProvider == 'openai')",message="embeddingServerRef provisions a managed TEI server and cannot be combined with optimizer.embeddingProvider 'openai'; openai mode uses embeddingService directly"
 //
 //nolint:lll // CEL validation rules exceed line length limit
 type VirtualMCPServerSpec struct {
diff --git a/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_embedding_cel_test.go b/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_embedding_cel_test.go
new file mode 100644
index 0000000000..1512d7adf3
--- /dev/null
+++ b/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_embedding_cel_test.go
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+// Package controllers contains integration tests for the VirtualMCPServer controller
+package controllers
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1"
+	"github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1/v1beta1test"
+	vmcpconfig "github.com/stacklok/toolhive/pkg/vmcp/config"
+)
+
+// newVirtualMCPServerWithOptimizer builds a test VirtualMCPServer whose config carries optimizer.
+func newVirtualMCPServerWithOptimizer(name string, optimizer *vmcpconfig.OptimizerConfig,
+	opts ...v1beta1test.VirtualMCPServerOption) *mcpv1beta1.VirtualMCPServer {
+	auth := &mcpv1beta1.IncomingAuthConfig{Type: "anonymous"}
+	cfg := vmcpconfig.Config{Group: "test-group", Optimizer: optimizer}
+	base := []v1beta1test.VirtualMCPServerOption{v1beta1test.WithVMCPGroupRef("test-group")}
+	base = append(base, v1beta1test.WithVMCPIncomingAuth(auth), v1beta1test.WithVMCPConfig(cfg))
+	return v1beta1test.NewVirtualMCPServer(name, "default", append(base, opts...)...)
+}
+
+// These specs create VirtualMCPServers through k8sClient and check how
+// embeddingServerRef and optimizer.embeddingProvider are admitted together.
+var _ = Describe("CEL Validation for embedding provider on VirtualMCPServer",
+	Label("k8s", "cel", "validation"), func() {
+		It("should reject embeddingServerRef combined with embeddingProvider openai", func() {
+			optimizer := &vmcpconfig.OptimizerConfig{EmbeddingProvider: "openai", EmbeddingModel: "text-embedding-3-small"}
+			server := newVirtualMCPServerWithOptimizer("vmcp-ref-openai", optimizer,
+				v1beta1test.WithVMCPEmbeddingServerRef("managed-tei"))
+			// The create error must carry the CEL rule's message.
+			Expect(k8sClient.Create(ctx, server)).To(MatchError(ContainSubstring(
+				"embeddingServerRef provisions a managed TEI server and cannot be combined with optimizer.embeddingProvider 'openai'")))
+		})
+
+		It("should accept embeddingServerRef with the default (tei) provider", func() {
+			// The provider is set to "tei" explicitly here.
+			server := newVirtualMCPServerWithOptimizer("vmcp-ref-tei",
+				&vmcpconfig.OptimizerConfig{EmbeddingProvider: "tei"},
+				v1beta1test.WithVMCPEmbeddingServerRef("managed-tei"))
+			Expect(k8sClient.Create(ctx, server)).To(Succeed())
+		})
+
+		It("should accept embeddingProvider openai without an embeddingServerRef", func() {
+			optimizer := &vmcpconfig.OptimizerConfig{
+				EmbeddingProvider: "openai",
+				EmbeddingService:  "http://gateway.example:8080",
+				EmbeddingModel:    "text-embedding-3-small",
+			}
+			server := newVirtualMCPServerWithOptimizer("vmcp-openai-no-ref", optimizer)
+			Expect(k8sClient.Create(ctx, server)).To(Succeed())
+		})
+	})
diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
index ad33615c29..aa1bf1c27f 100644
--- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -1813,8 +1813,9 @@ spec:
                           optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
                           LiteLLM. Defaults to "tei" when empty.
 
-                          The "openai" provider reads EmbeddingService directly and is not used with
-                          EmbeddingServerRef, which provisions a managed TEI server.
+                          The "openai" provider reads EmbeddingService directly and cannot be combined
+                          with EmbeddingServerRef, which provisions a managed TEI server; the operator
+                          rejects that combination at admission.
                         enum:
                         - tei
                         - openai
@@ -2963,6 +2964,12 @@ spec:
               rule: '!has(self.config) || !has(self.config.rateLimiting) || !has(self.config.rateLimiting.tools)
                 || self.config.rateLimiting.tools.all(t, !has(t.perUser)) || (has(self.incomingAuth)
                 && self.incomingAuth.type == ''oidc'')'
+            - message: embeddingServerRef provisions a managed TEI server and cannot
+                be combined with optimizer.embeddingProvider 'openai'; openai mode
+                uses embeddingService directly
+              rule: '!(has(self.embeddingServerRef) && has(self.config) && has(self.config.optimizer)
+                && has(self.config.optimizer.embeddingProvider) && self.config.optimizer.embeddingProvider
+                == ''openai'')'
           status:
             description: VirtualMCPServerStatus defines the observed state of VirtualMCPServer
             properties:
@@ -4928,8 +4935,9 @@ spec:
                           optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
                           LiteLLM. Defaults to "tei" when empty.
 
-                          The "openai" provider reads EmbeddingService directly and is not used with
-                          EmbeddingServerRef, which provisions a managed TEI server.
+                          The "openai" provider reads EmbeddingService directly and cannot be combined
+                          with EmbeddingServerRef, which provisions a managed TEI server; the operator
+                          rejects that combination at admission.
                         enum:
                         - tei
                         - openai
@@ -6078,6 +6086,12 @@ spec:
               rule: '!has(self.config) || !has(self.config.rateLimiting) || !has(self.config.rateLimiting.tools)
                 || self.config.rateLimiting.tools.all(t, !has(t.perUser)) || (has(self.incomingAuth)
                 && self.incomingAuth.type == ''oidc'')'
+            - message: embeddingServerRef provisions a managed TEI server and cannot
+                be combined with optimizer.embeddingProvider 'openai'; openai mode
+                uses embeddingService directly
+              rule: '!(has(self.embeddingServerRef) && has(self.config) && has(self.config.optimizer)
+                && has(self.config.optimizer.embeddingProvider) && self.config.optimizer.embeddingProvider
+                == ''openai'')'
           status:
             description: VirtualMCPServerStatus defines the observed state of VirtualMCPServer
             properties:
diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
index d4dd4b6203..6a34d92c93 100644
--- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -1816,8 +1816,9 @@ spec:
                           optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
                           LiteLLM. Defaults to "tei" when empty.
 
-                          The "openai" provider reads EmbeddingService directly and is not used with
-                          EmbeddingServerRef, which provisions a managed TEI server.
+                          The "openai" provider reads EmbeddingService directly and cannot be combined
+                          with EmbeddingServerRef, which provisions a managed TEI server; the operator
+                          rejects that combination at admission.
                         enum:
                         - tei
                         - openai
@@ -2966,6 +2967,12 @@ spec:
               rule: '!has(self.config) || !has(self.config.rateLimiting) || !has(self.config.rateLimiting.tools)
                 || self.config.rateLimiting.tools.all(t, !has(t.perUser)) || (has(self.incomingAuth)
                 && self.incomingAuth.type == ''oidc'')'
+            - message: embeddingServerRef provisions a managed TEI server and cannot
+                be combined with optimizer.embeddingProvider 'openai'; openai mode
+                uses embeddingService directly
+              rule: '!(has(self.embeddingServerRef) && has(self.config) && has(self.config.optimizer)
+                && has(self.config.optimizer.embeddingProvider) && self.config.optimizer.embeddingProvider
+                == ''openai'')'
           status:
             description: VirtualMCPServerStatus defines the observed state of VirtualMCPServer
             properties:
@@ -4931,8 +4938,9 @@ spec:
                           optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
                           LiteLLM. Defaults to "tei" when empty.
 
-                          The "openai" provider reads EmbeddingService directly and is not used with
-                          EmbeddingServerRef, which provisions a managed TEI server.
+                          The "openai" provider reads EmbeddingService directly and cannot be combined
+                          with EmbeddingServerRef, which provisions a managed TEI server; the operator
+                          rejects that combination at admission.
                         enum:
                         - tei
                         - openai
@@ -6081,6 +6089,12 @@ spec:
               rule: '!has(self.config) || !has(self.config.rateLimiting) || !has(self.config.rateLimiting.tools)
                 || self.config.rateLimiting.tools.all(t, !has(t.perUser)) || (has(self.incomingAuth)
                 && self.incomingAuth.type == ''oidc'')'
+            - message: embeddingServerRef provisions a managed TEI server and cannot
+                be combined with optimizer.embeddingProvider 'openai'; openai mode
+                uses embeddingService directly
+              rule: '!(has(self.embeddingServerRef) && has(self.config) && has(self.config.optimizer)
+                && has(self.config.optimizer.embeddingProvider) && self.config.optimizer.embeddingProvider
+                == ''openai'')'
           status:
             description: VirtualMCPServerStatus defines the observed state of VirtualMCPServer
             properties:
diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md
index 2e34a58d6d..77caf0ffae 100644
--- a/docs/operator/crd-api.md
+++ b/docs/operator/crd-api.md
@@ -502,7 +502,7 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `embeddingService` _string_ | EmbeddingService is the full base URL of the embedding service endpoint<br />(e.g., http://my-embedding.default.svc.cluster.local:8080) for semantic<br />tool discovery.<br />In a Kubernetes environment, it is more convenient to use the<br />VirtualMCPServerSpec.EmbeddingServerRef field instead of setting this<br />directly. EmbeddingServerRef references an EmbeddingServer CRD by name,<br />and the operator automatically resolves the referenced resource's<br />Status.URL to populate this field. This provides managed lifecycle<br />(the operator watches the EmbeddingServer for readiness and URL changes)<br />and avoids hardcoding service URLs in the config. If both<br />EmbeddingServerRef and this field are set, EmbeddingServerRef takes<br />precedence and this value is overridden with a warning. |  | Optional: \{\} <br /> |
 | `embeddingServiceTimeout` _[vmcp.config.Duration](#vmcpconfigduration)_ | EmbeddingServiceTimeout is the HTTP request timeout for calls to the embedding service.<br />Defaults to 30s if not specified. | 30s | Pattern: `^([0-9]+(\.[0-9]+)?(ns\|us\|µs\|ms\|s\|m\|h))+$` <br />Type: string <br />Optional: \{\} <br /> |
-| `embeddingProvider` _string_ | EmbeddingProvider selects the wire protocol used to talk to the embedding<br />service. "tei" speaks the HuggingFace Text Embeddings Inference API;<br />"openai" speaks the OpenAI-compatible /embeddings API, which lets the<br />optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or<br />LiteLLM. Defaults to "tei" when empty.<br />The "openai" provider reads EmbeddingService directly and is not used with<br />EmbeddingServerRef, which provisions a managed TEI server. | tei | Enum: [tei openai] <br />Optional: \{\} <br /> |
+| `embeddingProvider` _string_ | EmbeddingProvider selects the wire protocol used to talk to the embedding<br />service. "tei" speaks the HuggingFace Text Embeddings Inference API;<br />"openai" speaks the OpenAI-compatible /embeddings API, which lets the<br />optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or<br />LiteLLM. Defaults to "tei" when empty.<br />The "openai" provider reads EmbeddingService directly and cannot be combined<br />with EmbeddingServerRef, which provisions a managed TEI server; the operator<br />rejects that combination at admission. | tei | Enum: [tei openai] <br />Optional: \{\} <br /> |
 | `embeddingModel` _string_ | EmbeddingModel is the model name requested from the embedding service<br />(e.g. "text-embedding-3-small"). Required when EmbeddingProvider is<br />"openai". Ignored for the "tei" provider, where the model is fixed by the<br />running TEI container.<br />The API key for an OpenAI-compatible service is not configured here: it is<br />read from the OPENAI_API_KEY environment variable so the secret never<br />lands in a CRD spec or ConfigMap. An empty key omits the Authorization<br />header, which supports keyless in-cluster gateways. |  | Optional: \{\} <br /> |
 | `maxToolsToReturn` _integer_ | MaxToolsToReturn is the maximum number of tool results returned by a search query.<br />Defaults to 8 if not specified or zero. |  | Maximum: 50 <br />Minimum: 1 <br />Optional: \{\} <br /> |
 | `hybridSearchSemanticRatio` _string_ | HybridSearchSemanticRatio controls the balance between semantic (meaning-based)<br />and keyword search results. 0.0 = all keyword, 1.0 = all semantic.<br />Defaults to "0.5" if not specified or empty.<br />Serialized as a string because CRDs do not support float types portably. |  | Pattern: `^([0-9]*[.])?[0-9]+$` <br />Optional: \{\} <br /> |
diff --git a/pkg/vmcp/config/config.go b/pkg/vmcp/config/config.go
index 033a6e0d0b..919460c0d5 100644
--- a/pkg/vmcp/config/config.go
+++ b/pkg/vmcp/config/config.go
@@ -939,8 +939,9 @@ type OptimizerConfig struct {
 	// optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
 	// LiteLLM. Defaults to "tei" when empty.
 	//
-	// The "openai" provider reads EmbeddingService directly and is not used with
-	// EmbeddingServerRef, which provisions a managed TEI server.
+	// The "openai" provider reads EmbeddingService directly and cannot be combined
+	// with EmbeddingServerRef, which provisions a managed TEI server; the operator
+	// rejects that combination at admission.
 	// +kubebuilder:validation:Enum=tei;openai
 	// +kubebuilder:default="tei"
 	// +optional

From f0a51df1fc5afeafde0eb32edd0d52597398eeb1 Mon Sep 17 00:00:00 2001
From: Gabriel Cosi <contact@gabrielcosi.dev>
Date: Fri, 26 Jun 2026 00:49:42 +0200
Subject: [PATCH 3/6] Drain OpenAI response body before close

Signed-off-by: Gabriel Cosi <contact@gabrielcosi.dev>
---
 pkg/vmcp/optimizer/internal/similarity/openai_client.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pkg/vmcp/optimizer/internal/similarity/openai_client.go b/pkg/vmcp/optimizer/internal/similarity/openai_client.go
index e3dd1b1561..fa6dd84f80 100644
--- a/pkg/vmcp/optimizer/internal/similarity/openai_client.go
+++ b/pkg/vmcp/optimizer/internal/similarity/openai_client.go
@@ -136,7 +136,10 @@ func (c *openAIClient) embedChunk(ctx context.Context, texts []string) ([][]floa
 	if err != nil {
 		return nil, fmt.Errorf("OpenAI request failed: %w", err)
 	}
-	defer func() { _ = resp.Body.Close() }()
+	defer func() {
+		_, _ = io.Copy(io.Discard, resp.Body)
+		_ = resp.Body.Close()
+	}()
 
 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)

From 3da509259898900fdd7ba63e25c626a1bc2fdec0 Mon Sep 17 00:00:00 2001
From: Gabriel Cosi <contact@gabrielcosi.dev>
Date: Fri, 26 Jun 2026 00:50:58 +0200
Subject: [PATCH 4/6] Trim trailing slash from OpenAI base URL

Signed-off-by: Gabriel Cosi <contact@gabrielcosi.dev>
---
 pkg/vmcp/optimizer/internal/similarity/openai_client.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/vmcp/optimizer/internal/similarity/openai_client.go b/pkg/vmcp/optimizer/internal/similarity/openai_client.go
index fa6dd84f80..54d110d079 100644
--- a/pkg/vmcp/optimizer/internal/similarity/openai_client.go
+++ b/pkg/vmcp/optimizer/internal/similarity/openai_client.go
@@ -11,6 +11,7 @@ import (
 	"io"
 	"log/slog"
 	"net/http"
+	"strings"
 	"time"
 )
 
@@ -43,6 +44,7 @@ func newOpenAIClient(baseURL, model, apiKey string, timeout time.Duration) (*ope
 	if model == "" {
 		return nil, fmt.Errorf("OpenAI embedding model is required")
 	}
+	baseURL = strings.TrimSuffix(baseURL, "/")
 
 	if timeout == 0 {
 		timeout = defaultTimeout

From d37fdcb5b0cafba5dbf5a08ef90a91d2db6833c1 Mon Sep 17 00:00:00 2001
From: Gabriel Cosi <contact@gabrielcosi.dev>
Date: Fri, 26 Jun 2026 01:12:40 +0200
Subject: [PATCH 5/6] Add live OpenAI embedding integration test

Signed-off-by: Gabriel Cosi <contact@gabrielcosi.dev>
---
 .../openai_client_integration_test.go         | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 pkg/vmcp/optimizer/internal/similarity/openai_client_integration_test.go

diff --git a/pkg/vmcp/optimizer/internal/similarity/openai_client_integration_test.go b/pkg/vmcp/optimizer/internal/similarity/openai_client_integration_test.go
new file mode 100644
index 0000000000..59efacaad2
--- /dev/null
+++ b/pkg/vmcp/optimizer/internal/similarity/openai_client_integration_test.go
@@ -0,0 +1,50 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package similarity
+
+import (
+	"cmp"
+	"context"
+	"os"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// TestOpenAIClient_Live calls a real OpenAI-compatible /embeddings endpoint. It
+// is skipped unless OPENAI_API_KEY is set, so the default `task test` run stays
+// green. OPENAI_EMBEDDING_BASE_URL (default https://api.openai.com/v1) and
+// OPENAI_EMBEDDING_MODEL (default text-embedding-3-small) select the target.
+func TestOpenAIClient_Live(t *testing.T) {
+	t.Parallel()
+
+	apiKey := os.Getenv("OPENAI_API_KEY")
+	if apiKey == "" {
+		t.Skip("OPENAI_API_KEY not set; skipping live OpenAI embedding test")
+	}
+
+	baseURL := cmp.Or(os.Getenv("OPENAI_EMBEDDING_BASE_URL"), "https://api.openai.com/v1")
+	model := cmp.Or(os.Getenv("OPENAI_EMBEDDING_MODEL"), "text-embedding-3-small")
+
+	client, err := newOpenAIClient(baseURL, model, apiKey, 0)
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = client.Close() })
+
+	ctx := context.Background()
+
+	vec, err := client.Embed(ctx, "the quick brown fox")
+	require.NoError(t, err)
+	require.NotEmpty(t, vec, "embedding vector must not be empty")
+
+	// Inputs 0 and 2 are identical and must yield identical vectors: a coarse
+	// request-order check (it cannot detect indices 0 and 2 being swapped).
+	inputs := []string{"the quick brown fox", "lorem ipsum", "the quick brown fox"}
+	batch, err := client.EmbedBatch(ctx, inputs)
+	require.NoError(t, err)
+	require.Len(t, batch, len(inputs))
+	for i, e := range batch {
+		require.Lenf(t, e, len(vec), "embedding %d has unexpected dimension", i)
+	}
+	require.Equal(t, batch[0], batch[2], "identical inputs must map to identical embeddings (order preserved)")
+}

From dd1ddf53eab8195ba3cfd785a68124985cce3ab0 Mon Sep 17 00:00:00 2001
From: Gabriel Cosi <contact@gabrielcosi.dev>
Date: Fri, 26 Jun 2026 01:12:40 +0200
Subject: [PATCH 6/6] Generalize embedding gateway examples

Signed-off-by: Gabriel Cosi <contact@gabrielcosi.dev>
---
 .../crds/toolhive.stacklok.dev_virtualmcpservers.yaml     | 8 ++++----
 .../toolhive.stacklok.dev_virtualmcpservers.yaml          | 8 ++++----
 docs/operator/crd-api.md                                  | 2 +-
 examples/operator/virtual-mcps/vmcp_optimizer_openai.yaml | 4 ++--
 pkg/vmcp/config/config.go                                 | 4 ++--
 pkg/vmcp/optimizer/internal/similarity/openai_client.go   | 2 +-
 pkg/vmcp/optimizer/optimizer_test.go                      | 8 ++++----
 7 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
index aa1bf1c27f..d075d36855 100644
--- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -1810,8 +1810,8 @@ spec:
                           EmbeddingProvider selects the wire protocol used to talk to the embedding
                           service. "tei" speaks the HuggingFace Text Embeddings Inference API;
                           "openai" speaks the OpenAI-compatible /embeddings API, which lets the
-                          optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
-                          LiteLLM. Defaults to "tei" when empty.
+                          optimizer use OpenAI, Azure OpenAI, or another OpenAI-compatible gateway.
+                          Defaults to "tei" when empty.
 
                           The "openai" provider reads EmbeddingService directly and cannot be combined
                           with EmbeddingServerRef, which provisions a managed TEI server; the operator
@@ -4932,8 +4932,8 @@ spec:
                           EmbeddingProvider selects the wire protocol used to talk to the embedding
                           service. "tei" speaks the HuggingFace Text Embeddings Inference API;
                           "openai" speaks the OpenAI-compatible /embeddings API, which lets the
-                          optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
-                          LiteLLM. Defaults to "tei" when empty.
+                          optimizer use OpenAI, Azure OpenAI, or another OpenAI-compatible gateway.
+                          Defaults to "tei" when empty.
 
                           The "openai" provider reads EmbeddingService directly and cannot be combined
                           with EmbeddingServerRef, which provisions a managed TEI server; the operator
diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
index 6a34d92c93..b23002cd86 100644
--- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -1813,8 +1813,8 @@ spec:
                           EmbeddingProvider selects the wire protocol used to talk to the embedding
                           service. "tei" speaks the HuggingFace Text Embeddings Inference API;
                           "openai" speaks the OpenAI-compatible /embeddings API, which lets the
-                          optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
-                          LiteLLM. Defaults to "tei" when empty.
+                          optimizer use OpenAI, Azure OpenAI, or another OpenAI-compatible gateway.
+                          Defaults to "tei" when empty.
 
                           The "openai" provider reads EmbeddingService directly and cannot be combined
                           with EmbeddingServerRef, which provisions a managed TEI server; the operator
@@ -4935,8 +4935,8 @@ spec:
                           EmbeddingProvider selects the wire protocol used to talk to the embedding
                           service. "tei" speaks the HuggingFace Text Embeddings Inference API;
                           "openai" speaks the OpenAI-compatible /embeddings API, which lets the
-                          optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
-                          LiteLLM. Defaults to "tei" when empty.
+                          optimizer use OpenAI, Azure OpenAI, or another OpenAI-compatible gateway.
+                          Defaults to "tei" when empty.
 
                           The "openai" provider reads EmbeddingService directly and cannot be combined
                           with EmbeddingServerRef, which provisions a managed TEI server; the operator
diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md
index 77caf0ffae..6072c90e35 100644
--- a/docs/operator/crd-api.md
+++ b/docs/operator/crd-api.md
@@ -502,7 +502,7 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `embeddingService` _string_ | EmbeddingService is the full base URL of the embedding service endpoint<br />(e.g., http://my-embedding.default.svc.cluster.local:8080) for semantic<br />tool discovery.<br />In a Kubernetes environment, it is more convenient to use the<br />VirtualMCPServerSpec.EmbeddingServerRef field instead of setting this<br />directly. EmbeddingServerRef references an EmbeddingServer CRD by name,<br />and the operator automatically resolves the referenced resource's<br />Status.URL to populate this field. This provides managed lifecycle<br />(the operator watches the EmbeddingServer for readiness and URL changes)<br />and avoids hardcoding service URLs in the config. If both<br />EmbeddingServerRef and this field are set, EmbeddingServerRef takes<br />precedence and this value is overridden with a warning. |  | Optional: \{\} <br /> |
 | `embeddingServiceTimeout` _[vmcp.config.Duration](#vmcpconfigduration)_ | EmbeddingServiceTimeout is the HTTP request timeout for calls to the embedding service.<br />Defaults to 30s if not specified. | 30s | Pattern: `^([0-9]+(\.[0-9]+)?(ns\|us\|µs\|ms\|s\|m\|h))+$` <br />Type: string <br />Optional: \{\} <br /> |
-| `embeddingProvider` _string_ | EmbeddingProvider selects the wire protocol used to talk to the embedding<br />service. "tei" speaks the HuggingFace Text Embeddings Inference API;<br />"openai" speaks the OpenAI-compatible /embeddings API, which lets the<br />optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or<br />LiteLLM. Defaults to "tei" when empty.<br />The "openai" provider reads EmbeddingService directly and cannot be combined<br />with EmbeddingServerRef, which provisions a managed TEI server; the operator<br />rejects that combination at admission. | tei | Enum: [tei openai] <br />Optional: \{\} <br /> |
+| `embeddingProvider` _string_ | EmbeddingProvider selects the wire protocol used to talk to the embedding<br />service. "tei" speaks the HuggingFace Text Embeddings Inference API;<br />"openai" speaks the OpenAI-compatible /embeddings API, which lets the<br />optimizer use OpenAI, Azure OpenAI, or another OpenAI-compatible gateway.<br />Defaults to "tei" when empty.<br />The "openai" provider reads EmbeddingService directly and cannot be combined<br />with EmbeddingServerRef, which provisions a managed TEI server; the operator<br />rejects that combination at admission. | tei | Enum: [tei openai] <br />Optional: \{\} <br /> |
 | `embeddingModel` _string_ | EmbeddingModel is the model name requested from the embedding service<br />(e.g. "text-embedding-3-small"). Required when EmbeddingProvider is<br />"openai". Ignored for the "tei" provider, where the model is fixed by the<br />running TEI container.<br />The API key for an OpenAI-compatible service is not configured here: it is<br />read from the OPENAI_API_KEY environment variable so the secret never<br />lands in a CRD spec or ConfigMap. An empty key omits the Authorization<br />header, which supports keyless in-cluster gateways. |  | Optional: \{\} <br /> |
 | `maxToolsToReturn` _integer_ | MaxToolsToReturn is the maximum number of tool results returned by a search query.<br />Defaults to 8 if not specified or zero. |  | Maximum: 50 <br />Minimum: 1 <br />Optional: \{\} <br /> |
 | `hybridSearchSemanticRatio` _string_ | HybridSearchSemanticRatio controls the balance between semantic (meaning-based)<br />and keyword search results. 0.0 = all keyword, 1.0 = all semantic.<br />Defaults to "0.5" if not specified or empty.<br />Serialized as a string because CRDs do not support float types portably. |  | Pattern: `^([0-9]*[.])?[0-9]+$` <br />Optional: \{\} <br /> |
diff --git a/examples/operator/virtual-mcps/vmcp_optimizer_openai.yaml b/examples/operator/virtual-mcps/vmcp_optimizer_openai.yaml
index 5e6436a3ae..b51a740c06 100644
--- a/examples/operator/virtual-mcps/vmcp_optimizer_openai.yaml
+++ b/examples/operator/virtual-mcps/vmcp_optimizer_openai.yaml
@@ -2,7 +2,7 @@
 #
 # Instead of a managed TEI EmbeddingServer, this points the optimizer at an
 # external service that speaks the OpenAI /embeddings API — OpenAI, Azure
-# OpenAI, or a gateway such as Bifrost or LiteLLM. There is no EmbeddingServer
+# OpenAI, or another OpenAI-compatible gateway. There is no EmbeddingServer
 # or embeddingServerRef: the endpoint is reached directly via embeddingService.
 #
 # The API key is read from the OPENAI_API_KEY environment variable so it never
@@ -64,7 +64,7 @@ spec:
       # Speak the OpenAI /embeddings API instead of TEI.
       embeddingProvider: openai
       # Base URL of the OpenAI-compatible service; "/embeddings" is appended.
-      embeddingService: http://bifrost.ai.svc.cluster.local:8080/v1
+      embeddingService: http://llm-gateway.default.svc.cluster.local:8080/v1
       # Model requested from the service (required for the openai provider).
       embeddingModel: text-embedding-3-small
       embeddingServiceTimeout: 15s
diff --git a/pkg/vmcp/config/config.go b/pkg/vmcp/config/config.go
index 919460c0d5..669e3665ed 100644
--- a/pkg/vmcp/config/config.go
+++ b/pkg/vmcp/config/config.go
@@ -936,8 +936,8 @@ type OptimizerConfig struct {
 	// EmbeddingProvider selects the wire protocol used to talk to the embedding
 	// service. "tei" speaks the HuggingFace Text Embeddings Inference API;
 	// "openai" speaks the OpenAI-compatible /embeddings API, which lets the
-	// optimizer use OpenAI, Azure OpenAI, or a gateway such as Bifrost or
-	// LiteLLM. Defaults to "tei" when empty.
+	// optimizer use OpenAI, Azure OpenAI, or another OpenAI-compatible gateway.
+	// Defaults to "tei" when empty.
 	//
 	// The "openai" provider reads EmbeddingService directly and cannot be combined
 	// with EmbeddingServerRef, which provisions a managed TEI server; the operator
diff --git a/pkg/vmcp/optimizer/internal/similarity/openai_client.go b/pkg/vmcp/optimizer/internal/similarity/openai_client.go
index 54d110d079..c9f3d1fdec 100644
--- a/pkg/vmcp/optimizer/internal/similarity/openai_client.go
+++ b/pkg/vmcp/optimizer/internal/similarity/openai_client.go
@@ -24,7 +24,7 @@ const (
 )
 
 // openAIClient implements types.EmbeddingClient against an OpenAI-compatible
-// /embeddings API (OpenAI, Azure OpenAI, or a gateway such as Bifrost or LiteLLM).
+// /embeddings API (OpenAI, Azure OpenAI, or another OpenAI-compatible gateway).
 type openAIClient struct {
 	baseURL      string
 	apiKey       string
diff --git a/pkg/vmcp/optimizer/optimizer_test.go b/pkg/vmcp/optimizer/optimizer_test.go
index bde54bcd67..8ccef87b0c 100644
--- a/pkg/vmcp/optimizer/optimizer_test.go
+++ b/pkg/vmcp/optimizer/optimizer_test.go
@@ -67,12 +67,12 @@ func TestGetAndValidateConfig(t *testing.T) {
 		{
 			name: "openai provider with service and model",
 			cfg: &vmcpconfig.OptimizerConfig{
-				EmbeddingService:  "http://bifrost:8080/v1",
+				EmbeddingService:  "http://gateway:8080/v1",
 				EmbeddingProvider: types.EmbeddingProviderOpenAI,
 				EmbeddingModel:    "text-embedding-3-small",
 			},
 			expected: &Config{
-				EmbeddingService:  "http://bifrost:8080/v1",
+				EmbeddingService:  "http://gateway:8080/v1",
 				EmbeddingProvider: types.EmbeddingProviderOpenAI,
 				EmbeddingModel:    "text-embedding-3-small",
 			},
@@ -88,7 +88,7 @@ func TestGetAndValidateConfig(t *testing.T) {
 		{
 			name: "error: openai provider without model",
 			cfg: &vmcpconfig.OptimizerConfig{
-				EmbeddingService:  "http://bifrost:8080/v1",
+				EmbeddingService:  "http://gateway:8080/v1",
 				EmbeddingProvider: types.EmbeddingProviderOpenAI,
 			},
 			errContains: "optimizer.embeddingModel is required",
@@ -291,7 +291,7 @@ func TestGetAndValidateConfig(t *testing.T) {
 func TestGetAndValidateConfig_OpenAIAPIKeyFromEnv(t *testing.T) {
 	openAICfg := func() *vmcpconfig.OptimizerConfig {
 		return &vmcpconfig.OptimizerConfig{
-			EmbeddingService:  "http://bifrost:8080/v1",
+			EmbeddingService:  "http://gateway:8080/v1",
 			EmbeddingProvider: types.EmbeddingProviderOpenAI,
 			EmbeddingModel:    "text-embedding-3-small",
 		}