From 1382b4cc1b8afaeaa60c42db24a8952306edd6ba Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Thu, 23 Apr 2026 15:18:05 +0200 Subject: [PATCH 1/3] fix: routes controller on node name drift If a Kubernetes node got successfully initialized we have the Server ID stored in the node objets `providerID`. When renaming a Server, causing a drift between the node and Server name, we should still be able to identify the Server via its ID. This is currently not implemented in the routes' controller, where we rely purly on the Servers name. As the upstream library only provides us with the Kubernetes node name and not the full object, we have to first fetch it from the Kubernetes API. With the node object at hand we can identify and fetch the Server by its ID. Furthermore, we can use the Kubernetes node object to improve our handling of Kubernetes events. The event emitted on a CIDR mismatch is currently not associated with the affected node, because so far we did not have access to the nodes UUID. --- hcloud/cloud.go | 6 +- hcloud/cloud_test.go | 10 +- hcloud/routes.go | 206 ++++++++++++++++-------------- hcloud/routes_test.go | 232 +++++++++++++++++++++++++++++++++- internal/hcops/server.go | 25 ++++ internal/hcops/server_test.go | 6 + main.go | 4 +- 7 files changed, 384 insertions(+), 105 deletions(-) diff --git a/hcloud/cloud.go b/hcloud/cloud.go index 621ee7a19..ad9235755 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -28,6 +28,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes/scheme" v1 "k8s.io/client-go/kubernetes/typed/core/v1" + corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/record" cloudprovider "k8s.io/cloud-provider" "k8s.io/klog/v2" @@ -55,9 +56,10 @@ type cloud struct { recorder record.EventRecorder networkID int64 cidr string + nodeLister corelisters.NodeLister } -func NewCloud(cidr string) (cloudprovider.Interface, error) { +func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Interface, error) { const op = "hcloud/newCloud" metrics.OperationCalled.WithLabelValues(op).Inc() @@ -147,6 +149,7 @@ func NewCloud(cidr string) (cloudprovider.Interface, error) { cfg: cfg, networkID: networkID, cidr: cidr, + nodeLister: nodeLister, }, nil } @@ -213,6 +216,7 @@ func (c *cloud) Routes() (cloudprovider.Routes, bool) { c.networkID, c.cidr, c.recorder, + c.nodeLister, ) if err != nil { klog.ErrorS(err, "create routes provider", "networkID", c.networkID) diff --git a/hcloud/cloud_test.go b/hcloud/cloud_test.go index 71634ee5f..28e363c95 100644 --- a/hcloud/cloud_test.go +++ b/hcloud/cloud_test.go @@ -95,7 +95,7 @@ func TestNewCloud(t *testing.T) { json.NewEncoder(w).Encode(schema.LocationListResponse{Locations: []schema.Location{}}) }) - _, err := NewCloud(DefaultClusterCIDR) + _, err := NewCloud(DefaultClusterCIDR, nil) assert.NoError(t, err) } @@ -107,7 +107,7 @@ func TestNewCloudConnectionNotPossible(t *testing.T) { ) defer resetEnv() - _, err := NewCloud(DefaultClusterCIDR) + _, err := NewCloud(DefaultClusterCIDR, nil) assert.EqualError(t, err, `hcloud/newCloud: Get "http://127.0.0.1:4711/v1/locations?": dial tcp 127.0.0.1:4711: connect: connection refused`) } @@ -135,7 +135,7 @@ func TestNewCloudInvalidToken(t *testing.T) { ) }) - _, err := NewCloud(DefaultClusterCIDR) + _, err := NewCloud(DefaultClusterCIDR, nil) assert.EqualError(t, err, "hcloud/newCloud: unable to authenticate (unauthorized)") } @@ -172,7 +172,7 @@ func TestCloud(t *testing.T) { ) }) - cloud, err := NewCloud(DefaultClusterCIDR) + cloud, err := NewCloud(DefaultClusterCIDR, nil) if err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -229,7 +229,7 @@ func TestCloud(t *testing.T) { ) defer resetEnv() - c, err := NewCloud(DefaultClusterCIDR) + c, err := NewCloud(DefaultClusterCIDR, nil) if err != nil { t.Errorf("%s", err) } diff --git a/hcloud/routes.go b/hcloud/routes.go index 37552330b..99ec3fe8c 100644 --- a/hcloud/routes.go +++ b/hcloud/routes.go @@ -5,19 +5,21 @@ import ( "errors" "fmt" "net" + "slices" "time" "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/record" cloudprovider "k8s.io/cloud-provider" "k8s.io/klog/v2" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/providerid" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) @@ -29,9 +31,10 @@ type routes struct { serverCache *hcops.AllServersCache clusterCIDR *net.IPNet recorder record.EventRecorder + nodeLister corelisters.NodeLister } -func newRoutes(client *hcloud.Client, networkID int64, clusterCIDR string, recorder record.EventRecorder) (*routes, error) { +func newRoutes(client *hcloud.Client, networkID int64, clusterCIDR string, recorder record.EventRecorder, nodeLister corelisters.NodeLister) (*routes, error) { const op = "hcloud/newRoutes" metrics.OperationCalled.WithLabelValues(op).Inc() @@ -60,6 +63,7 @@ func newRoutes(client *hcloud.Client, networkID int64, clusterCIDR string, recor }, clusterCIDR: cidr, recorder: recorder, + nodeLister: nodeLister, }, nil } @@ -105,77 +109,124 @@ func (r *routes) CreateRoute(ctx context.Context, _ string, _ string, route *clo const op = "hcloud/CreateRoute" metrics.OperationCalled.WithLabelValues(op).Inc() - srv, err := r.serverCache.ByName(ctx, string(route.TargetNode)) + node, gateway, err := r.resolveRouteTarget(ctx, string(route.TargetNode)) if err != nil { return fmt.Errorf("%s: %w", op, err) } - privNet, ok := findServerPrivateNetByID(srv, r.network.ID) - if !ok { - r.serverCache.InvalidateCache() - srv, err = r.serverCache.ByName(ctx, string(route.TargetNode)) - if err != nil { - return fmt.Errorf("%s: %w", op, err) - } - - privNet, ok = findServerPrivateNetByID(srv, r.network.ID) - if !ok { - return fmt.Errorf("%s: server %v: network with id %d not attached to this server", op, route.TargetNode, r.network.ID) - } + if !slices.ContainsFunc(route.TargetNodeAddresses, func(target corev1.NodeAddress) bool { + return target.Type == corev1.NodeInternalIP && target.Address == gateway.String() + }) { + return fmt.Errorf("%s: IP %s not part of routes target addresses", op, gateway.String()) } - ip := privNet.IP _, cidr, err := net.ParseCIDR(route.DestinationCIDR) if err != nil { return fmt.Errorf("%s: %w", op, err) } - clusterPrefixLen, _ := r.clusterCIDR.Mask.Size() - destPrefixLen, _ := cidr.Mask.Size() + r.warnCIDRMismatch(cidr, node) - if !r.clusterCIDR.Contains(cidr.IP) || destPrefixLen < clusterPrefixLen { - node := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: string(route.TargetNode), - Namespace: "", - }, - } - // Event is only visible via `kubectl get events` and not `kubectl describe node`, - // as we do not have the UID here and `kubectl describe node` filters by UID. - // Because of this behavior we are also dispatching a log message. - r.recorder.Eventf( - node, - corev1.EventTypeWarning, - "ClusterCIDRMisconfigured", - "route CIDR %s is not contained within cluster CIDR %s", - route.DestinationCIDR, - r.clusterCIDR.String(), - ) - klog.Warningf( - "route CIDR %s is not contained within cluster CIDR %s", - route.DestinationCIDR, - r.clusterCIDR.String(), - ) + return r.upsertRoute(ctx, gateway, cidr, string(route.TargetNode)) +} + +// resolveRouteTarget returns the k8s node and the hcloud server's private IP on the routes +// network — everything needed to create a route for this node (gateway IP) and record events +// against it (node). +// Looks up the server by ProviderID to survive k8s node-name drift, with a ByName fallback for +// ID changes (e.g. server recreated). Refreshes the cache once if the private-net attachment +// isn't yet reflected. +func (r *routes) resolveRouteTarget(ctx context.Context, nodeName string) (*corev1.Node, net.IP, error) { + const op = "hcloud/resolveRouteTarget" + metrics.OperationCalled.WithLabelValues(op).Inc() + + node, err := r.nodeLister.Get(nodeName) + if err != nil { + return nil, nil, fmt.Errorf("%s: %w", op, err) + } + + if node.Spec.ProviderID == "" { + return nil, nil, fmt.Errorf("%s: node %q not yet initialized", op, node.Name) + } + + id, isCloudServer, err := providerid.ToServerID(node.Spec.ProviderID) + if err != nil { + return nil, nil, fmt.Errorf("%s: %w", op, err) + } + if !isCloudServer { + return nil, nil, fmt.Errorf("%s: node %q is not a Cloud server, routes are only supported for Cloud servers", op, node.Name) } - routeExists, err := r.checkIfRouteAlreadyExists(ctx, route) + server, err := r.serverCache.ByID(ctx, id) + if errors.Is(err, hcops.ErrNotFound) { + server, err = r.serverCache.ByName(ctx, node.Name) + } if err != nil { + return nil, nil, fmt.Errorf("%s: %w", op, err) + } + + privNet, ok := findServerPrivateNetByID(server, r.network.ID) + if !ok { + r.serverCache.InvalidateCache() + server, err = r.serverCache.ByID(ctx, server.ID) + if err != nil { + return nil, nil, fmt.Errorf("%s: %w", op, err) + } + privNet, ok = findServerPrivateNetByID(server, r.network.ID) + if !ok { + return nil, nil, fmt.Errorf("%s: server %q: network with id %d not attached to this server", op, node.Name, r.network.ID) + } + } + + return node, privNet.IP, nil +} + +// upsertRoute ensures the hcloud network has a route for cidr pointing at gateway. A matching +// route is a no-op; a stale route with a different gateway is replaced in place. nodeName is +// used only for logging and for surfacing API conflicts against the right k8s object. +func (r *routes) upsertRoute(ctx context.Context, gateway net.IP, cidr *net.IPNet, nodeName string) error { + const op = "hcloud/upsertRoute" + metrics.OperationCalled.WithLabelValues(op).Inc() + + if err := r.reloadNetwork(ctx); err != nil { return fmt.Errorf("%s: %w", op, err) } - if routeExists { + destination := cidr.String() + existingIdx := slices.IndexFunc(r.network.Routes, func(nr hcloud.NetworkRoute) bool { + return nr.Destination.String() == destination + }) + if existingIdx >= 0 { + existing := r.network.Routes[existingIdx] + if existing.Gateway.Equal(gateway) { + klog.InfoS( + "route already exists: skipping creation", + "target-node", nodeName, + "destination-cidr", destination, + ) + return nil + } + + action, _, err := r.client.Network.DeleteRoute(ctx, r.network, hcloud.NetworkDeleteRouteOpts{ + Route: existing, + }) + if err != nil { + return fmt.Errorf("%s: %w", op, err) + } + if err := r.client.Action.WaitFor(ctx, action); err != nil { + return fmt.Errorf("%s: %w", op, err) + } klog.InfoS( - "route already exists: skipping creation", - "target-node", route.TargetNode, - "destination-cidr", route.DestinationCIDR, + "deleted stale route with wrong gateway; recreating", + "target-node", nodeName, + "destination-cidr", destination, ) - return nil } opts := hcloud.NetworkAddRouteOpts{ Route: hcloud.NetworkRoute{ Destination: cidr, - Gateway: ip, + Gateway: gateway, }, } action, _, err := r.client.Network.AddRoute(ctx, r.network, opts) @@ -183,7 +234,7 @@ func (r *routes) CreateRoute(ctx context.Context, _ string, _ string, route *clo if hcloud.IsError(err, hcloud.ErrorCodeLocked, hcloud.ErrorCodeConflict) { return apierrors.NewConflict( corev1.Resource("nodes"), - string(route.TargetNode), + nodeName, err, ) } @@ -220,17 +271,6 @@ func (r *routes) DeleteRoute(ctx context.Context, _ string, route *cloudprovider return fmt.Errorf("%s: %w", op, err) } - err = r.deleteRouteFromHcloud(ctx, cidr, ip) - if err != nil { - return fmt.Errorf("%s: %w", op, err) - } - return nil -} - -func (r *routes) deleteRouteFromHcloud(ctx context.Context, cidr *net.IPNet, ip net.IP) error { - const op = "hcloud/deleteRouteFromHcloud" - metrics.OperationCalled.WithLabelValues(op).Inc() - opts := hcloud.NetworkDeleteRouteOpts{ Route: hcloud.NetworkRoute{ Destination: cidr, @@ -272,43 +312,19 @@ func (r *routes) hcloudRouteToRoute(ctx context.Context, route hcloud.NetworkRou return cpRoute, nil } -func (r *routes) checkIfRouteAlreadyExists(ctx context.Context, route *cloudprovider.Route) (bool, error) { - const op = "hcloud/checkIfRouteAlreadyExists" - metrics.OperationCalled.WithLabelValues(op).Inc() +func (r *routes) warnCIDRMismatch(cidr *net.IPNet, node *corev1.Node) { + clusterPrefixLen, _ := r.clusterCIDR.Mask.Size() + destPrefixLen, _ := cidr.Mask.Size() - if err := r.reloadNetwork(ctx); err != nil { - return false, fmt.Errorf("%s: %w", op, err) - } - - for _, _route := range r.network.Routes { - if _route.Destination.String() == route.DestinationCIDR { - srv, err := r.serverCache.ByName(ctx, string(route.TargetNode)) - if err != nil { - return false, fmt.Errorf("%s: %w", op, err) - } - privNet, ok := findServerPrivateNetByID(srv, r.network.ID) - if !ok { - return false, fmt.Errorf("%s: server %v: no network with id: %d", op, route.TargetNode, r.network.ID) - } - ip := privNet.IP - - if !_route.Gateway.Equal(ip) { - action, _, err := r.client.Network.DeleteRoute(ctx, r.network, hcloud.NetworkDeleteRouteOpts{ - Route: _route, - }) - if err != nil { - return false, fmt.Errorf("%s: %w", op, err) - } - - if err := r.client.Action.WaitFor(ctx, action); err != nil { - return false, fmt.Errorf("%s: %w", op, err) - } - } - - return true, nil - } + if !r.clusterCIDR.Contains(cidr.IP) || destPrefixLen < clusterPrefixLen { + warnMsg := fmt.Sprintf( + "route CIDR %s is not contained within cluster CIDR %s", + cidr.String(), + r.clusterCIDR.String(), + ) + klog.Warning(warnMsg) + r.recorder.Event(node, corev1.EventTypeWarning, "ClusterCIDRMisconfigured", warnMsg) } - return false, nil } func findServerPrivateNetByID(srv *hcloud.Server, id int64) (hcloud.ServerPrivateNet, bool) { diff --git a/hcloud/routes_test.go b/hcloud/routes_test.go index fdf8200a5..4f74006dc 100644 --- a/hcloud/routes_test.go +++ b/hcloud/routes_test.go @@ -6,12 +6,28 @@ import ( "net/http" "testing" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + corelisters "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" cloudprovider "k8s.io/cloud-provider" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" ) +func nodeLister(t *testing.T, nodes ...*corev1.Node) corelisters.NodeLister { + indexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) + for _, n := range nodes { + if err := indexer.Add(n); err != nil { + t.Fatalf("seed node lister: %v", err) + } + } + return corelisters.NewNodeLister(indexer) +} + const DefaultClusterCIDR = "10.244.0.0/16" func TestRoutes_CreateRoute(t *testing.T) { @@ -72,7 +88,11 @@ func TestRoutes_CreateRoute(t *testing.T) { }, }) }) - routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder) + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node15"}, + Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, + } + routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder, nodeLister(t, node)) if err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -81,6 +101,9 @@ func TestRoutes_CreateRoute(t *testing.T) { Name: "route", TargetNode: "node15", DestinationCIDR: "10.5.0.0/24", + TargetNodeAddresses: []corev1.NodeAddress{ + {Type: corev1.NodeInternalIP, Address: "10.0.0.2"}, + }, }) if err != nil { t.Fatalf("Unexpected error: %v", err) @@ -121,7 +144,7 @@ func TestRoutes_ListRoutes(t *testing.T) { }, }) }) - routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder) + routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder, nodeLister(t)) if err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -189,7 +212,7 @@ func TestRoutes_DeleteRoute(t *testing.T) { }, }) }) - routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder) + routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder, nodeLister(t)) if err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -203,3 +226,206 @@ func TestRoutes_DeleteRoute(t *testing.T) { t.Fatalf("Unexpected error: %v", err) } } + +func TestRoutes_CreateRoute_RobotProviderID(t *testing.T) { + env := newTestEnv() + defer env.Teardown() + + env.Mux.HandleFunc("/networks/1", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode(schema.NetworkGetResponse{ + Network: schema.Network{ID: 1, Name: "network-1", IPRange: "10.0.0.0/8"}, + }) + }) + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "robot-node"}, + Spec: corev1.NodeSpec{ProviderID: "hrobot://1"}, + } + + routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder, nodeLister(t, node)) + require.NoError(t, err) + + err = routes.CreateRoute(context.TODO(), "my-cluster", "route", &cloudprovider.Route{ + TargetNode: "robot-node", + DestinationCIDR: "10.5.0.0/24", + }) + require.Error(t, err) + assert.ErrorContains(t, err, "not a Cloud server") +} + +// TestRoutes_CreateRoute_NodeNameDrift proves the routes controller still works when the +// k8s node name differs from the hcloud server name — the core fix in this PR. The server is +// resolved by its immutable ProviderID rather than by k8s node name. +func TestRoutes_CreateRoute_NodeNameDrift(t *testing.T) { + env := newTestEnv() + defer env.Teardown() + + env.Mux.HandleFunc("/servers", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode(schema.ServerListResponse{ + Servers: []schema.Server{ + { + ID: 42, + Name: "original-hostname", // intentionally differs from the k8s node name + PrivateNet: []schema.ServerPrivateNet{ + {Network: 1, IP: "10.0.0.2"}, + }, + }, + }, + }) + }) + env.Mux.HandleFunc("/networks/1", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode(schema.NetworkGetResponse{ + Network: schema.Network{ID: 1, Name: "network-1", IPRange: "10.0.0.0/8"}, + }) + }) + env.Mux.HandleFunc("/actions", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode(schema.ActionListResponse{ + Actions: []schema.Action{{ID: 1, Status: string(hcloud.ActionStatusSuccess), Progress: 100}}, + }) + }) + env.Mux.HandleFunc("/networks/1/actions/add_route", func(w http.ResponseWriter, r *http.Request) { + var reqBody schema.NetworkActionAddRouteRequest + assert.NoError(t, json.NewDecoder(r.Body).Decode(&reqBody)) + assert.Equal(t, "10.0.0.2", reqBody.Gateway) + json.NewEncoder(w).Encode(schema.NetworkActionAddRouteResponse{ + Action: schema.Action{ID: 1, Status: string(hcloud.ActionStatusRunning)}, + }) + }) + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "drifted-name"}, + Spec: corev1.NodeSpec{ProviderID: "hcloud://42"}, + } + + routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder, nodeLister(t, node)) + require.NoError(t, err) + + err = routes.CreateRoute(context.TODO(), "my-cluster", "route", &cloudprovider.Route{ + TargetNode: "drifted-name", + DestinationCIDR: "10.5.0.0/24", + TargetNodeAddresses: []corev1.NodeAddress{ + {Type: corev1.NodeInternalIP, Address: "10.0.0.2"}, + }, + }) + require.NoError(t, err) +} + +// TestRoutes_CreateRoute_ReplaceStaleRoute asserts that a pre-existing route with a wrong +// gateway is deleted and a new one is added in the same call (upsert-in-place). +func TestRoutes_CreateRoute_ReplaceStaleRoute(t *testing.T) { + env := newTestEnv() + defer env.Teardown() + + env.Mux.HandleFunc("/servers", func(w http.ResponseWriter, _ *http.Request) { + err := json.NewEncoder(w).Encode(schema.ServerListResponse{ + Servers: []schema.Server{ + {ID: 1, Name: "node15", PrivateNet: []schema.ServerPrivateNet{{Network: 1, IP: "10.0.0.2"}}}, + }, + }) + assert.NoError(t, err) + }) + env.Mux.HandleFunc("/networks/1", func(w http.ResponseWriter, _ *http.Request) { + err := json.NewEncoder(w).Encode(schema.NetworkGetResponse{ + Network: schema.Network{ + ID: 1, Name: "network-1", IPRange: "10.0.0.0/8", + Routes: []schema.NetworkRoute{ + {Destination: "10.5.0.0/24", Gateway: "10.99.99.99"}, + }, + }, + }) + assert.NoError(t, err) + }) + env.Mux.HandleFunc("/actions", func(w http.ResponseWriter, _ *http.Request) { + err := json.NewEncoder(w).Encode(schema.ActionListResponse{ + Actions: []schema.Action{{ID: 1, Status: string(hcloud.ActionStatusSuccess), Progress: 100}}, + }) + assert.NoError(t, err) + }) + env.Mux.HandleFunc("/networks/1/actions/delete_route", func(w http.ResponseWriter, r *http.Request) { + var reqBody schema.NetworkActionDeleteRouteRequest + assert.NoError(t, json.NewDecoder(r.Body).Decode(&reqBody)) + assert.Equal(t, "10.99.99.99", reqBody.Gateway) + err := json.NewEncoder(w).Encode(schema.NetworkActionDeleteRouteResponse{ + Action: schema.Action{ID: 1, Status: string(hcloud.ActionStatusRunning)}, + }) + assert.NoError(t, err) + }) + env.Mux.HandleFunc("/networks/1/actions/add_route", func(w http.ResponseWriter, r *http.Request) { + var reqBody schema.NetworkActionAddRouteRequest + assert.NoError(t, json.NewDecoder(r.Body).Decode(&reqBody)) + assert.Equal(t, "10.0.0.2", reqBody.Gateway) + err := json.NewEncoder(w).Encode(schema.NetworkActionAddRouteResponse{ + Action: schema.Action{ID: 1, Status: string(hcloud.ActionStatusRunning)}, + }) + assert.NoError(t, err) + }) + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node15"}, + Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, + } + + routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder, nodeLister(t, node)) + require.NoError(t, err) + + err = routes.CreateRoute(context.TODO(), "my-cluster", "route", &cloudprovider.Route{ + TargetNode: "node15", + DestinationCIDR: "10.5.0.0/24", + TargetNodeAddresses: []corev1.NodeAddress{ + {Type: corev1.NodeInternalIP, Address: "10.0.0.2"}, + }, + }) + require.NoError(t, err) +} + +// TestRoutes_CreateRoute_AlreadyExists asserts no API mutations happen when the route already +// matches the desired destination + gateway. +func TestRoutes_CreateRoute_AlreadyExists(t *testing.T) { + env := newTestEnv() + defer env.Teardown() + + env.Mux.HandleFunc("/servers", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode(schema.ServerListResponse{ + Servers: []schema.Server{ + {ID: 1, Name: "node15", PrivateNet: []schema.ServerPrivateNet{{Network: 1, IP: "10.0.0.2"}}}, + }, + }) + }) + env.Mux.HandleFunc("/networks/1", func(w http.ResponseWriter, _ *http.Request) { + json.NewEncoder(w).Encode(schema.NetworkGetResponse{ + Network: schema.Network{ + ID: 1, Name: "network-1", IPRange: "10.0.0.0/8", + Routes: []schema.NetworkRoute{ + {Destination: "10.5.0.0/24", Gateway: "10.0.0.2"}, + }, + }, + }) + }) + + calledAdd, calledDelete := false, false + env.Mux.HandleFunc("/networks/1/actions/delete_route", func(_ http.ResponseWriter, _ *http.Request) { + calledDelete = true + }) + env.Mux.HandleFunc("/networks/1/actions/add_route", func(_ http.ResponseWriter, _ *http.Request) { + calledAdd = true + }) + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node15"}, + Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, + } + + routes, err := newRoutes(env.Client, 1, DefaultClusterCIDR, env.Recorder, nodeLister(t, node)) + require.NoError(t, err) + + err = routes.CreateRoute(context.TODO(), "my-cluster", "route", &cloudprovider.Route{ + TargetNode: "node15", + DestinationCIDR: "10.5.0.0/24", + TargetNodeAddresses: []corev1.NodeAddress{ + {Type: corev1.NodeInternalIP, Address: "10.0.0.2"}, + }, + }) + require.NoError(t, err) + assert.False(t, calledAdd, "route should already exist: /add_route must not be called") + assert.False(t, calledDelete, "route should already exist: /delete_route must not be called") +} diff --git a/internal/hcops/server.go b/internal/hcops/server.go index 7872ed739..13498d05f 100644 --- a/internal/hcops/server.go +++ b/internal/hcops/server.go @@ -32,6 +32,7 @@ type AllServersCache struct { lastRefresh time.Time byPrivIP map[string]*hcloud.Server byName map[string]*hcloud.Server + byID map[int64]*hcloud.Server mu sync.Mutex } @@ -77,6 +78,26 @@ func (c *AllServersCache) ByName(ctx context.Context, name string) (*hcloud.Serv return srv, nil } +// ByID obtains a server from the cache using the servers id. +// +// Note that a pointer to the object stored in the cache is returned. Modifying +// this object affects the cache and all other code parts holding a reference. +// Furthermore, modifying the returned server is not concurrency safe. +func (c *AllServersCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { + const op = "hcops/AllServersCache.ByID" + metrics.OperationCalled.WithLabelValues(op).Inc() + + srv, err := c.getFromCache(ctx, func() (*hcloud.Server, bool) { + srv, ok := c.byID[id] + return srv, ok + }) + if err != nil { + return nil, fmt.Errorf("%s: %d %w", op, id, err) + } + + return srv, nil +} + // getFromCache wraps the cache maps with expiry time and "get-on-unavailable" functionality. func (c *AllServersCache) getFromCache(ctx context.Context, retrieveFromCacheMaps func() (*hcloud.Server, bool)) (*hcloud.Server, error) { const op = "hcops/AllServersCache.getCache" @@ -136,6 +157,7 @@ func (c *AllServersCache) refreshCache(ctx context.Context) error { // Re-initialize all maps. This effectively clears the current cache. c.byPrivIP = make(map[string]*hcloud.Server) c.byName = make(map[string]*hcloud.Server) + c.byID = make(map[int64]*hcloud.Server) for _, server := range servers { // Index servers by the IPs of their private networks @@ -157,6 +179,9 @@ func (c *AllServersCache) refreshCache(ctx context.Context) error { // Index servers by their names. c.byName[server.Name] = server + + // Index servers by their ID. + c.byID[server.ID] = server } c.lastRefresh = time.Now() diff --git a/internal/hcops/server_test.go b/internal/hcops/server_test.go index 142ddceb8..5d205a772 100644 --- a/internal/hcops/server_test.go +++ b/internal/hcops/server_test.go @@ -375,6 +375,12 @@ func newAllServersCacheOps(t *testing.T, srv *hcloud.Server) map[string]allServe } return c.ByName(t.Context(), srv.Name) }, + "ByID": func(c *hcops.AllServersCache) (*hcloud.Server, error) { + if srv.ID == 0 { + t.Fatal("ByID: server has no id") + } + return c.ByID(t.Context(), srv.ID) + }, } } diff --git a/main.go b/main.go index 2905310a0..61dd5eb2e 100644 --- a/main.go +++ b/main.go @@ -62,7 +62,9 @@ func main() { } func cloudInitializer(config *config.CompletedConfig) cloudprovider.Interface { - cloud, err := hcloud.NewCloud(config.ComponentConfig.KubeCloudShared.ClusterCIDR) + nodeLister := config.SharedInformers.Core().V1().Nodes().Lister() + + cloud, err := hcloud.NewCloud(config.ComponentConfig.KubeCloudShared.ClusterCIDR, nodeLister) if err != nil { klog.Fatalf("Cloud provider could not be initialized: %v", err) } From 68bd811213a38dcc70a526c15a953c5fdd142bf0 Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Mon, 27 Apr 2026 14:22:22 +0200 Subject: [PATCH 2/3] refactor: remove ops and improve error msgs --- hcloud/routes.go | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/hcloud/routes.go b/hcloud/routes.go index 99ec3fe8c..02d777827 100644 --- a/hcloud/routes.go +++ b/hcloud/routes.go @@ -111,7 +111,7 @@ func (r *routes) CreateRoute(ctx context.Context, _ string, _ string, route *clo node, gateway, err := r.resolveRouteTarget(ctx, string(route.TargetNode)) if err != nil { - return fmt.Errorf("%s: %w", op, err) + return fmt.Errorf("%s: error resolving route target: %w", op, err) } if !slices.ContainsFunc(route.TargetNodeAddresses, func(target corev1.NodeAddress) bool { @@ -127,7 +127,11 @@ func (r *routes) CreateRoute(ctx context.Context, _ string, _ string, route *clo r.warnCIDRMismatch(cidr, node) - return r.upsertRoute(ctx, gateway, cidr, string(route.TargetNode)) + if err := r.upsertRoute(ctx, gateway, cidr, string(route.TargetNode)); err != nil { + return fmt.Errorf("error upserting route %q via %q: %w", cidr.String(), gateway.String(), err) + } + + return nil } // resolveRouteTarget returns the k8s node and the hcloud server's private IP on the routes @@ -137,24 +141,21 @@ func (r *routes) CreateRoute(ctx context.Context, _ string, _ string, route *clo // ID changes (e.g. server recreated). Refreshes the cache once if the private-net attachment // isn't yet reflected. func (r *routes) resolveRouteTarget(ctx context.Context, nodeName string) (*corev1.Node, net.IP, error) { - const op = "hcloud/resolveRouteTarget" - metrics.OperationCalled.WithLabelValues(op).Inc() - node, err := r.nodeLister.Get(nodeName) if err != nil { - return nil, nil, fmt.Errorf("%s: %w", op, err) + return nil, nil, fmt.Errorf("error fetching node %s by name: %w", nodeName, err) } if node.Spec.ProviderID == "" { - return nil, nil, fmt.Errorf("%s: node %q not yet initialized", op, node.Name) + return nil, nil, fmt.Errorf("node %s not yet initialized", node.Name) } id, isCloudServer, err := providerid.ToServerID(node.Spec.ProviderID) if err != nil { - return nil, nil, fmt.Errorf("%s: %w", op, err) + return nil, nil, fmt.Errorf("error parsing providerID %q for node %s: %w", node.Spec.ProviderID, nodeName, err) } if !isCloudServer { - return nil, nil, fmt.Errorf("%s: node %q is not a Cloud server, routes are only supported for Cloud servers", op, node.Name) + return nil, nil, fmt.Errorf("node %s is not a Cloud server, routes are only supported for Cloud servers", node.Name) } server, err := r.serverCache.ByID(ctx, id) @@ -162,7 +163,7 @@ func (r *routes) resolveRouteTarget(ctx context.Context, nodeName string) (*core server, err = r.serverCache.ByName(ctx, node.Name) } if err != nil { - return nil, nil, fmt.Errorf("%s: %w", op, err) + return nil, nil, fmt.Errorf("error fetching node %s by ID: %w", nodeName, err) } privNet, ok := findServerPrivateNetByID(server, r.network.ID) @@ -170,11 +171,11 @@ func (r *routes) resolveRouteTarget(ctx context.Context, nodeName string) (*core r.serverCache.InvalidateCache() server, err = r.serverCache.ByID(ctx, server.ID) if err != nil { - return nil, nil, fmt.Errorf("%s: %w", op, err) + return nil, nil, fmt.Errorf("error fetching node %s by ID: %w", nodeName, err) } privNet, ok = findServerPrivateNetByID(server, r.network.ID) if !ok { - return nil, nil, fmt.Errorf("%s: server %q: network with id %d not attached to this server", op, node.Name, r.network.ID) + return nil, nil, fmt.Errorf("server %s (%d): network with id %d not attached to this server", server.Name, server.ID, r.network.ID) } } @@ -185,11 +186,8 @@ func (r *routes) resolveRouteTarget(ctx context.Context, nodeName string) (*core // route is a no-op; a stale route with a different gateway is replaced in place. nodeName is // used only for logging and for surfacing API conflicts against the right k8s object. func (r *routes) upsertRoute(ctx context.Context, gateway net.IP, cidr *net.IPNet, nodeName string) error { - const op = "hcloud/upsertRoute" - metrics.OperationCalled.WithLabelValues(op).Inc() - if err := r.reloadNetwork(ctx); err != nil { - return fmt.Errorf("%s: %w", op, err) + return fmt.Errorf("error reloading network: %w", err) } destination := cidr.String() @@ -211,15 +209,16 @@ func (r *routes) upsertRoute(ctx context.Context, gateway net.IP, cidr *net.IPNe Route: existing, }) if err != nil { - return fmt.Errorf("%s: %w", op, err) + return fmt.Errorf("error deleting route for %q via %q: %w", cidr.String(), gateway.String(), err) } if err := r.client.Action.WaitFor(ctx, action); err != nil { - return fmt.Errorf("%s: %w", op, err) + return fmt.Errorf("error deleting route for %q via %q: %w", cidr.String(), gateway.String(), err) } klog.InfoS( "deleted stale route with wrong gateway; recreating", - "target-node", nodeName, - "destination-cidr", destination, + "node", nodeName, + "gateway", gateway, + "cidr", destination, ) } @@ -238,11 +237,11 @@ func (r *routes) upsertRoute(ctx context.Context, gateway net.IP, cidr *net.IPNe err, ) } - return fmt.Errorf("%s: %w", op, err) + return fmt.Errorf("error adding route for %q via %q: %w", cidr.String(), gateway.String(), err) } if err := r.client.Action.WaitFor(ctx, action); err != nil { - return fmt.Errorf("%s: %w", op, err) + return fmt.Errorf("error adding route for %q via %q: %w", cidr.String(), gateway.String(), err) } return nil From ea742f635ff12393c6e34abbc755203b2ab03373 Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Mon, 27 Apr 2026 14:43:09 +0200 Subject: [PATCH 3/3] refactor: error message spelling --- hcloud/routes.go | 8 ++++---- hcloud/routes_test.go | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hcloud/routes.go b/hcloud/routes.go index 02d777827..522c93ea1 100644 --- a/hcloud/routes.go +++ b/hcloud/routes.go @@ -152,10 +152,10 @@ func (r *routes) resolveRouteTarget(ctx context.Context, nodeName string) (*core id, isCloudServer, err := providerid.ToServerID(node.Spec.ProviderID) if err != nil { - return nil, nil, fmt.Errorf("error parsing providerID %q for node %s: %w", node.Spec.ProviderID, nodeName, err) + return nil, nil, fmt.Errorf("error parsing provider id %q for node %s: %w", node.Spec.ProviderID, nodeName, err) } if !isCloudServer { - return nil, nil, fmt.Errorf("node %s is not a Cloud server, routes are only supported for Cloud servers", node.Name) + return nil, nil, fmt.Errorf("node %s is not a cloud server, routes are only supported for cloud servers", node.Name) } server, err := r.serverCache.ByID(ctx, id) @@ -163,7 +163,7 @@ func (r *routes) resolveRouteTarget(ctx context.Context, nodeName string) (*core server, err = r.serverCache.ByName(ctx, node.Name) } if err != nil { - return nil, nil, fmt.Errorf("error fetching node %s by ID: %w", nodeName, err) + return nil, nil, fmt.Errorf("error fetching node %s by id: %w", nodeName, err) } privNet, ok := findServerPrivateNetByID(server, r.network.ID) @@ -171,7 +171,7 @@ func (r *routes) resolveRouteTarget(ctx context.Context, nodeName string) (*core r.serverCache.InvalidateCache() server, err = r.serverCache.ByID(ctx, server.ID) if err != nil { - return nil, nil, fmt.Errorf("error fetching node %s by ID: %w", nodeName, err) + return nil, nil, fmt.Errorf("error fetching node %s by id: %w", nodeName, err) } privNet, ok = findServerPrivateNetByID(server, r.network.ID) if !ok { diff --git a/hcloud/routes_test.go b/hcloud/routes_test.go index 4f74006dc..439f21131 100644 --- a/hcloud/routes_test.go +++ b/hcloud/routes_test.go @@ -250,7 +250,7 @@ func TestRoutes_CreateRoute_RobotProviderID(t *testing.T) { DestinationCIDR: "10.5.0.0/24", }) require.Error(t, err) - assert.ErrorContains(t, err, "not a Cloud server") + assert.ErrorContains(t, err, "not a cloud server") } // TestRoutes_CreateRoute_NodeNameDrift proves the routes controller still works when the