diff --git a/lib/network/allocate.go b/lib/network/allocate.go index 37afd107..1ff06b0d 100644 --- a/lib/network/allocate.go +++ b/lib/network/allocate.go @@ -6,6 +6,7 @@ import ( "fmt" mathrand "math/rand" "net" + "os" "strings" "time" @@ -58,11 +59,17 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N tap := GenerateTAPName(req.InstanceID) // 5. Create TAP device with bidirectional rate limiting - if err := m.createTAPDevice(tap, network.Bridge, network.Isolated, req.DownloadBps, req.UploadBps, req.UploadCeilBps); err != nil { + classID, err := m.createTAPDevice(ctx, tap, network.Bridge, network.Isolated, req.DownloadBps, req.UploadBps, req.UploadCeilBps) + if err != nil { return nil, fmt.Errorf("create TAP device: %w", err) } m.recordTAPOperation(ctx, "create") + // Persist assigned tc class ID so removal uses the correct ID after collisions. + if classID != "" { + m.saveClassID(req.InstanceID, classID) + } + log.InfoContext(ctx, "allocated network", "instance_id", req.InstanceID, "instance_name", req.InstanceName, @@ -114,11 +121,17 @@ func (m *manager) RecreateAllocation(ctx context.Context, instanceID string, dow // 3. Recreate TAP device with same name and rate limits from instance metadata uploadCeilBps := uploadBps * int64(m.GetUploadBurstMultiplier()) - if err := m.createTAPDevice(alloc.TAPDevice, network.Bridge, network.Isolated, downloadBps, uploadBps, uploadCeilBps); err != nil { + classID, err := m.createTAPDevice(ctx, alloc.TAPDevice, network.Bridge, network.Isolated, downloadBps, uploadBps, uploadCeilBps) + if err != nil { return fmt.Errorf("create TAP device: %w", err) } m.recordTAPOperation(ctx, "create") + // Persist assigned tc class ID so removal uses the correct ID after collisions. + if classID != "" { + m.saveClassID(instanceID, classID) + } + log.InfoContext(ctx, "recreated network for restore", "instance_id", instanceID, "network", "default", @@ -144,8 +157,8 @@ func (m *manager) ReleaseAllocation(ctx context.Context, alloc *Allocation) erro return nil } - // 1. Delete TAP device (best effort) - if err := m.deleteTAPDevice(alloc.TAPDevice); err != nil { + // 1. Delete TAP device (best effort), using stored class ID for correct HTB cleanup + if err := m.deleteTAPDevice(alloc.TAPDevice, alloc.ClassID); err != nil { log.WarnContext(ctx, "failed to delete TAP device", "tap", alloc.TAPDevice, "error", err) } else { m.recordTAPOperation(ctx, "delete") @@ -295,6 +308,23 @@ func generateMAC() (string, error) { buf[0], buf[1], buf[2], buf[3], buf[4], buf[5]), nil } +// saveClassID persists the tc class ID for an instance so it survives restarts. +func (m *manager) saveClassID(instanceID, classID string) { + path := m.paths.InstanceDir(instanceID) + _ = os.WriteFile(path+"/classid", []byte(classID), 0644) +} + +// loadClassID loads the persisted tc class ID for an instance. +// Returns empty string if not found (backwards compat for old allocations). +func (m *manager) loadClassID(instanceID string) string { + path := m.paths.InstanceDir(instanceID) + data, err := os.ReadFile(path + "/classid") + if err != nil { + return "" + } + return strings.TrimSpace(string(data)) +} + // TAPPrefix is the prefix used for hypeman TAP devices const TAPPrefix = "hype-" diff --git a/lib/network/bridge_darwin.go b/lib/network/bridge_darwin.go index 6eec3940..92cd850d 100644 --- a/lib/network/bridge_darwin.go +++ b/lib/network/bridge_darwin.go @@ -35,13 +35,13 @@ func (m *manager) setupBridgeHTB(ctx context.Context, bridgeName string, capacit // createTAPDevice is a no-op on macOS as we use NAT networking. // Virtualization.framework creates virtual network interfaces internally. -func (m *manager) createTAPDevice(tapName, bridgeName string, isolated bool, downloadBps, uploadBps, uploadCeilBps int64) error { +func (m *manager) createTAPDevice(ctx context.Context, tapName, bridgeName string, isolated bool, downloadBps, uploadBps, uploadCeilBps int64) (string, error) { // On macOS with vz, network devices are created by the VMM itself - return nil + return "", nil } // deleteTAPDevice is a no-op on macOS as we use NAT networking. -func (m *manager) deleteTAPDevice(tapName string) error { +func (m *manager) deleteTAPDevice(tapName, classID string) error { return nil } diff --git a/lib/network/bridge_linux.go b/lib/network/bridge_linux.go index f3509327..62831b97 100644 --- a/lib/network/bridge_linux.go +++ b/lib/network/bridge_linux.go @@ -521,12 +521,13 @@ func (m *manager) lastHypemanForwardRulePosition() int { // createTAPDevice creates TAP device and attaches to bridge. // downloadBps: rate limit for download (external→VM), applied as TBF on TAP egress // uploadBps/uploadCeilBps: rate limit for upload (VM→external), applied as HTB class on bridge -func (m *manager) createTAPDevice(tapName, bridgeName string, isolated bool, downloadBps, uploadBps, uploadCeilBps int64) error { +// Returns the tc class ID actually assigned (empty if no upload rate limiting). +func (m *manager) createTAPDevice(ctx context.Context, tapName, bridgeName string, isolated bool, downloadBps, uploadBps, uploadCeilBps int64) (string, error) { // 1. Check if TAP already exists if _, err := netlink.LinkByName(tapName); err == nil { // TAP already exists, delete it first - if err := m.deleteTAPDevice(tapName); err != nil { - return fmt.Errorf("delete existing TAP: %w", err) + if err := m.deleteTAPDevice(tapName, ""); err != nil { + return "", fmt.Errorf("delete existing TAP: %w", err) } } @@ -545,27 +546,27 @@ func (m *manager) createTAPDevice(tapName, bridgeName string, isolated bool, dow } if err := netlink.LinkAdd(tap); err != nil { - return fmt.Errorf("create TAP device: %w", err) + return "", fmt.Errorf("create TAP device: %w", err) } // 3. Set TAP up tapLink, err := netlink.LinkByName(tapName) if err != nil { - return fmt.Errorf("get TAP link: %w", err) + return "", fmt.Errorf("get TAP link: %w", err) } if err := netlink.LinkSetUp(tapLink); err != nil { - return fmt.Errorf("set TAP up: %w", err) + return "", fmt.Errorf("set TAP up: %w", err) } // 4. Attach TAP to bridge bridge, err := netlink.LinkByName(bridgeName) if err != nil { - return fmt.Errorf("get bridge: %w", err) + return "", fmt.Errorf("get bridge: %w", err) } if err := netlink.LinkSetMaster(tapLink, bridge); err != nil { - return fmt.Errorf("attach TAP to bridge: %w", err) + return "", fmt.Errorf("attach TAP to bridge: %w", err) } // 5. Enable port isolation so isolated TAPs can't directly talk to each other (requires kernel support and capabilities) @@ -579,25 +580,28 @@ func (m *manager) createTAPDevice(tapName, bridgeName string, isolated bool, dow } output, err := cmd.CombinedOutput() if err != nil { - return fmt.Errorf("set isolation mode: %w (output: %s)", err, string(output)) + return "", fmt.Errorf("set isolation mode: %w (output: %s)", err, string(output)) } } // 6. Apply download rate limiting (TBF on TAP egress) if downloadBps > 0 { if err := m.applyDownloadRateLimit(tapName, downloadBps); err != nil { - return fmt.Errorf("apply download rate limit: %w", err) + return "", fmt.Errorf("apply download rate limit: %w", err) } } // 7. Apply upload rate limiting (HTB class on bridge) + var classID string if uploadBps > 0 { - if err := m.addVMClass(bridgeName, tapName, uploadBps, uploadCeilBps); err != nil { - return fmt.Errorf("apply upload rate limit: %w", err) + var err error + classID, err = m.addVMClass(ctx, bridgeName, tapName, uploadBps, uploadCeilBps) + if err != nil { + return "", fmt.Errorf("apply upload rate limit: %w", err) } } - return nil + return classID, nil } // applyDownloadRateLimit applies download (external→VM) rate limiting using TBF on TAP egress. @@ -689,65 +693,92 @@ func (m *manager) setupBridgeHTB(ctx context.Context, bridgeName string, capacit // addVMClass adds an HTB class for a VM on the bridge for upload rate limiting. // Called during TAP device creation. rateBps is guaranteed, ceilBps is burst ceiling. -func (m *manager) addVMClass(bridgeName, tapName string, rateBps, ceilBps int64) error { +// Returns the class ID actually used (may differ from deriveClassID if a collision occurred). +func (m *manager) addVMClass(ctx context.Context, bridgeName, tapName string, rateBps, ceilBps int64) (string, error) { if rateBps <= 0 { - return nil // No rate limiting configured + return "", nil // No rate limiting configured } - // Use first 4 hex chars of TAP name suffix as class ID (e.g., "hype-a1b2c3d4" → "a1b2") - // This ensures unique, stable class IDs per VM - classID := deriveClassID(tapName) - fullClassID := fmt.Sprintf("1:%s", classID) - rateStr := formatTcRate(rateBps) if ceilBps <= 0 { ceilBps = rateBps } ceilStr := formatTcRate(ceilBps) - // 1. Add HTB class for this VM - cmd := exec.Command("tc", "class", "add", "dev", bridgeName, "parent", htbRootClassID, - "classid", fullClassID, "htb", "rate", rateStr, "ceil", ceilStr, "prio", "1") - cmd.SysProcAttr = &syscall.SysProcAttr{ - AmbientCaps: []uintptr{unix.CAP_NET_ADMIN}, - } - if output, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("tc class add vm: %w (output: %s)", err, string(output)) - } + // Start with derived class ID, probe linearly on collision. + classIDVal := deriveClassIDVal(tapName) - // 2. Add fq_codel to this class for better latency under load - cmd = exec.Command("tc", "qdisc", "add", "dev", bridgeName, "parent", fullClassID, "fq_codel") - cmd.SysProcAttr = &syscall.SysProcAttr{ - AmbientCaps: []uintptr{unix.CAP_NET_ADMIN}, - } - // Ignore errors - fq_codel may not be available - cmd.Run() + const maxAttempts = 5 + var lastErr error + for attempt := 0; attempt < maxAttempts; attempt++ { + classID := fmt.Sprintf("%04x", classIDVal) + fullClassID := fmt.Sprintf("1:%s", classID) - // 3. Add filter to classify traffic from this TAP to this class - // Use basic match on incoming interface (rt_iif) - tapLink, err := netlink.LinkByName(tapName) - if err != nil { - return fmt.Errorf("get TAP link for filter: %w", err) - } - tapIndex := tapLink.Attrs().Index + // Try tc class add (NOT replace) so we detect collisions. + cmd := exec.Command("tc", "class", "add", "dev", bridgeName, "parent", htbRootClassID, + "classid", fullClassID, "htb", "rate", rateStr, "ceil", ceilStr, "prio", "1") + cmd.SysProcAttr = &syscall.SysProcAttr{ + AmbientCaps: []uintptr{unix.CAP_NET_ADMIN}, + } + output, err := cmd.CombinedOutput() + if err != nil { + // Check for "File exists" collision (exit status 2). + var exitErr *exec.ExitError + if errors.As(err, &exitErr) && exitErr.ExitCode() == 2 && strings.Contains(string(output), "File exists") { + if attempt == 0 { + m.recordTCClassCollision(ctx, "initial") + } else { + m.recordTCClassCollision(ctx, "retry") + } + // Increment class ID, wrapping within valid 16-bit range. + // Skip 0 (invalid) and 1 (root class 1:1). + classIDVal++ + if classIDVal == 0 || classIDVal == 1 { + classIDVal = 2 + } + lastErr = fmt.Errorf("tc class add: %w (output: %s)", err, string(output)) + continue + } + // Non-collision error: return immediately. + return "", fmt.Errorf("tc class add vm: %w (output: %s)", err, string(output)) + } - cmd = exec.Command("tc", "filter", "add", "dev", bridgeName, "parent", htbRootHandle, - "protocol", "all", "prio", "1", "basic", - "match", fmt.Sprintf("meta(rt_iif eq %d)", tapIndex), - "flowid", fullClassID) - cmd.SysProcAttr = &syscall.SysProcAttr{ - AmbientCaps: []uintptr{unix.CAP_NET_ADMIN}, - } - if output, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("tc filter add: %w (output: %s)", err, string(output)) + // Success — add fq_codel and filter. + qdiscCmd := exec.Command("tc", "qdisc", "add", "dev", bridgeName, "parent", fullClassID, "fq_codel") + qdiscCmd.SysProcAttr = &syscall.SysProcAttr{ + AmbientCaps: []uintptr{unix.CAP_NET_ADMIN}, + } + qdiscCmd.Run() // Best effort + + tapLink, linkErr := netlink.LinkByName(tapName) + if linkErr != nil { + return "", fmt.Errorf("get TAP link for filter: %w", linkErr) + } + tapIndex := tapLink.Attrs().Index + + filterCmd := exec.Command("tc", "filter", "add", "dev", bridgeName, "parent", htbRootHandle, + "protocol", "all", "prio", "1", "basic", + "match", fmt.Sprintf("meta(rt_iif eq %d)", tapIndex), + "flowid", fullClassID) + filterCmd.SysProcAttr = &syscall.SysProcAttr{ + AmbientCaps: []uintptr{unix.CAP_NET_ADMIN}, + } + if output, filterErr := filterCmd.CombinedOutput(); filterErr != nil { + return "", fmt.Errorf("tc filter add: %w (output: %s)", filterErr, string(output)) + } + + return classID, nil } - return nil + return "", fmt.Errorf("tc class add failed after %d attempts: %w", maxAttempts, lastErr) } // removeVMClass removes the HTB class for a VM from the bridge. -func (m *manager) removeVMClass(bridgeName, tapName string) error { - classID := deriveClassID(tapName) +// If classID is non-empty, it is used directly; otherwise falls back to deriveClassID. +func (m *manager) removeVMClass(bridgeName, tapName, classID string) error { + if classID == "" { + classID = deriveClassID(tapName) + } fullClassID := fmt.Sprintf("1:%s", classID) // Delete filter first (by matching flowid) @@ -797,18 +828,22 @@ func (m *manager) removeVMClass(bridgeName, tapName string) error { return nil } -// deriveClassID derives a unique HTB class ID from a TAP name. -// Uses first 4 hex characters after the prefix (e.g., "hype-a1b2c3d4" → "a1b2"). -func deriveClassID(tapName string) string { - // Hash the TAP name to get a valid hex class ID. - // tc class IDs must be hexadecimal (0-9, a-f), but CUID2 instance IDs - // use base-36 (0-9, a-z) which includes invalid chars like t, w, v, etc. - // Using FNV-1a for speed. Limited to 16 bits since tc class IDs max at 0xFFFF. +// deriveClassIDVal derives the numeric HTB class ID from a TAP name. +// Uses FNV-1a hash truncated to 16 bits. Returns a value in 0x0002-0xFFFF range, +// avoiding 0 (invalid) and 1 (reserved for root class 1:1). +func deriveClassIDVal(tapName string) uint16 { h := fnv.New32a() h.Write([]byte(tapName)) - hash := h.Sum32() - // Use only 16 bits (tc class ID max is 0xFFFF) - return fmt.Sprintf("%04x", hash&0xFFFF) + val := uint16(h.Sum32() & 0xFFFF) + if val <= 1 { + val = 2 // 0 is invalid, 1 is root class (1:1) + } + return val +} + +// deriveClassID derives a unique HTB class ID string from a TAP name. +func deriveClassID(tapName string) string { + return fmt.Sprintf("%04x", deriveClassIDVal(tapName)) } // formatTcRate formats bytes per second as a tc rate string. @@ -829,9 +864,10 @@ func formatTcRate(bytesPerSec int64) string { } // deleteTAPDevice removes TAP device and its associated HTB class on the bridge. -func (m *manager) deleteTAPDevice(tapName string) error { +// If classID is non-empty, it is used for removeVMClass; otherwise falls back to deriveClassID. +func (m *manager) deleteTAPDevice(tapName, classID string) error { // Remove HTB class from bridge before deleting TAP - m.removeVMClass(m.config.Network.BridgeName, tapName) + m.removeVMClass(m.config.Network.BridgeName, tapName, classID) link, err := netlink.LinkByName(tapName) if err != nil { @@ -925,8 +961,8 @@ func (m *manager) CleanupOrphanedTAPs(ctx context.Context, runningInstanceIDs [] continue } - // Orphaned TAP - delete it - if err := m.deleteTAPDevice(name); err != nil { + // Orphaned TAP - delete it (no stored classID available, falls back to deriveClassID) + if err := m.deleteTAPDevice(name, ""); err != nil { log.WarnContext(ctx, "failed to delete orphaned TAP", "tap", name, "error", err) continue } @@ -953,18 +989,31 @@ func (m *manager) CleanupOrphanedClasses(ctx context.Context) int { return 0 } - // Build set of class IDs that belong to existing TAP devices + // Build set of class IDs that belong to existing TAP devices. + // Include both derived class IDs and stored class IDs from allocations + // (which may differ if a collision was resolved by probing). validClassIDs := make(map[string]bool) links, err := netlink.LinkList() if err == nil { for _, link := range links { name := link.Attrs().Name if strings.HasPrefix(name, TAPPrefix) { - classID := deriveClassID(name) - validClassIDs[classID] = true + validClassIDs[deriveClassID(name)] = true } } } + // Also include stored class IDs from allocations (handles probed IDs). + // If ListAllocations fails, bail out to avoid deleting valid probed classes. + allocs, allocErr := m.ListAllocations(ctx) + if allocErr != nil { + log.WarnContext(ctx, "skipping orphaned class cleanup: failed to list allocations", "error", allocErr) + return 0 + } + for _, alloc := range allocs { + if alloc.ClassID != "" { + validClassIDs[alloc.ClassID] = true + } + } // Parse class output and find orphaned classes // Format: "class htb 1:xxxx parent 1:1 ..." diff --git a/lib/network/derive.go b/lib/network/derive.go index 17f3d83d..7970d03e 100644 --- a/lib/network/derive.go +++ b/lib/network/derive.go @@ -83,6 +83,7 @@ func (m *manager) deriveAllocation(ctx context.Context, instanceID string) (*All Netmask: netmask, DNS: m.config.Network.DNSServer, State: state, + ClassID: m.loadClassID(instanceID), }, nil } diff --git a/lib/network/metrics.go b/lib/network/metrics.go index 48cbd1c0..a76b8d82 100644 --- a/lib/network/metrics.go +++ b/lib/network/metrics.go @@ -9,7 +9,8 @@ import ( // Metrics holds the metrics instruments for network operations. type Metrics struct { - tapOperations metric.Int64Counter + tapOperations metric.Int64Counter + tcClassCollisions metric.Int64Counter } // newNetworkMetrics creates and registers all network metrics. @@ -46,8 +47,17 @@ func newNetworkMetrics(meter metric.Meter, m *manager) (*Metrics, error) { return nil, err } + tcClassCollisions, err := meter.Int64Counter( + "hypeman_network_tc_class_collisions_total", + metric.WithDescription("Total number of tc class ID collisions during addVMClass"), + ) + if err != nil { + return nil, err + } + return &Metrics{ - tapOperations: tapOperations, + tapOperations: tapOperations, + tcClassCollisions: tcClassCollisions, }, nil } @@ -59,3 +69,13 @@ func (m *manager) recordTAPOperation(ctx context.Context, operation string) { m.metrics.tapOperations.Add(ctx, 1, metric.WithAttributes(attribute.String("operation", operation))) } + +// recordTCClassCollision records a tc class ID collision. +// attempt is "initial" for the first hash collision or "retry" for subsequent probe collisions. +func (m *manager) recordTCClassCollision(ctx context.Context, attempt string) { + if m.metrics == nil { + return + } + m.metrics.tcClassCollisions.Add(ctx, 1, + metric.WithAttributes(attribute.String("attempt", attempt))) +} diff --git a/lib/network/types.go b/lib/network/types.go index 7cd28905..a80e8487 100644 --- a/lib/network/types.go +++ b/lib/network/types.go @@ -36,6 +36,7 @@ type Allocation struct { Netmask string // Netmask in dotted decimal notation DNS string // DNS server for guest configuration State string // "running", "standby" (derived from CH or snapshot) + ClassID string // HTB class ID actually assigned (may differ from deriveClassID if collision occurred) } // NetworkConfig is the configuration returned after allocation