Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions lib/instances/network_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ func TestCreateInstanceWithNetwork(t *testing.T) {
t.Log("Initializing network...")
err = manager.networkManager.Initialize(ctx, nil)
require.NoError(t, err)
require.NoError(t, manager.networkManager.SetupHTB(ctx, 100*1024*1024))
t.Log("Network initialized")

// Create instance with nginx:alpine and default network
Expand Down Expand Up @@ -126,6 +127,18 @@ func TestCreateInstanceWithNetwork(t *testing.T) {
require.NoError(t, err)
_, isBridge := master.(*netlink.Bridge)
assert.True(t, isBridge, "TAP should be attached to a bridge")
bridgeName := master.Attrs().Name

t.Log("Verifying orphaned bridge tc cleanup preserves live TAP state...")
liveFlowID := createBridgeTCForTest(t, bridgeName, tap.Attrs().Index)
require.True(t, bridgeClassExists(t, bridgeName, liveFlowID), "live bridge tc class should exist before cleanup")
staleFlowID := createBridgeTCForTest(t, bridgeName, 1)
deletedTC := manager.networkManager.CleanupOrphanedClasses(ctx)
require.GreaterOrEqual(t, deletedTC, 2, "expected stale filter and class to be deleted")
assert.True(t, bridgeFilterExistsForFlowID(t, bridgeName, liveFlowID), "live bridge tc filter should remain")
assert.True(t, bridgeClassExists(t, bridgeName, liveFlowID), "live bridge tc class should remain")
assert.False(t, bridgeFilterExistsForFlowID(t, bridgeName, staleFlowID), "stale bridge tc filter should be deleted")
assert.False(t, bridgeClassExists(t, bridgeName, staleFlowID), "stale bridge tc class should be deleted")

// Wait for nginx to start
t.Log("Waiting for nginx to start...")
Expand Down Expand Up @@ -301,6 +314,123 @@ func TestCreateInstanceWithNetwork(t *testing.T) {
t.Log("Network integration test complete!")
}

func tcForTest(t *testing.T) string {
t.Helper()
if path, err := exec.LookPath("tc"); err == nil {
return path
}
return "/usr/sbin/tc"
}

func runTCForTest(t *testing.T, args ...string) string {
t.Helper()
output, err := exec.Command(tcForTest(t), args...).CombinedOutput()
require.NoError(t, err, "tc %s output: %s", strings.Join(args, " "), string(output))
return string(output)
}

func bridgeClassesForTest(t *testing.T, bridgeName string) []string {
t.Helper()
output := runTCForTest(t, "class", "show", "dev", bridgeName)
var classes []string
for _, line := range strings.Split(output, "\n") {
if !strings.Contains(line, "class htb 1:") {
continue
}
fields := strings.Fields(line)
if len(fields) >= 3 {
classes = append(classes, fields[2])
}
}
return classes
}

func bridgeClassExists(t *testing.T, bridgeName, classID string) bool {
t.Helper()
for _, class := range bridgeClassesForTest(t, bridgeName) {
if class == classID {
return true
}
}
return false
}

func bridgeFilterExistsForFlowID(t *testing.T, bridgeName, flowID string) bool {
t.Helper()
return len(bridgeFilterHandlesForFlowID(t, bridgeName, flowID)) > 0
}

func bridgeFilterHandlesForFlowID(t *testing.T, bridgeName, flowID string) []string {
t.Helper()
output := runTCForTest(t, "filter", "show", "dev", bridgeName, "parent", "1:")
var handles []string
for _, line := range strings.Split(output, "\n") {
if !strings.HasPrefix(line, "filter ") {
continue
}
fields := strings.Fields(line)
handle, gotFlowID := "", ""
for i, field := range fields {
if i+1 >= len(fields) {
break
}
switch field {
case "handle":
handle = fields[i+1]
case "flowid":
gotFlowID = fields[i+1]
}
}
if handle != "" && gotFlowID == flowID {
handles = append(handles, handle)
}
}
return handles
}

func createBridgeTCForTest(t *testing.T, bridgeName string, rtIif int) string {
t.Helper()
used := make(map[string]bool)
for _, classID := range bridgeClassesForTest(t, bridgeName) {
used[classID] = true
}

flowID := ""
for id := 0xff00; id <= 0xffff; id++ {
candidate := fmt.Sprintf("1:%04x", id)
if !used[candidate] {
flowID = candidate
break
}
}
require.NotEmpty(t, flowID, "expected an unused test class id")

t.Cleanup(func() {
bestEffortDeleteBridgeFiltersForFlowID(t, bridgeName, flowID)
_ = exec.Command(tcForTest(t), "qdisc", "del", "dev", bridgeName, "parent", flowID).Run()
_ = exec.Command(tcForTest(t), "class", "del", "dev", bridgeName, "classid", flowID).Run()
})

runTCForTest(t, "class", "add", "dev", bridgeName, "parent", "1:1",
"classid", flowID, "htb", "rate", "1mbit", "ceil", "1mbit")
runTCForTest(t, "qdisc", "add", "dev", bridgeName, "parent", flowID, "fq_codel")
runTCForTest(t, "filter", "add", "dev", bridgeName, "parent", "1:",
"protocol", "all", "prio", "1", "basic",
"match", fmt.Sprintf("meta(rt_iif eq %d)", rtIif), "flowid", flowID)

require.True(t, bridgeClassExists(t, bridgeName, flowID), "staged bridge tc class should exist")
require.True(t, bridgeFilterExistsForFlowID(t, bridgeName, flowID), "staged bridge tc filter should exist")
return flowID
}

func bestEffortDeleteBridgeFiltersForFlowID(t *testing.T, bridgeName, flowID string) {
t.Helper()
for _, handle := range bridgeFilterHandlesForFlowID(t, bridgeName, flowID) {
_ = exec.Command(tcForTest(t), "filter", "del", "dev", bridgeName, "parent", "1:",
"protocol", "all", "prio", "1", "handle", handle, "basic").Run()
}
}

func startRestartPolicyControllerForTest(t *testing.T, ctx context.Context, manager *manager) {
t.Helper()

Expand Down
36 changes: 21 additions & 15 deletions lib/instances/tap_gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,24 +49,30 @@ func (m *manager) reconcileTAPs(ctx context.Context) {
// Initializing, and Unknown. "Better to leave a stale TAP than crash a
// running VM."
insts, err := m.ListInstances(ctx, nil)
var preserve []string
if err != nil {
log.WarnContext(ctx, "TAP GC: failed to list instances, skipping pass", "error", err)
return
}
preserve := make([]string, 0, len(insts))
for _, inst := range insts {
if inst.State == StateRunning || inst.State == StateInitializing || inst.State == StateUnknown {
preserve = append(preserve, inst.Id)
log.WarnContext(ctx, "TAP GC: failed to list instances, skipping TAP pass", "error", err)
} else {
preserve = make([]string, 0, len(insts))
for _, inst := range insts {
if inst.State == StateRunning || inst.State == StateInitializing || inst.State == StateUnknown {
preserve = append(preserve, inst.Id)
}
}
}
if len(preserve) == 0 {
// CleanupOrphanedTAPs short-circuits on empty preserve set to avoid
// clobbering TAPs from concurrent hypeman processes/tests. Mirror that here
// rather than calling it with an empty slice.
log.DebugContext(ctx, "TAP GC: no preserve candidates, skipping pass")
return

if err == nil {
if len(preserve) == 0 {
// CleanupOrphanedTAPs short-circuits on empty preserve set to avoid
// clobbering TAPs from concurrent hypeman processes/tests. Mirror that here
// rather than calling it with an empty slice.
log.DebugContext(ctx, "TAP GC: no preserve candidates, skipping TAP pass")
} else if deleted := m.networkManager.CleanupOrphanedTAPs(ctx, preserve, tapGCMinAge); deleted > 0 {
log.InfoContext(ctx, "TAP GC: cleaned up orphaned TAP devices", "count", deleted)
}
}
if deleted := m.networkManager.CleanupOrphanedTAPs(ctx, preserve, tapGCMinAge); deleted > 0 {
log.InfoContext(ctx, "TAP GC: cleaned up orphaned TAP devices", "count", deleted)

if deleted := m.networkManager.CleanupOrphanedClasses(ctx); deleted > 0 {
log.InfoContext(ctx, "TAP GC: cleaned up orphaned tc filters/classes", "count", deleted)
}
}
28 changes: 11 additions & 17 deletions lib/network/allocate.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func (m *manager) CreateAllocation(ctx context.Context, req AllocateRequest) (*N
err = m.createTAPDevice(tapCtx, netConfig.TAPDevice, network.Bridge, network.Isolated)
tapSpanEnd(err)
if err != nil {
cleanupErr := m.deleteTAPDeviceSerialized(netConfig.TAPDevice, "")
cleanupErr := m.deleteTAPDeviceForInstanceSerialized(ctx, req.InstanceID, netConfig.TAPDevice)
if cleanupErr != nil {
log.WarnContext(ctx, "failed to clean up TAP after create failure", "tap", netConfig.TAPDevice, "error", cleanupErr)
}
Expand Down Expand Up @@ -146,7 +146,7 @@ func (m *manager) RecreateAllocation(ctx context.Context, instanceID string, dow
err = m.createTAPDevice(tapCtx, alloc.TAPDevice, network.Bridge, network.Isolated)
tapSpanEnd(err)
if err != nil {
_ = m.deleteTAPDeviceSerialized(alloc.TAPDevice, alloc.ClassID)
_ = m.deleteTAPDeviceForInstanceSerialized(ctx, instanceID, alloc.TAPDevice)
return fmt.Errorf("create TAP device: %w", err)
}
m.recordTAPOperation(ctx, "create")
Expand Down Expand Up @@ -386,10 +386,10 @@ func (m *manager) ReleaseAllocation(ctx context.Context, alloc *Allocation) erro
}
m.forgetPendingAllocation(alloc.InstanceID)

// 1. Delete TAP device (best effort), using stored class ID for correct HTB cleanup.
// 1. Delete TAP device (best effort), using the TAP's live tc filter for HTB cleanup.
// Serialize with async tc setup so queued rate-limit work cannot race with
// class removal and leave stale bridge state behind.
if err := m.deleteTAPDeviceForInstanceSerialized(alloc.InstanceID, alloc.TAPDevice, alloc.ClassID); err != nil {
if err := m.deleteTAPDeviceForInstanceSerialized(ctx, alloc.InstanceID, alloc.TAPDevice); err != nil {
log.WarnContext(ctx, "failed to delete TAP device", "tap", alloc.TAPDevice, "error", err)
} else {
m.recordTAPOperation(ctx, "delete")
Expand All @@ -403,26 +403,20 @@ func (m *manager) ReleaseAllocation(ctx context.Context, alloc *Allocation) erro
return nil
}

func (m *manager) deleteTAPDeviceSerialized(tapName, classID string) error {
func (m *manager) deleteTAPDeviceSerialized(ctx context.Context, tapName string) error {
m.tcMu.Lock()
defer m.tcMu.Unlock()
return m.deleteTAPDevice(tapName, classID)
return m.deleteTAPDevice(ctx, tapName)
}

func (m *manager) deleteTAPDeviceForInstanceSerialized(instanceID, tapName, classID string) error {
func (m *manager) deleteTAPDeviceForInstanceSerialized(ctx context.Context, instanceID, tapName string) error {
m.tcMu.Lock()
defer m.tcMu.Unlock()
return m.deleteTAPDevice(tapName, m.classIDForDelete(instanceID, classID))
}

func (m *manager) classIDForDelete(instanceID, fallback string) string {
if instanceID == "" {
return fallback
}
if classID := m.loadClassID(instanceID); classID != "" {
return classID
if err := m.deleteTAPDevice(ctx, tapName); err != nil {
return err
}
return fallback
m.clearClassID(instanceID)
return nil
}

// getOrInitDefaultNetwork resolves the default network and self-heals by running
Expand Down
6 changes: 5 additions & 1 deletion lib/network/bridge_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (m *manager) addVMClass(ctx context.Context, bridgeName, tapName string, ra
}

// deleteTAPDevice is a no-op on macOS as we use NAT networking.
func (m *manager) deleteTAPDevice(tapName, classID string) error {
func (m *manager) deleteTAPDevice(ctx context.Context, tapName string) error {
return nil
}

Expand Down Expand Up @@ -79,3 +79,7 @@ func (m *manager) CleanupOrphanedTAPs(ctx context.Context, preserveInstanceIDs [
func (m *manager) CleanupOrphanedClasses(ctx context.Context) int {
return 0
}

func (m *manager) bridgeHTBClassCount(ctx context.Context) (int64, error) {
return 0, nil
}
Loading
Loading