opencontainers · kolyshkin · Jun 23, 2026 · thaJeztah · Jun 23, 2026 · kolyshkin
@@ -1,21 +1,138 @@
 package devices
 
 import (
+	"bytes"
 	"errors"
 	"fmt"
 	"os"
 	"runtime"
 	"sync"
 	"unsafe"
 
-	"github.com/cilium/ebpf"
 	"github.com/cilium/ebpf/asm"
-	"github.com/cilium/ebpf/link"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
-func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error) {
+func bpf(cmd uintptr, attr unsafe.Pointer, size uintptr) (uintptr, error) {
+	r1, _, err := unix.Syscall(unix.SYS_BPF, cmd, uintptr(attr), size)
+	runtime.KeepAlive(attr)
+	if err != 0 {
+		return r1, err
+	}
+	return r1, nil
+}
+
+// bpfProgLoad loads a BPF_PROG_TYPE_CGROUP_DEVICE program and returns its fd.
+func bpfProgLoad(insns asm.Instructions, license string) (int, error) {
+	buf := bytes.NewBuffer(make([]byte, 0, insns.Size()))
+	if err := insns.Marshal(buf, nativeEndian); err != nil {
+		return -1, err
+	}
+	insnsBytes := buf.Bytes()
+
+	licensePtr, err := unix.BytePtrFromString(license)
+	if err != nil {
+		return -1, err
+	}
+
+	// Subset of struct bpf_attr for BPF_PROG_LOAD. Fields past the ones we set
+	// are left zero; the kernel zero-fills any part of bpf_attr beyond the size
+	// we pass.
+	attr := struct {
+		progType uint32
+		insnCnt  uint32
+		insns    uint64 // pointer
+		license  uint64 // pointer
+		logLevel uint32
+		logSize  uint32
+		logBuf   uint64 // pointer
+	}{
+		progType: unix.BPF_PROG_TYPE_CGROUP_DEVICE,
+		insnCnt:  uint32(len(insnsBytes) / asm.InstructionSize),
+		insns:    uint64(uintptr(unsafe.Pointer(&insnsBytes[0]))),
+		license:  uint64(uintptr(unsafe.Pointer(licensePtr))),
+	}
+
+	fd, err := bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+	// attr holds the pointers as integers, so the GC can't see them; keep the
+	// referenced objects alive until the syscall returns.
+	runtime.KeepAlive(insnsBytes)
+	runtime.KeepAlive(licensePtr)
+	if err == nil {
+		return int(fd), nil
+	}
+
+	// The load failed. Retry with the verifier log enabled so we can include
+	// it in the error (the first attempt skips it, as it is the fast path).
+	log := make([]byte, 64*1024)
+	attr.logLevel = 1
+	attr.logSize = uint32(len(log))
+	attr.logBuf = uint64(uintptr(unsafe.Pointer(&log[0])))
+
+	fd, err = bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+	runtime.KeepAlive(insnsBytes)
+	runtime.KeepAlive(licensePtr)
+	runtime.KeepAlive(log)
+	if err == nil {
+		return int(fd), nil
+	}
+	if n := bytes.IndexByte(log, 0); n > 0 {
+		return -1, fmt.Errorf("%w: %s", err, bytes.TrimRight(log[:n], "\n"))
+	}
+	return -1, err
+}
+
+// bpfProgGetFdByID returns the fd for the BPF program with the given ID.
+func bpfProgGetFdByID(id uint32) (int, error) {
+	// The kernel zero-fills the rest of bpf_attr beyond the size we pass.
+	attr := struct{ id uint32 }{id}
+	fd, err := bpf(unix.BPF_PROG_GET_FD_BY_ID, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+	if err != nil {
+		return -1, err
+	}
+	return int(fd), nil
+}
+
+// bpfProgAttach attaches progFd to cgroupFd with the given flags. If replaceFd
+// is >= 0, its fd is set in replaceBpfFd (for BPF_F_REPLACE semantics).
+func bpfProgAttach(cgroupFd, progFd int, attachFlags uint32, replaceFd int) error {
+	attr := struct {
+		targetFd     uint32
+		attachBpfFd  uint32
+		attachType   uint32
+		attachFlags  uint32
+		replaceBpfFd uint32
+	}{
+		targetFd:    uint32(cgroupFd),
+		attachBpfFd: uint32(progFd),
+		attachType:  uint32(unix.BPF_CGROUP_DEVICE),
+		attachFlags: attachFlags,
+	}
+	if replaceFd >= 0 {
+		attr.replaceBpfFd = uint32(replaceFd)
+	}
+	_, err := bpf(unix.BPF_PROG_ATTACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+	return err
+}
+
+// bpfProgDetach detaches progFd from cgroupFd.
+func bpfProgDetach(cgroupFd, progFd int) error {
+	// The kernel zero-fills the rest of bpf_attr beyond the size we pass.
+	attr := struct {
+		targetFd    uint32
+		attachBpfFd uint32
+		attachType  uint32
+	}{
+		targetFd:    uint32(cgroupFd),
+		attachBpfFd: uint32(progFd),
+		attachType:  uint32(unix.BPF_CGROUP_DEVICE),
+	}
+	_, err := bpf(unix.BPF_PROG_DETACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+	return err
+}
+
+func findAttachedCgroupDeviceFilters(dirFd int) (_ []int, retErr error) {
 	type bpfAttrQuery struct {
 		TargetFd    uint32
 		AttachType  uint32
@@ -37,36 +154,34 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error
 			ProgCnt:    uint32(len(progIds)),
 		}
 
-		// Fetch the list of program ids.
-		_, _, errno := unix.Syscall(unix.SYS_BPF,
-			uintptr(unix.BPF_PROG_QUERY),
-			uintptr(unsafe.Pointer(&query)),
-			unsafe.Sizeof(query))
+		// Fetch the list of program ids. bpf() keeps &query alive for the
+		// duration of the syscall, and query.ProgCnt is read right after.
+		_, err := bpf(unix.BPF_PROG_QUERY, unsafe.Pointer(&query), unsafe.Sizeof(query))
+		runtime.KeepAlive(progIds)
 		size = int(query.ProgCnt)
-		runtime.KeepAlive(query)
-		if errno != 0 {
+		if err != nil {
 			// On ENOSPC we get the correct number of programs.
-			if errno == unix.ENOSPC {
+			if errors.Is(err, unix.ENOSPC) {
 				retries++
 				continue
 			}
-			return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
+			return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", err)
 		}
 
-		// Convert the ids to program handles.
-		// On error we don't return the programs slice, so close the fds stored there.
+		// Convert the ids to program fds.
+		// On error we don't return the fds slice, so close the fds stored there.
 		progIds = progIds[:size]
-		programs := make([]*ebpf.Program, 0, len(progIds))
+		fds := make([]int, 0, len(progIds))
 		defer func() {
 			if retErr != nil {
-				for _, p := range programs {
-					p.Close()
+				for _, fd := range fds {
+					unix.Close(fd)
 				}
 			}
 		}()
 
 		for _, progId := range progIds {
-			program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
+			fd, err := bpfProgGetFdByID(progId)
 			if err != nil {
 				// We skip over programs that give us -EACCES or -EPERM. This
 				// is necessary because there may be BPF programs that have
@@ -83,10 +198,10 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error
 				}
 				return nil, fmt.Errorf("cannot fetch program from id: %w", err)
 			}
-			programs = append(programs, program)
+			fds = append(fds, fd)
 		}
 		runtime.KeepAlive(progIds)
-		return programs, nil
+		return fds, nil
 	}
 
 	return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
@@ -99,23 +214,17 @@ var (
 
 // Loosely based on the BPF_F_REPLACE support check in
 // https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
-//
-// TODO: move this logic to cilium/ebpf
 func haveBpfProgReplace() bool {
 	haveBpfProgReplaceOnce.Do(func() {
-		prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
-			Type:    ebpf.CGroupDevice,
-			License: "MIT",
-			Instructions: asm.Instructions{
-				asm.Mov.Imm(asm.R0, 0),
-				asm.Return(),
-			},
-		})
+		progFd, err := bpfProgLoad(asm.Instructions{
+			asm.Mov.Imm(asm.R0, 0),
+			asm.Return(),
+		}, "MIT")
 		if err != nil {
-			logrus.Warnf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
+			logrus.Warnf("checking for BPF_F_REPLACE support: bpfProgLoad failed: %v", err)
 			return
 		}
-		defer prog.Close()
+		defer unix.Close(progFd)
 
 		devnull, err := os.Open("/dev/null")
 		if err != nil {
@@ -127,24 +236,19 @@ func haveBpfProgReplace() bool {
 		// We know that we have BPF_PROG_ATTACH since we can load
 		// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
 		// we know that the feature isn't present.
-		err = link.RawAttachProgram(link.RawAttachProgramOptions{
-			// We rely on this fd being checked after attachFlags in the kernel.
-			Target: int(devnull.Fd()),
-			// Attempt to "replace" our BPF program with itself. This will
-			// always fail, but we should get -EINVAL if BPF_F_REPLACE is not
-			// supported.
-			Anchor:  link.ReplaceProgram(prog),
-			Program: prog,
-			Attach:  ebpf.AttachCGroupDevice,
-			Flags:   unix.BPF_F_ALLOW_MULTI,
-		})
-		if errors.Is(err, ebpf.ErrNotSupported) || errors.Is(err, unix.EINVAL) {
+		//
+		// We rely on the target fd being checked after attachFlags in the
+		// kernel. Attempting to "replace" our BPF program with itself always
+		// fails, but we should get -EINVAL if BPF_F_REPLACE is not supported,
+		// and -EBADF (from the dummy target fd) if it is.
+		err = bpfProgAttach(int(devnull.Fd()), progFd, unix.BPF_F_ALLOW_MULTI|unix.BPF_F_REPLACE, progFd)
+		if errors.Is(err, unix.EINVAL) {
 			// not supported
 			return
 		}
 		if !errors.Is(err, unix.EBADF) {
 			// If we see any new errors here, it's possible that there is a
-			// regression due to a cilium/ebpf update and the above EINVAL
+			// regression due to a kernel update and the above EINVAL
 			// checks are not working. So, be loud about it so someone notices
 			// and we can get the issue fixed quicker.
 			logrus.Warnf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
@@ -169,83 +273,58 @@ func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd
 	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
 
 	// Get the list of existing programs.
-	oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
+	oldFds, err := findAttachedCgroupDeviceFilters(dirFd)
 	if err != nil {
 		return err
 	}
 	defer func() {
-		for _, p := range oldProgs {
-			p.Close()
+		for _, fd := range oldFds {
+			unix.Close(fd)
 		}
 	}()
 
-	useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
+	useReplaceProg := haveBpfProgReplace() && len(oldFds) == 1
 
 	// Generate new program.
-	spec := &ebpf.ProgramSpec{
-		Type:         ebpf.CGroupDevice,
-		Instructions: insts,
-		License:      license,
-	}
-	prog, err := ebpf.NewProgram(spec)
+	progFd, err := bpfProgLoad(insts, license)
 	if err != nil {
 		return err
 	}
-	defer prog.Close()
+	// Once the program is attached, the kernel keeps it alive via the cgroup
+	// attachment, so we no longer need our own fd; we also don't need it if the
+	// attach below fails. Either way, close it on return.
+	defer unix.Close(progFd)
 
 	// If there is only one old program, we can just replace it directly.
-
-	attachProgramOptions := link.RawAttachProgramOptions{
-		Target:  dirFd,
-		Program: prog,
-		Attach:  ebpf.AttachCGroupDevice,
-		Flags:   unix.BPF_F_ALLOW_MULTI,
-	}
-
+	replaceFd := -1
+	attachFlags := uint32(unix.BPF_F_ALLOW_MULTI)
 	if useReplaceProg {
-		attachProgramOptions.Anchor = link.ReplaceProgram(oldProgs[0])
+		replaceFd = oldFds[0]
+		attachFlags |= unix.BPF_F_REPLACE
 	}
-	err = link.RawAttachProgram(attachProgramOptions)
+	err = bpfProgAttach(dirFd, progFd, attachFlags, replaceFd)
 	if err != nil {
 		return fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
 	}
+
 	if !useReplaceProg {
 		logLevel := logrus.DebugLevel
 		// If there was more than one old program, give a warning (since this
 		// really shouldn't happen with runc-managed cgroups) and then detach
 		// all the old programs.
-		if len(oldProgs) > 1 {
+		if len(oldFds) > 1 {
 			// NOTE: Ideally this should be a warning but it turns out that
 			//       systemd-managed cgroups trigger this warning (apparently
 			//       systemd doesn't delete old non-systemd programs when
 			//       setting properties).
-			logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
+			logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldFds))
 			logLevel = logrus.InfoLevel
 		}
-		for idx, oldProg := range oldProgs {
-			// Output some extra debug info.
-			if info, err := oldProg.Info(); err == nil {
-				fields := logrus.Fields{
-					"type": info.Type.String(),
-					"tag":  info.Tag,
-					"name": info.Name,
-				}
-				if id, ok := info.ID(); ok {
-					fields["id"] = id
-				}
-				if runCount, ok := info.RunCount(); ok {
-					fields["run_count"] = runCount
-				}
-				if runtime, ok := info.Runtime(); ok {
-					fields["runtime"] = runtime.String()
-				}
-				logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
-			}
-			err = link.RawDetachProgram(link.RawDetachProgramOptions{
-				Target:  dirFd,
-				Program: oldProg,
-				Attach:  ebpf.AttachCGroupDevice,
-			})
+		for idx, oldFd := range oldFds {
+			logrus.WithFields(logrus.Fields{
+				"fd": oldFd,
+			}).Logf(logLevel, "removing old filter %d from cgroup", idx)
+			err = bpfProgDetach(dirFd, oldFd)
 			if err != nil {
 				return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
 			}

@@ -0,0 +1,9 @@
+//go:build armbe || arm64be || mips || mips64 || mips64p32 || ppc64 || s390 || s390x || sparc || sparc64
+
+package devices
+
+import "encoding/binary"
+
+// nativeEndian is used as a workaround for cilium/ebpf/asm
+// which does not accept binary.NativeEndian.
+var nativeEndian = binary.BigEndian
@@ -0,0 +1,9 @@
+//go:build 386 || amd64 || amd64p32 || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || ppc64le || riscv64 || wasm
+
+package devices
+
+import "encoding/binary"
+
+// nativeEndian is used as a workaround for cilium/ebpf/asm
+// which does not accept binary.NativeEndian.
+var nativeEndian = binary.LittleEndian