Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 171 additions & 92 deletions devices/ebpf_linux.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,138 @@
package devices

import (
"bytes"
"errors"
"fmt"
"os"
"runtime"
"sync"
"unsafe"

"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)

func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error) {
// bpf is a thin wrapper around the bpf(2) system call. It issues cmd with the
// attribute block that attr points to (size bytes long) and returns the raw
// result value. The error is nil when the syscall reports errno 0, and the
// errno otherwise.
func bpf(cmd uintptr, attr unsafe.Pointer, size uintptr) (uintptr, error) {
	ret, _, errno := unix.Syscall(unix.SYS_BPF, cmd, uintptr(attr), size)
	// Keep the attribute block reachable until the syscall has returned.
	runtime.KeepAlive(attr)
	if errno == 0 {
		// Return a literal nil so callers never see a non-nil error that
		// merely wraps errno 0.
		return ret, nil
	}
	return ret, errno
}

// bpfProgLoad loads a BPF_PROG_TYPE_CGROUP_DEVICE program and returns its fd.
//
// insns is marshalled with nativeEndian and passed to BPF_PROG_LOAD together
// with license. The first attempt is made without a verifier log; if it fails,
// the load is retried once with a 64 KiB log buffer so the verifier output can
// be appended to the returned error. On any failure the returned fd is -1.
//
// An empty instruction list is rejected up front with an error instead of
// being handed to the kernel.
func bpfProgLoad(insns asm.Instructions, license string) (int, error) {
	buf := bytes.NewBuffer(make([]byte, 0, insns.Size()))
	if err := insns.Marshal(buf, nativeEndian); err != nil {
		return -1, err
	}
	insnsBytes := buf.Bytes()
	if len(insnsBytes) == 0 {
		// Taking &insnsBytes[0] below would panic with an index-out-of-range
		// error, so fail cleanly instead.
		return -1, errors.New("bpf_prog_load: no instructions to load")
	}

	licensePtr, err := unix.BytePtrFromString(license)
	if err != nil {
		return -1, err
	}

	// Subset of struct bpf_attr for BPF_PROG_LOAD. Fields past the ones we set
	// are left zero; the kernel zero-fills any part of bpf_attr beyond the size
	// we pass.
	attr := struct {
		progType uint32
		insnCnt  uint32
		insns    uint64 // pointer
		license  uint64 // pointer
		logLevel uint32
		logSize  uint32
		logBuf   uint64 // pointer
	}{
		progType: unix.BPF_PROG_TYPE_CGROUP_DEVICE,
		insnCnt:  uint32(len(insnsBytes) / asm.InstructionSize),
		insns:    uint64(uintptr(unsafe.Pointer(&insnsBytes[0]))),
		license:  uint64(uintptr(unsafe.Pointer(licensePtr))),
	}

	fd, err := bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	// attr holds the pointers as integers, so the GC can't see them; keep the
	// referenced objects alive until the syscall returns.
	runtime.KeepAlive(insnsBytes)
	runtime.KeepAlive(licensePtr)
	if err == nil {
		return int(fd), nil
	}

	// The load failed. Retry with the verifier log enabled so we can include
	// it in the error (the first attempt skips it, as it is the fast path).
	log := make([]byte, 64*1024)
	attr.logLevel = 1
	attr.logSize = uint32(len(log))
	attr.logBuf = uint64(uintptr(unsafe.Pointer(&log[0])))

	fd, err = bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	runtime.KeepAlive(insnsBytes)
	runtime.KeepAlive(licensePtr)
	runtime.KeepAlive(log)
	if err == nil {
		return int(fd), nil
	}
	// The log is treated as a NUL-terminated string; only a non-empty log is
	// attached to the error.
	if n := bytes.IndexByte(log, 0); n > 0 {
		return -1, fmt.Errorf("%w: %s", err, bytes.TrimRight(log[:n], "\n"))
	}
	return -1, err
}

// bpfProgGetFdByID issues BPF_PROG_GET_FD_BY_ID for the BPF program with the
// given id and returns the resulting fd, or -1 together with the error from
// the bpf call.
func bpfProgGetFdByID(id uint32) (int, error) {
	// Only the leading id field of bpf_attr is supplied; the kernel zero-fills
	// the rest of bpf_attr beyond the size we pass.
	type getFdByIDAttr struct {
		id uint32
	}
	attr := getFdByIDAttr{id: id}

	ret, err := bpf(unix.BPF_PROG_GET_FD_BY_ID, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	if err == nil {
		return int(ret), nil
	}
	return -1, err
}

// bpfProgAttach attaches the program progFd to the cgroup cgroupFd as a
// BPF_CGROUP_DEVICE program, using attachFlags. A non-negative replaceFd is
// stored in replaceBpfFd (for BPF_F_REPLACE semantics); a negative one leaves
// that field zero.
func bpfProgAttach(cgroupFd, progFd int, attachFlags uint32, replaceFd int) error {
	type attachAttr struct {
		targetFd     uint32
		attachBpfFd  uint32
		attachType   uint32
		attachFlags  uint32
		replaceBpfFd uint32
	}

	// Work out the replacement fd first so the attribute block can be built
	// in one go.
	var replace uint32
	if replaceFd >= 0 {
		replace = uint32(replaceFd)
	}

	attr := attachAttr{
		targetFd:     uint32(cgroupFd),
		attachBpfFd:  uint32(progFd),
		attachType:   uint32(unix.BPF_CGROUP_DEVICE),
		attachFlags:  attachFlags,
		replaceBpfFd: replace,
	}

	_, err := bpf(unix.BPF_PROG_ATTACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	return err
}

// bpfProgDetach removes the BPF_CGROUP_DEVICE program progFd from the cgroup
// cgroupFd and returns the error from the bpf call, if any.
func bpfProgDetach(cgroupFd, progFd int) error {
	// Only the leading fields of bpf_attr are supplied; the kernel zero-fills
	// the rest of bpf_attr beyond the size we pass.
	type detachAttr struct {
		targetFd    uint32
		attachBpfFd uint32
		attachType  uint32
	}

	var attr detachAttr
	attr.targetFd = uint32(cgroupFd)
	attr.attachBpfFd = uint32(progFd)
	attr.attachType = uint32(unix.BPF_CGROUP_DEVICE)

	_, err := bpf(unix.BPF_PROG_DETACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	return err
}

func findAttachedCgroupDeviceFilters(dirFd int) (_ []int, retErr error) {
type bpfAttrQuery struct {
TargetFd uint32
AttachType uint32
Expand All @@ -37,36 +154,34 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error
ProgCnt: uint32(len(progIds)),
}

// Fetch the list of program ids.
_, _, errno := unix.Syscall(unix.SYS_BPF,
uintptr(unix.BPF_PROG_QUERY),
uintptr(unsafe.Pointer(&query)),
unsafe.Sizeof(query))
// Fetch the list of program ids. bpf() keeps &query alive for the
// duration of the syscall, and query.ProgCnt is read right after.
_, err := bpf(unix.BPF_PROG_QUERY, unsafe.Pointer(&query), unsafe.Sizeof(query))
runtime.KeepAlive(progIds)
size = int(query.ProgCnt)
runtime.KeepAlive(query)
if errno != 0 {
if err != nil {
// On ENOSPC we get the correct number of programs.
if errno == unix.ENOSPC {
if errors.Is(err, unix.ENOSPC) {
retries++
continue
}
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", err)
}

// Convert the ids to program handles.
// On error we don't return the programs slice, so close the fds stored there.
// Convert the ids to program fds.
// On error we don't return the fds slice, so close the fds stored there.
progIds = progIds[:size]
programs := make([]*ebpf.Program, 0, len(progIds))
fds := make([]int, 0, len(progIds))
defer func() {
if retErr != nil {
for _, p := range programs {
p.Close()
for _, fd := range fds {
unix.Close(fd)
}
}
}()

for _, progId := range progIds {
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
fd, err := bpfProgGetFdByID(progId)
if err != nil {
// We skip over programs that give us -EACCES or -EPERM. This
// is necessary because there may be BPF programs that have
Expand All @@ -83,10 +198,10 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error
}
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
}
programs = append(programs, program)
fds = append(fds, fd)
}
runtime.KeepAlive(progIds)
return programs, nil
return fds, nil
}

return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
Expand All @@ -99,23 +214,17 @@ var (

// Loosely based on the BPF_F_REPLACE support check in
// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
//
// TODO: move this logic to cilium/ebpf
Comment on lines -102 to -103

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was there anything in our code that would still be worth upstreaming (even if we don't use it)? I'm not sure how generic our changes are.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code was/is there, just not as a public API.

I guess the only "unique" code we have here is how we use BPF_F_REPLACE. Also, it's not too much code.

See also opencontainers/runc#5218 (comment)

func haveBpfProgReplace() bool {
haveBpfProgReplaceOnce.Do(func() {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
progFd, err := bpfProgLoad(asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
}, "MIT")
if err != nil {
logrus.Warnf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
logrus.Warnf("checking for BPF_F_REPLACE support: bpfProgLoad failed: %v", err)
return
}
defer prog.Close()
defer unix.Close(progFd)

devnull, err := os.Open("/dev/null")
if err != nil {
Expand All @@ -127,24 +236,19 @@ func haveBpfProgReplace() bool {
// We know that we have BPF_PROG_ATTACH since we can load
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
// we know that the feature isn't present.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
// We rely on this fd being checked after attachFlags in the kernel.
Target: int(devnull.Fd()),
// Attempt to "replace" our BPF program with itself. This will
// always fail, but we should get -EINVAL if BPF_F_REPLACE is not
// supported.
Anchor: link.ReplaceProgram(prog),
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI,
})
if errors.Is(err, ebpf.ErrNotSupported) || errors.Is(err, unix.EINVAL) {
//
// We rely on the target fd being checked after attachFlags in the
// kernel. Attempting to "replace" our BPF program with itself always
// fails, but we should get -EINVAL if BPF_F_REPLACE is not supported,
// and -EBADF (from the dummy target fd) if it is.
err = bpfProgAttach(int(devnull.Fd()), progFd, unix.BPF_F_ALLOW_MULTI|unix.BPF_F_REPLACE, progFd)
if errors.Is(err, unix.EINVAL) {
// not supported
return
}
if !errors.Is(err, unix.EBADF) {
// If we see any new errors here, it's possible that there is a
// regression due to a cilium/ebpf update and the above EINVAL
// regression due to a kernel update and the above EINVAL
// checks are not working. So, be loud about it so someone notices
// and we can get the issue fixed quicker.
logrus.Warnf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
Expand All @@ -169,83 +273,58 @@ func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)

// Get the list of existing programs.
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
oldFds, err := findAttachedCgroupDeviceFilters(dirFd)
if err != nil {
return err
}
defer func() {
for _, p := range oldProgs {
p.Close()
for _, fd := range oldFds {
unix.Close(fd)
}
}()

useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
useReplaceProg := haveBpfProgReplace() && len(oldFds) == 1

// Generate new program.
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
progFd, err := bpfProgLoad(insts, license)
if err != nil {
return err
}
defer prog.Close()
// Once the program is attached, the kernel keeps it alive via the cgroup
// attachment, so we no longer need our own fd; we also don't need it if the
// attach below fails. Either way, close it on return.
defer unix.Close(progFd)

// If there is only one old program, we can just replace it directly.

attachProgramOptions := link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI,
}

replaceFd := -1
attachFlags := uint32(unix.BPF_F_ALLOW_MULTI)
if useReplaceProg {
attachProgramOptions.Anchor = link.ReplaceProgram(oldProgs[0])
replaceFd = oldFds[0]
attachFlags |= unix.BPF_F_REPLACE
}
err = link.RawAttachProgram(attachProgramOptions)
err = bpfProgAttach(dirFd, progFd, attachFlags, replaceFd)
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}

if !useReplaceProg {
logLevel := logrus.DebugLevel
// If there was more than one old program, give a warning (since this
// really shouldn't happen with runc-managed cgroups) and then detach
// all the old programs.
if len(oldProgs) > 1 {
if len(oldFds) > 1 {
// NOTE: Ideally this should be a warning but it turns out that
// systemd-managed cgroups trigger this warning (apparently
// systemd doesn't delete old non-systemd programs when
// setting properties).
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldFds))
logLevel = logrus.InfoLevel
}
for idx, oldProg := range oldProgs {
// Output some extra debug info.
if info, err := oldProg.Info(); err == nil {
fields := logrus.Fields{
"type": info.Type.String(),
"tag": info.Tag,
"name": info.Name,
}
if id, ok := info.ID(); ok {
fields["id"] = id
}
if runCount, ok := info.RunCount(); ok {
fields["run_count"] = runCount
}
if runtime, ok := info.Runtime(); ok {
fields["runtime"] = runtime.String()
}
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
}
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: oldProg,
Attach: ebpf.AttachCGroupDevice,
})
for idx, oldFd := range oldFds {
logrus.WithFields(logrus.Fields{
"fd": oldFd,
}).Logf(logLevel, "removing old filter %d from cgroup", idx)
err = bpfProgDetach(dirFd, oldFd)
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
}
Expand Down
9 changes: 9 additions & 0 deletions devices/endian_be.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
//go:build armbe || arm64be || mips || mips64 || mips64p32 || ppc64 || s390 || s390x || sparc || sparc64

package devices

import "encoding/binary"

// nativeEndian is the byte order of the target architecture, fixed at compile
// time: this file's build constraint lists the architectures for which it is
// big-endian.
//
// nativeEndian is used as a workaround for cilium/ebpf/asm
// which does not accept binary.NativeEndian.
var nativeEndian = binary.BigEndian
Comment on lines +7 to +9

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is unfortunate; it makes our code more brittle (if more build tags are ever added). Would this be something that could be fixed in ebpf?

My AI-buddy suggested something like this could work (but haven't verified);

// newBPFRegisters packs dst and src into a single bpfRegisters value, choosing
// which register goes into the high nibble according to bo.
//
// The byte order is identified by probing bo rather than by comparing it to a
// known value: 0x0102 is encoded with PutUint16 and the resulting bytes are
// inspected. An encoding that is neither little- nor big-endian yields an
// error naming bo's type.
func newBPFRegisters(dst, src Register, bo binary.ByteOrder) (bpfRegisters, error) {
	var (
		littleLayout = [2]byte{0x02, 0x01}
		bigLayout    = [2]byte{0x01, 0x02}
	)

	var probe [2]byte
	bo.PutUint16(probe[:], 0x0102)

	if probe == littleLayout {
		return bpfRegisters((src << 4) | (dst & 0xf)), nil
	}
	if probe == bigLayout {
		return bpfRegisters((dst << 4) | (src & 0xf)), nil
	}
	return 0, fmt.Errorf("unrecognized ByteOrder %T", bo)
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In that case, I guess we could do this locally as well if we don't want to depend on our build-tags being complete;

var nativeEndian = detectNativeEndian()

// detectNativeEndian reports the host byte order as either binary.LittleEndian
// or binary.BigEndian. It encodes the 16-bit value 0x0102 with
// binary.NativeEndian and checks in which order the two bytes were written.
// It panics if the layout matches neither order.
func detectNativeEndian() binary.ByteOrder {
	var probe [2]byte
	binary.NativeEndian.PutUint16(probe[:], 0x0102)

	first, second := probe[0], probe[1]
	if first == 0x02 && second == 0x01 {
		// Least significant byte first.
		return binary.LittleEndian
	}
	if first == 0x01 && second == 0x02 {
		// Most significant byte first.
		return binary.BigEndian
	}
	panic("unreachable: invalid native byte order")
}

or

var nativeEndian = detectNativeEndian()

// detectNativeEndian reports the host byte order by looking at the first byte
// in memory of the 16-bit value 0x0102: if it is the low byte (0x02) the
// result is binary.LittleEndian, otherwise binary.BigEndian.
func detectNativeEndian() binary.ByteOrder {
	probe := uint16(0x0102)
	// Reinterpret the address of probe as a byte pointer to read its first
	// byte in memory.
	firstByte := *(*byte)(unsafe.Pointer(&probe))
	if firstByte != 0x02 {
		return binary.BigEndian
	}
	return binary.LittleEndian
}

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially I had a similar code but I prefer to determine that during compile time rather than run time.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, if a whole new architecture is to be added to Golang, fixing this code is trivial.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But, we can have a runtime test to ensure our endian-ness is correct.

Added.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but I prefer to determine that during compile time rather than run time.

Yes, generally agreed. I wish there was a built-in endianness build tag or something (and, yes, initially I thought "why not use binary.NativeEndian?" before realising why we had to do this).

The long list of platforms made me hesitate a bit (easy to miss one!)

But, we can have a runtime test to ensure our endian-ness is correct.

Works for me (for now); still would be great if upstream cilium didn't require us to do this 😅

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

still would be great if upstream cilium didn't require us to do this 😅

I tried: cilium/ebpf#1523

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially I had a similar code but I prefer to determine that during compile time rather than run time.

Why? The overhead should be negligible?

9 changes: 9 additions & 0 deletions devices/endian_le.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
//go:build 386 || amd64 || amd64p32 || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || ppc64le || riscv64 || wasm

package devices

import "encoding/binary"

// nativeEndian is the byte order of the target architecture, fixed at compile
// time: this file's build constraint lists the architectures for which it is
// little-endian.
//
// nativeEndian is used as a workaround for cilium/ebpf/asm
// which does not accept binary.NativeEndian.
var nativeEndian = binary.LittleEndian
Loading
Loading