From 2e81f834af0ba4807d570b740643fb1e4107c819 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Mon, 8 Sep 2025 09:18:36 -0600 Subject: [PATCH] v2: set memory.oom.group => OOMPolicy in systemd We are interested in using memory.oom.cgroup, but need it to be set systemd because of [1], so let's set it. There are a few caveats, in no particular order: A. systemd does not allow OOMPolicy to be set on units that have already started, so we must do this in Apply() instead of Set(). B. As the comment suggests, OOMPolicy has three states (continue, stop, kill), where kill maps to memory.oom.group=1, and continue maps to =0. However, the bit about `runc update` doesn't quite make sense: the values will only ever be expressed in terms of memory.oom.group, so we only need to map the continue and kill values, which have direct mappings. Note that `runc update` here doesn't make sense anyway: because of (A), we cannot update these values. Perhaps we should reject these updates since systemd will? (Or maybe we try to update and just error out, in the event that systemd eventually allows this? The kernel allows updating it, the reason the systemd semantics have diverged is unclear.) C. systemd only gained support for setting OOMPolicy on scopes in versions >= 253; versions before this will fail. So, let's add a bit allowing the setup of OOMPolicy to Apply(), and ignore it in Set() -> genV2ResourcesProperties() -> unifiedResToSystemdProps(). [1]: This arguably is more important than the debug-level warning would suggest: if someone does the equivalent of a `systemctl daemon-reload`, systemd will reset our manually-via-cgroupfs set value to 0, because we did not explicitly set it in the service / scope definition, meaning that individual tasks will not actually oom the whole cgroup when they oom. Co-authored-by: Ethan Adams Signed-off-by: Tycho Andersen [halaney: Address review comments and fix unit test] Signed-off-by: Andrew Halaney --- systemd/systemd_test.go | 121 ++++++++++++++++++++++++++++++++++++++++ systemd/v2.go | 35 +++++++++--- 2 files changed, 148 insertions(+), 8 deletions(-) diff --git a/systemd/systemd_test.go b/systemd/systemd_test.go index 156e55e..5d34a86 100644 --- a/systemd/systemd_test.go +++ b/systemd/systemd_test.go @@ -2,7 +2,9 @@ package systemd import ( "os" + "os/exec" "reflect" + "strconv" "testing" systemdDbus "github.com/coreos/go-systemd/v22/dbus" @@ -157,6 +159,12 @@ func TestUnifiedResToSystemdProps(t *testing.T) { newProp("CPUWeight", uint64(1000)), }, }, + { + name: "memory.oom.group handled by Apply method", + res: map[string]string{ + "memory.oom.group": "1", + }, + }, } for _, tc := range testCases { @@ -272,3 +280,116 @@ func TestTasksMax(t *testing.T) { t.Fatalf("failed to set PidsLimit=nil: %v", err) } } + +func TestOOMPolicyApply(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v2 is required") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + cm := newDbusConnManager(false) + if systemdVersion(cm) < oomPolicySupportedVersion { + t.Skipf("Test requires systemd >= %d (OOMPolicy on scopes)", oomPolicySupportedVersion) + } + + testCases := []struct { + name string + oomGroupValue string + expectedPolicy string + expectError bool + }{ + { + name: "memory.oom.group=0 sets OOMPolicy=continue", + oomGroupValue: "0", + expectedPolicy: "continue", + expectError: false, + }, + { + name: "memory.oom.group=1 sets OOMPolicy=kill", + oomGroupValue: "1", + expectedPolicy: "kill", + expectError: false, + }, + { + name: "invalid memory.oom.group value", + oomGroupValue: "invalid", + expectError: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + config := &cgroups.Cgroup{ + ScopePrefix: "test", + Name: "test-oom-policy-" + strconv.FormatInt(int64(os.Getpid()), 10), + Resources: &cgroups.Resources{ + Unified: map[string]string{ + "memory.oom.group": tc.oomGroupValue, + }, + }, + } + + manager, err := NewUnifiedManager(config, "") + if err != nil { + t.Fatalf("Failed to create manager: %v", err) + } + defer func() { + _ = manager.Destroy() + }() + + // Scopes require a process inside. + cmd := exec.Command("bash", "-c", "sleep 1m") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + err = manager.Apply(cmd.Process.Pid) + if tc.expectError { + if err == nil { + t.Fatal("Expected error but got none") + } + return + } + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + + unitName := getUnitName(config) + conn, err := systemdDbus.NewSystemdConnectionContext(t.Context()) + if err != nil { + t.Fatalf("Failed to connect to systemd: %v", err) + } + defer conn.Close() + + properties, err := conn.GetUnitTypePropertiesContext(t.Context(), unitName, "Scope") + if err != nil { + t.Fatalf("Failed to get unit type properties: %v", err) + } + + oomPolicyValue, exists := properties["OOMPolicy"] + if !exists { + t.Fatal("OOMPolicy property not found") + } + + oomPolicyStr, ok := oomPolicyValue.(string) + if !ok { + t.Fatalf("OOMPolicy value is not a string: %T", oomPolicyValue) + } + + if oomPolicyStr != tc.expectedPolicy { + t.Errorf("Expected OOMPolicy=%s, got %s", tc.expectedPolicy, oomPolicyStr) + } + }) + } +} diff --git a/systemd/v2.go b/systemd/v2.go index cb71e9e..bdfd446 100644 --- a/systemd/v2.go +++ b/systemd/v2.go @@ -20,7 +20,8 @@ import ( ) const ( - cpuIdleSupportedVersion = 252 + cpuIdleSupportedVersion = 252 + oomPolicySupportedVersion = 253 ) type UnifiedManager struct { @@ -183,13 +184,8 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props newProp("TasksMax", num)) case "memory.oom.group": - // Setting this to 1 is roughly equivalent to OOMPolicy=kill - // (as per systemd.service(5) and - // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), - // but it's not clear what to do if it is unset or set - // to 0 in runc update, as there are two other possible - // values for OOMPolicy (continue/stop). - fallthrough + // This was set before the unit started, so no need to + // warn about it here. default: // Ignore the unknown resource here -- will still be @@ -338,6 +334,29 @@ func (m *UnifiedManager) Apply(pid int) error { properties = append(properties, c.SystemdProps...) + // OOMPolicy must be set at unit creation time, systemd does not allow + // changing it on a running scope for some reason despite the kernel allowing so + // so we do this here in Apply() instead of Set() + if c.Resources != nil { + if v, ok := c.Resources.Unified["memory.oom.group"]; ok { + if systemdVersion(m.dbus) >= oomPolicySupportedVersion { + value, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return fmt.Errorf("unified resource %q value conversion error: %w", "memory.oom.group", err) + } + + switch value { + case 0: + properties = append(properties, newProp("OOMPolicy", "continue")) + case 1: + properties = append(properties, newProp("OOMPolicy", "kill")) + default: + logrus.Debugf("don't know how to convert memory.oom.group=%d; skipping (will still be applied to cgroupfs)", value) + } + } + } + } + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) }