diff --git a/.github/workflows/job-test-in-container.yml b/.github/workflows/job-test-in-container.yml index be742f8ab00..6b69d978028 100644 --- a/.github/workflows/job-test-in-container.yml +++ b/.github/workflows/job-test-in-container.yml @@ -151,6 +151,27 @@ jobs: sudo mkdir -p /etc/docker echo '{"ipv6": true, "fixed-cidr-v6": "2001:db8:1::/64", "experimental": true, "ip6tables": true}' | sudo tee /etc/docker/daemon.json sudo systemctl restart docker + - name: "Debug: DBUS availability in container" + run: | + echo "=== DBUS Debugging in Container ===" + [ "${{ inputs.target }}" == "rootful" ] \ + && args=(test-integration) \ + || args=(test-integration-${{ inputs.target }}) + docker run -t --rm --privileged "${args[@]}" bash -c ' + echo "--- DBUS Tools Availability ---" + which dbus-launch dbus-daemon dbus-send dbus-monitor systemd-run || true + echo "--- DBUS Binaries Location ---" + ls -la /usr/bin/dbus* 2>/dev/null || echo "No DBUS tools in /usr/bin" + ls -la /bin/dbus* 2>/dev/null || echo "No DBUS tools in /bin" + echo "--- Systemd Tools ---" + which systemctl systemd-run || true + echo "--- Environment ---" + echo "PATH: $PATH" + echo "USER: $(whoami)" + echo "UID: $(id -u)" + echo "--- Package Info ---" + dpkg -l | grep -E "(dbus|systemd)" || true + ' - name: "Run: integration tests" run: | . ./hack/github/action-helpers.sh diff --git a/Dockerfile.d/test-integration-rootless.sh b/Dockerfile.d/test-integration-rootless.sh index f6e243f32b5..29127a6267a 100755 --- a/Dockerfile.d/test-integration-rootless.sh +++ b/Dockerfile.d/test-integration-rootless.sh @@ -30,10 +30,40 @@ if [[ "$(id -u)" = "0" ]]; then touch /workaround-issue-622 fi + echo "=== DBUS Debugging as ROOT ===" + echo "--- DBUS Tools Availability (root) ---" + which dbus-launch dbus-daemon dbus-send dbus-monitor systemd-run || true + echo "--- Environment (root) ---" + echo "PATH: $PATH" + echo "USER: $(whoami)" + echo "UID: $(id -u)" + echo "DBUS_SESSION_BUS_ADDRESS: ${DBUS_SESSION_BUS_ADDRESS:-unset}" + echo "XDG_RUNTIME_DIR: ${XDG_RUNTIME_DIR:-unset}" + echo "--- Systemd Status (root) ---" + systemctl --user status 2>&1 || echo "systemctl --user failed as root" + echo "--- DBUS Launch Test (root) ---" + dbus-launch --sh-syntax 2>&1 || echo "dbus-launch failed as root" + # Switch to the rootless user via SSH systemctl start ssh exec ssh -o StrictHostKeyChecking=no rootless@localhost "$0" "$@" else + echo "=== DBUS Debugging as ROOTLESS USER ===" + echo "--- DBUS Tools Availability (rootless) ---" + which dbus-launch dbus-daemon dbus-send dbus-monitor systemd-run || true + echo "--- Environment (rootless) ---" + echo "PATH: $PATH" + echo "USER: $(whoami)" + echo "UID: $(id -u)" + echo "DBUS_SESSION_BUS_ADDRESS: ${DBUS_SESSION_BUS_ADDRESS:-unset}" + echo "XDG_RUNTIME_DIR: ${XDG_RUNTIME_DIR:-unset}" + echo "--- Systemd Status (rootless) ---" + systemctl --user status 2>&1 || echo "systemctl --user failed as rootless" + echo "--- DBUS Launch Test (rootless) ---" + dbus-launch --sh-syntax 2>&1 || echo "dbus-launch failed as rootless" + echo "--- Systemd User Environment ---" + systemctl --user show-environment 2>&1 || echo "systemctl --user show-environment failed" + containerd-rootless-setuptool.sh install if grep -q "options use-vc" /etc/resolv.conf; then containerd-rootless-setuptool.sh nsenter -- sh -euc 'echo "options use-vc" >>/etc/resolv.conf' @@ -60,6 +90,16 @@ EOF systemctl --user restart stargz-snapshotter.service export IPFS_PATH="/home/rootless/.local/share/ipfs" containerd-rootless-setuptool.sh 
install-bypass4netnsd + + echo "=== DBUS Debugging AFTER containerd-rootless-setuptool.sh ===" + echo "--- Environment After Setup ---" + echo "DBUS_SESSION_BUS_ADDRESS: ${DBUS_SESSION_BUS_ADDRESS:-unset}" + echo "XDG_RUNTIME_DIR: ${XDG_RUNTIME_DIR:-unset}" + echo "--- Systemd Status After Setup ---" + systemctl --user status 2>&1 || echo "systemctl --user still failed after setup" + echo "--- DBUS Launch Test After Setup ---" + dbus-launch --sh-syntax 2>&1 || echo "dbus-launch still failed after setup" + # Once ssh-ed, we lost the Dockerfile working dir, so, get back in the nerdctl checkout cd /go/src/github.com/containerd/nerdctl # We also lose the PATH (and SendEnv=PATH would require sshd config changes) diff --git a/cmd/nerdctl/container/container_create.go b/cmd/nerdctl/container/container_create.go index fc5cbdd97c8..e30d529481a 100644 --- a/cmd/nerdctl/container/container_create.go +++ b/cmd/nerdctl/container/container_create.go @@ -279,10 +279,6 @@ func createOptions(cmd *cobra.Command) (types.ContainerCreateOptions, error) { if err != nil { return opt, err } - opt.HealthStartInterval, err = cmd.Flags().GetDuration("health-start-interval") - if err != nil { - return opt, err - } opt.NoHealthcheck, err = cmd.Flags().GetBool("no-healthcheck") if err != nil { return opt, err diff --git a/cmd/nerdctl/container/container_health_check_test.go b/cmd/nerdctl/container/container_health_check_linux_test.go similarity index 75% rename from cmd/nerdctl/container/container_health_check_test.go rename to cmd/nerdctl/container/container_health_check_linux_test.go index aa2d1603313..f6dc9678b66 100644 --- a/cmd/nerdctl/container/container_health_check_test.go +++ b/cmd/nerdctl/container/container_health_check_linux_test.go @@ -32,11 +32,13 @@ import ( "github.com/containerd/nerdctl/mod/tigron/tig" "github.com/containerd/nerdctl/v2/pkg/healthcheck" + "github.com/containerd/nerdctl/v2/pkg/rootlessutil" "github.com/containerd/nerdctl/v2/pkg/testutil" "github.com/containerd/nerdctl/v2/pkg/testutil/nerdtest" ) func TestContainerHealthCheckBasic(t *testing.T) { + testCase := nerdtest.Setup() // Docker CLI does not provide a standalone healthcheck command. 
@@ -602,3 +604,198 @@ func TestContainerHealthCheckAdvance(t *testing.T) {
 	testCase.Run(t)
 }
+
+func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) {
+	testCase := nerdtest.Setup()
+	testCase.Require = require.Not(nerdtest.Docker)
+
+	testCase.SubTests = []*test.Case{
+		{
+			Description: "Basic healthy container with systemd-triggered healthcheck",
+			Setup: func(data test.Data, helpers test.Helpers) {
+				helpers.Ensure("run", "-d", "--name", data.Identifier(),
+					"--health-cmd", "echo healthy",
+					"--health-interval", "2s",
+					testutil.CommonImage, "sleep", "30")
+				nerdtest.EnsureContainerStarted(helpers, data.Identifier())
+			},
+			Cleanup: func(data test.Data, helpers test.Helpers) {
+				// Ensure proper cleanup of systemd units
+				helpers.Anyhow("stop", data.Identifier())
+				helpers.Anyhow("rm", "-f", data.Identifier())
+			},
+			Expected: func(data test.Data, helpers test.Helpers) *test.Expected {
+				return &test.Expected{
+					ExitCode: 0,
+					Output: expect.All(func(stdout string, t tig.T) {
+						inspect := nerdtest.InspectContainer(helpers, data.Identifier())
+						h := inspect.State.Health
+						assert.Assert(t, h != nil, "expected health state to be present")
+						assert.Equal(t, h.Status, "healthy")
+						assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry")
+					}),
+				}
+			},
+		},
+		{
+			Description: "Kill stops healthcheck execution",
+			Setup: func(data test.Data, helpers test.Helpers) {
+				helpers.Ensure("run", "-d", "--name", data.Identifier(),
+					"--health-cmd", "echo healthy",
+					"--health-interval", "1s",
+					testutil.CommonImage, "sleep", "30")
+				nerdtest.EnsureContainerStarted(helpers, data.Identifier())
+				helpers.Ensure("kill", data.Identifier()) // Kill the container
+			},
+			Cleanup: func(data test.Data, helpers test.Helpers) {
+				// Container is already killed, just remove it
+				helpers.Anyhow("rm", "-f", data.Identifier())
+			},
+			Expected: func(data test.Data, helpers test.Helpers) *test.Expected {
+				return &test.Expected{
+					ExitCode: 0,
+					Output: expect.All(func(stdout string, t tig.T) {
+						inspect := nerdtest.InspectContainer(helpers, data.Identifier())
+						h := inspect.State.Health
+						assert.Assert(t, h != nil, "expected health state to be present")
+						assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry")
+
+						// Get the container's FinishedAt timestamp
+						containerEnd, err := time.Parse(time.RFC3339Nano, inspect.State.FinishedAt)
+						assert.NilError(t, err, "parsing container FinishedAt")
+
+						// Assert all healthcheck log start times are before the container finished
+						for _, entry := range h.Log {
+							assert.Assert(t, entry.Start.Before(containerEnd), "healthcheck ran after container was killed")
+						}
+					}),
+				}
+			},
+		},
+	}
+	testCase.Run(t)
+}
+
+func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
+	if rootlessutil.IsRootless() {
+		t.Skip("systemd healthcheck tests are skipped in rootless environment")
+	}
+	testCase := nerdtest.Setup()
+	testCase.Require = require.Not(nerdtest.Docker)
+
+	testCase.SubTests = []*test.Case{
+		{
+			// Tests that CreateTimer() successfully creates systemd timer units and
+			// RemoveTransientHealthCheckFiles() properly cleans up units when the container stops.
+ Description: "Systemd timer unit creation and cleanup", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo healthy", + "--health-interval", "1s", + testutil.CommonImage, "sleep", "30") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("inspect", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All(func(stdout string, t tig.T) { + // Get container ID and check systemd timer + containerInspect := nerdtest.InspectContainer(helpers, data.Identifier()) + containerID := containerInspect.ID + + // Check systemd timer + result := helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, _ tig.T) { + // Verify that a timer exists for this specific container + assert.Assert(t, strings.Contains(stdout, containerID), + "expected to find nerdctl healthcheck timer containing container ID: %s", containerID) + }, + }) + // Stop container and verify cleanup + helpers.Ensure("stop", data.Identifier()) + + // Check that timer is gone + result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, _ tig.T) { + assert.Assert(t, !strings.Contains(stdout, containerID), + "expected nerdctl healthcheck timer for container ID %s to be removed after container stop", containerID) + + }, + }) + }), + } + }, + }, + { + Description: "Container restart recreates systemd timer", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo restart-test", + "--health-interval", "2s", + testutil.CommonImage, "sleep", "60") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + // Get container ID for verification + containerInspect := nerdtest.InspectContainer(helpers, data.Identifier()) + containerID := containerInspect.ID + + // Step 1: Verify timer exists initially + result := helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + assert.Assert(t, strings.Contains(stdout, containerID), + "expected timer for container %s to exist initially", containerID) + }, + }) + + // Step 2: Stop container + helpers.Ensure("stop", data.Identifier()) + + // Step 3: Verify timer is removed after stop + result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + assert.Assert(t, !strings.Contains(stdout, containerID), + "expected timer for container %s to be removed after stop", containerID) + }, + }) + + // Step 4: Restart container + helpers.Ensure("start", data.Identifier()) + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + + // Step 5: Verify timer is recreated after restart - this is our final 
verification + return helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + containerInspect := nerdtest.InspectContainer(helpers, data.Identifier()) + containerID := containerInspect.ID + assert.Assert(t, strings.Contains(stdout, containerID), + "expected timer for container %s to be recreated after restart", containerID) + }, + } + }, + }, + } + testCase.Run(t) +} diff --git a/cmd/nerdctl/container/container_run.go b/cmd/nerdctl/container/container_run.go index 67b797bdce9..70222cff71a 100644 --- a/cmd/nerdctl/container/container_run.go +++ b/cmd/nerdctl/container/container_run.go @@ -37,6 +37,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/containerutil" "github.com/containerd/nerdctl/v2/pkg/defaults" "github.com/containerd/nerdctl/v2/pkg/errutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/logging" "github.com/containerd/nerdctl/v2/pkg/netutil" @@ -240,7 +241,6 @@ func setCreateFlags(cmd *cobra.Command) { cmd.Flags().Duration("health-timeout", 0, "Maximum time to allow one check to run (default: 30s)") cmd.Flags().Int("health-retries", 0, "Consecutive failures needed to report unhealthy (default: 3)") cmd.Flags().Duration("health-start-period", 0, "Start period for the container to initialize before starting health-retries countdown") - cmd.Flags().Duration("health-start-interval", 0, "Time between running the checks during the start period") cmd.Flags().Bool("no-healthcheck", false, "Disable any container-specified HEALTHCHECK") // #region env flags @@ -445,6 +445,14 @@ func runAction(cmd *cobra.Command, args []string) error { return err } + // Setup container healthchecks. 
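+	// CreateTimer registers a transient systemd timer/service pair for this
+	// container and StartTimer kicks off its first run; both are no-ops when
+	// systemd/DBUS is unavailable or no healthcheck is configured.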
+ if err := healthcheck.CreateTimer(ctx, c); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, c); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + if createOpt.Detach { fmt.Fprintln(createOpt.Stdout, id) return nil diff --git a/cmd/nerdctl/container/container_run_test.go b/cmd/nerdctl/container/container_run_test.go index 64692a3ead7..e3d50940f8b 100644 --- a/cmd/nerdctl/container/container_run_test.go +++ b/cmd/nerdctl/container/container_run_test.go @@ -841,6 +841,9 @@ func TestRunDomainname(t *testing.T) { } func TestRunHealthcheckFlags(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } testCase := nerdtest.Setup() testCases := []struct { @@ -990,6 +993,9 @@ func TestRunHealthcheckFlags(t *testing.T) { } func TestRunHealthcheckFromImage(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } nerdtest.Setup() dockerfile := fmt.Sprintf(`FROM %s diff --git a/cmd/nerdctl/helpers/flagutil.go b/cmd/nerdctl/helpers/flagutil.go index 32217ae95c6..22fc1acb1bf 100644 --- a/cmd/nerdctl/helpers/flagutil.go +++ b/cmd/nerdctl/helpers/flagutil.go @@ -52,8 +52,7 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error { options.HealthInterval != 0 || options.HealthTimeout != 0 || options.HealthRetries != 0 || - options.HealthStartPeriod != 0 || - options.HealthStartInterval != 0 + options.HealthStartPeriod != 0 if options.NoHealthcheck { if options.HealthCmd != "" || healthFlagsSet { @@ -74,9 +73,6 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error { if options.HealthStartPeriod < 0 { return fmt.Errorf("--health-start-period cannot be negative") } - if options.HealthStartInterval < 0 { - return fmt.Errorf("--health-start-interval cannot be negative") - } return nil } diff --git a/docs/healthchecks.md b/docs/healthchecks.md index b47c92748b5..933116e3941 100644 --- a/docs/healthchecks.md +++ b/docs/healthchecks.md @@ -2,32 +2,42 @@ `nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command. -Currently, health checks can be triggered manually using the nerdctl container healthcheck command. Automatic orchestration (e.g., periodic checks) will be added in a future update. +## Configuration Options Health checks can be configured in multiple ways: -1. At container creation time using nerdctl run or nerdctl create with `--health-*` flags +1. At container creation time using `nerdctl run` or `nerdctl create` with these flags: + - `--health-cmd`: Command to run to check health + - `--health-interval`: Time between running the check (default: 30s) + - `--health-timeout`: Maximum time to allow one check to run (default: 30s) + - `--health-retries`: Consecutive failures needed to report unhealthy (default: 3) + - `--health-start-period`: Start period for the container to initialize before starting health-retries countdown + - `--no-healthcheck`: Disable any container-specified HEALTHCHECK + 2. At image build time using HEALTHCHECK in a Dockerfile -3. In docker-compose.yaml files, if using nerdctl compose -When a container is created, nerdctl determines the health check configuration based on the following priority: +**Note:** The `--health-start-interval` option is currently not supported by nerdctl. -1. 
**CLI flags** take highest precedence (e.g., `--health-cmd`, etc.)
-2. If no CLI flags are set, nerdctl will use any health check defined in the image.
-3. If neither is present, no health check will be configured
+## Configuration Priority
 
-Example:
+When a container is created, nerdctl determines the health check configuration based on this priority:
 
-```bash
-nerdctl run --name web --health-cmd="curl -f http://localhost || exit 1" --health-interval=30s --health-timeout=5s --health-retries=3 nginx
-```
+1. CLI flags take highest precedence (e.g., `--health-cmd`, etc.)
+2. If no CLI flags are set, nerdctl will use any health check defined in the image
+3. If neither is present, no health check will be configured
 
 ### Disabling Health Checks
 
 You can disable health checks using the following flag during container create/run:
 
 ```bash
---no-healthcheck
+nerdctl run --no-healthcheck myapp
 ```
 
 ### Running Health Checks Manually
@@ -37,15 +47,54 @@
 configured health check inside the container and reports the result. It serves as a
 manual way to run health checks, especially in scenarios where external scheduling is used.
 
 Example:
-
 ```
 nerdctl container healthcheck
 ```
 
-### Future Work (WIP)
+## Automatic Health Checks with systemd
+
+On Linux systems with systemd, nerdctl automatically creates and manages systemd timer units to execute health checks at the configured intervals. This provides reliable scheduling and execution of health checks without requiring a persistent daemon.
+
+### Requirements for Automatic Health Checks
+
+- systemd must be available on the system
+- Container must not be running in rootless mode
+- Environment variable `DISABLE_HC_SYSTEMD` must not be set to "true"
+
+### How It Works
 
-Since nerdctl is daemonless and does not have a persistent background process, we rely on systemd (or external schedulers)
-to invoke nerdctl container healthcheck at configured intervals. This allows periodic health checks for containers in a
-systemd-based environment. We are actively working on automating health checks, where we will listen to container lifecycle
-events and generate appropriate systemd service and timer units. This will enable nerdctl to support automated,
-Docker-compatible health checks by leveraging systemd for scheduling and lifecycle integration.
\ No newline at end of file
+1. When a container with health checks is created, nerdctl:
+   - Creates a systemd timer unit for the container
+   - Configures the timer according to the health check interval
+   - Starts monitoring the container's health status
+
+2. The health check status can be one of:
+   - `starting`: During container initialization
+   - `healthy`: When health checks are passing
+   - `unhealthy`: After the specified number of consecutive failures
+
+A sketch of the underlying `systemd-run` invocation is shown after the examples below.
+
+## Examples
+
+1. Basic health check that verifies a web server:
+```bash
+nerdctl run -d --name web \
+  --health-cmd="curl -f http://localhost/ || exit 1" \
+  --health-interval=5s \
+  --health-retries=3 \
+  nginx
+```
+
+2. Health check with initialization period:
+```bash
+nerdctl run -d --name app \
+  --health-cmd="./health-check.sh" \
+  --health-interval=30s \
+  --health-timeout=10s \
+  --health-retries=3 \
+  --health-start-period=60s \
+  myapp
+```
+
+3. Disable health checks:
+```bash
+nerdctl run --no-healthcheck myapp
+```
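+
+For reference, the transient unit behind these examples is created with `systemd-run`. A rough sketch of the invocation for a container with `--health-interval=30s` (the unit name and container ID are illustrative; the real command is assembled in `pkg/healthcheck/healthcheck_manager_linux.go`):
+
+```bash
+# Transient timer/service pair: re-runs the healthcheck every time the
+# previous run has been inactive for the configured interval.
+systemd-run --unit <container-id> \
+  --on-unit-inactive=30s \
+  --timer-property=AccuracySec=1s \
+  nerdctl container healthcheck <container-id>
+```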
diff --git a/hack/test-integration.sh b/hack/test-integration.sh
index cdbeb61957f..26727987b3c 100755
--- a/hack/test-integration.sh
+++ b/hack/test-integration.sh
@@ -50,7 +50,7 @@ for arg in "$@"; do
 done
 
 if [ "$needsudo" == "true" ] || [ "$needsudo" == "yes" ] || [ "$needsudo" == "1" ]; then
-	gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -args -test.allow-kill-daemon "$@"
+	gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -v -run TestHealthCheck_SystemdIntegration_Basic -args -test.allow-kill-daemon ./cmd/nerdctl/container/
 else
-	gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -args -test.allow-kill-daemon "$@"
+	gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -v -run TestHealthCheck_SystemdIntegration_Basic -args -test.allow-kill-daemon ./cmd/nerdctl/container/
 fi
diff --git a/pkg/api/types/container_types.go b/pkg/api/types/container_types.go
index 3e157bb303d..a19fb5aea1f 100644
--- a/pkg/api/types/container_types.go
+++ b/pkg/api/types/container_types.go
@@ -292,13 +292,12 @@ type ContainerCreateOptions struct {
 	ImagePullOpt ImagePullOptions
 
 	// Healthcheck related fields
-	HealthCmd            string
-	HealthInterval       time.Duration
-	HealthTimeout        time.Duration
-	HealthRetries        int
-	HealthStartPeriod    time.Duration
-	HealthStartInterval  time.Duration
-	NoHealthcheck        bool
+	HealthCmd         string
+	HealthInterval    time.Duration
+	HealthTimeout     time.Duration
+	HealthRetries     int
+	HealthStartPeriod time.Duration
+	NoHealthcheck     bool
 
 	// UserNS name for user namespace mapping of container
 	UserNS string
diff --git a/pkg/cmd/container/create.go b/pkg/cmd/container/create.go
index a0c8fc2bf9b..232d8a27b77 100644
--- a/pkg/cmd/container/create.go
+++ b/pkg/cmd/container/create.go
@@ -891,9 +891,6 @@ func withHealthcheck(options types.ContainerCreateOptions, ensuredImage *imgutil
 	if options.HealthStartPeriod != 0 {
 		hc.StartPeriod = options.HealthStartPeriod
 	}
-	if options.HealthStartInterval != 0 {
-		hc.StartInterval = options.HealthStartInterval
-	}
 
 	// If no healthcheck config is set (via CLI or image), return empty string so we skip adding to container config.
if reflect.DeepEqual(hc, &healthcheck.Healthcheck{}) { diff --git a/pkg/cmd/container/health_check.go b/pkg/cmd/container/health_check.go index 6fe31c8ebc3..e2646497c3f 100644 --- a/pkg/cmd/container/health_check.go +++ b/pkg/cmd/container/health_check.go @@ -59,7 +59,6 @@ func HealthCheck(ctx context.Context, client *containerd.Client, container conta hcConfig.Interval = timeoutWithDefault(hcConfig.Interval, healthcheck.DefaultProbeInterval) hcConfig.Timeout = timeoutWithDefault(hcConfig.Timeout, healthcheck.DefaultProbeTimeout) hcConfig.StartPeriod = timeoutWithDefault(hcConfig.StartPeriod, healthcheck.DefaultStartPeriod) - hcConfig.StartInterval = timeoutWithDefault(hcConfig.StartInterval, healthcheck.DefaultStartInterval) if hcConfig.Retries == 0 { hcConfig.Retries = healthcheck.DefaultProbeRetries } diff --git a/pkg/cmd/container/kill.go b/pkg/cmd/container/kill.go index 080336d9f87..d42a7cd8c82 100644 --- a/pkg/cmd/container/kill.go +++ b/pkg/cmd/container/kill.go @@ -35,6 +35,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/api/types" "github.com/containerd/nerdctl/v2/pkg/clientutil" "github.com/containerd/nerdctl/v2/pkg/containerutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/netutil" @@ -111,6 +112,11 @@ func killContainer(ctx context.Context, container containerd.Container, signal s return err } + // Clean up healthcheck systemd units + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + log.G(ctx).Warnf("failed to clean up healthcheck units for container %s: %s", container.ID(), err) + } + // signal will be sent once resume is finished if paused { if err := task.Resume(ctx); err != nil { diff --git a/pkg/cmd/container/remove.go b/pkg/cmd/container/remove.go index 28048a2f6a1..b9df2b2acaf 100644 --- a/pkg/cmd/container/remove.go +++ b/pkg/cmd/container/remove.go @@ -34,6 +34,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/clientutil" "github.com/containerd/nerdctl/v2/pkg/containerutil" "github.com/containerd/nerdctl/v2/pkg/dnsutil/hostsstore" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" "github.com/containerd/nerdctl/v2/pkg/ipcutil" "github.com/containerd/nerdctl/v2/pkg/labels" @@ -179,6 +180,11 @@ func RemoveContainer(ctx context.Context, c containerd.Container, globalOptions // Otherwise, nil the error so that we do not write the error label on the container retErr = nil + // Clean up healthcheck systemd units + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, c); err != nil { + log.G(ctx).WithError(err).Warnf("failed to clean up healthcheck units for container %q", id) + } + // Now, delete the actual container var delOpts []containerd.DeleteOpts if _, err := c.Image(ctx); err == nil { diff --git a/pkg/cmd/container/stop.go b/pkg/cmd/container/stop.go index e1f347b6b96..755686e4bd8 100644 --- a/pkg/cmd/container/stop.go +++ b/pkg/cmd/container/stop.go @@ -25,6 +25,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/api/types" "github.com/containerd/nerdctl/v2/pkg/containerutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" ) @@ -39,6 +40,9 @@ func Stop(ctx context.Context, client *containerd.Client, reqs []string, opt typ if err := cleanupNetwork(ctx, found.Container, opt.GOptions); err != nil { return fmt.Errorf("unable to cleanup 
network for container: %s", found.Req) } + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, found.Container); err != nil { + return fmt.Errorf("unable to cleanup healthcheck timer for container: %s: %w", found.Req, err) + } if err := containerutil.Stop(ctx, found.Container, opt.Timeout, opt.Signal); err != nil { if errdefs.IsNotFound(err) { fmt.Fprintf(opt.Stderr, "No such container: %s\n", found.Req) diff --git a/pkg/containerutil/containerutil.go b/pkg/containerutil/containerutil.go index 31ded1a3b93..5022962d8c1 100644 --- a/pkg/containerutil/containerutil.go +++ b/pkg/containerutil/containerutil.go @@ -48,6 +48,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/consoleutil" "github.com/containerd/nerdctl/v2/pkg/errutil" "github.com/containerd/nerdctl/v2/pkg/formatter" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/ipcutil" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/labels/k8slabels" @@ -286,6 +287,15 @@ func Start(ctx context.Context, container containerd.Container, isAttach bool, i if err := task.Start(ctx); err != nil { return err } + + // If container has health checks configured, create and start systemd timer/service files. + if err := healthcheck.CreateTimer(ctx, container); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, container); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + if !isAttach { return nil } @@ -517,6 +527,14 @@ func Unpause(ctx context.Context, client *containerd.Client, id string) error { return err } + // Recreate healthcheck related systemd timer/service files. + if err := healthcheck.CreateTimer(ctx, container); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, container); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + switch status.Status { case containerd.Paused: return task.Resume(ctx) diff --git a/pkg/healthcheck/executor.go b/pkg/healthcheck/executor.go index f5c4216b302..d362d988cf8 100644 --- a/pkg/healthcheck/executor.go +++ b/pkg/healthcheck/executor.go @@ -128,26 +128,42 @@ func updateHealthStatus(ctx context.Context, container containerd.Container, hcC currentHealth = &HealthState{ Status: Starting, FailingStreak: 0, + StartPeriod: hcConfig.StartPeriod > 0, } } - // Check if still within start period - startPeriod := hcConfig.StartPeriod + // Get container info for start period check info, err := container.Info(ctx) if err != nil { return fmt.Errorf("failed to get container info: %w", err) } containerCreated := info.CreatedAt - stillInStartPeriod := hcResult.Start.Sub(containerCreated) < startPeriod - - // Update health status based on exit code - if hcResult.ExitCode == 0 { - currentHealth.Status = Healthy - currentHealth.FailingStreak = 0 - } else if !stillInStartPeriod { - currentHealth.FailingStreak++ - if currentHealth.FailingStreak >= hcConfig.Retries { - currentHealth.Status = Unhealthy + + // Check if we're in start period workflow + inStartPeriodTime := hcResult.Start.Sub(containerCreated) < hcConfig.StartPeriod + inStartPeriodState := currentHealth.StartPeriod + + if inStartPeriodTime && inStartPeriodState { + // Start Period Workflow + if hcResult.ExitCode == 0 { + // First healthy result transitions us out of start period + currentHealth.Status = Healthy + currentHealth.FailingStreak = 0 + currentHealth.StartPeriod = 
false + } + // Ignore unhealthy results during start period + } else { + // Health Interval Workflow + if hcResult.ExitCode == 0 { + if currentHealth.Status != Healthy { + currentHealth.Status = Healthy + currentHealth.FailingStreak = 0 + } + } else { + currentHealth.FailingStreak++ + if currentHealth.FailingStreak >= hcConfig.Retries && currentHealth.Status != Unhealthy { + currentHealth.Status = Unhealthy + } } } diff --git a/pkg/healthcheck/health.go b/pkg/healthcheck/health.go index 8e0301b492a..b38072e8562 100644 --- a/pkg/healthcheck/health.go +++ b/pkg/healthcheck/health.go @@ -43,7 +43,6 @@ const ( DefaultProbeInterval = 30 * time.Second // Default interval between probe runs. Also applies before the first probe. DefaultProbeTimeout = 30 * time.Second // Max duration a single probe run may take before it's considered failed. DefaultStartPeriod = 0 * time.Second // Grace period for container startup before health checks count as failures. - DefaultStartInterval = 5 * time.Second // Interval between checks during the start period. DefaultProbeRetries = 3 // Number of consecutive failures before marking container as unhealthy. MaxLogEntries = 5 // Maximum number of health check log entries to keep. MaxOutputLenForInspect = 4096 // Max output length (in bytes) stored in health check logs during inspect. Longer outputs are truncated. @@ -70,18 +69,18 @@ type HealthcheckResult struct { // Healthcheck represents the health check configuration type Healthcheck struct { - Test []string `json:"Test,omitempty"` // Test is the check to perform that the container is healthy - Interval time.Duration `json:"Interval,omitempty"` // Interval is the time to wait between checks - Timeout time.Duration `json:"Timeout,omitempty"` // Timeout is the time to wait before considering the check to have hung - Retries int `json:"Retries,omitempty"` // Retries is the number of consecutive failures needed to consider a container as unhealthy - StartPeriod time.Duration `json:"StartPeriod,omitempty"` // StartPeriod is the period for the container to initialize before the health check starts - StartInterval time.Duration `json:"StartInterval,omitempty"` // StartInterval is the time between health checks during the start period + Test []string `json:"Test,omitempty"` // Test is the check to perform that the container is healthy + Interval time.Duration `json:"Interval,omitempty"` // Interval is the time to wait between checks + Timeout time.Duration `json:"Timeout,omitempty"` // Timeout is the time to wait before considering the check to have hung + Retries int `json:"Retries,omitempty"` // Retries is the number of consecutive failures needed to consider a container as unhealthy + StartPeriod time.Duration `json:"StartPeriod,omitempty"` // StartPeriod is the period for the container to initialize before the health check starts } // HealthState stores the current health state of a container type HealthState struct { Status HealthStatus // Status is one of [Starting], [Healthy] or [Unhealthy] FailingStreak int // FailingStreak is the number of consecutive failures + StartPeriod bool // StartPeriod indicates if we're in the start period workflow } // ToJSONString serializes HealthState to a JSON string for label storage diff --git a/pkg/healthcheck/healthcheck_manager_darwin.go b/pkg/healthcheck/healthcheck_manager_darwin.go new file mode 100644 index 00000000000..46858c4966e --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_darwin.go @@ -0,0 +1,53 @@ +/* + Copyright The containerd Authors. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer starts the healthcheck timer unit. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead. +/* +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} +*/ + +// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service +// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging +// on systemd operations. On Darwin, this is a no-op since systemd is not available. +func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_freebsd.go b/pkg/healthcheck/healthcheck_manager_freebsd.go new file mode 100644 index 00000000000..63085d4feae --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_freebsd.go @@ -0,0 +1,43 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer starts the healthcheck timer unit. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. 
+func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go new file mode 100644 index 00000000000..3593ac0d09b --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -0,0 +1,347 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + "fmt" + "math/rand" + "os" + "os/exec" + "strings" + "time" + + "github.com/coreos/go-systemd/v22/dbus" + + containerd "github.com/containerd/containerd/v2/client" + "github.com/containerd/log" + + "github.com/containerd/nerdctl/v2/pkg/defaults" + "github.com/containerd/nerdctl/v2/pkg/labels" + "github.com/containerd/nerdctl/v2/pkg/rootlessutil" +) + +// IsDbusFunctional tests if DBUS connections actually work in the current environment. +// This is more reliable than just checking if DBUS tools are installed. +func IsDbusFunctional(ctx context.Context) bool { + // Create timeout context to prevent hanging on DBUS connection attempts + timeoutCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + + var conn *dbus.Conn + var err error + + if rootlessutil.IsRootless() { + // Test user DBUS connection for rootless environments + conn, err = dbus.NewUserConnectionContext(timeoutCtx) + } else { + // Test system DBUS connection for rootful environments + conn, err = dbus.NewSystemConnectionContext(timeoutCtx) + } + + if err != nil { + return false + } + defer conn.Close() + return true +} + +// CreateTimer sets up the transient systemd timer and service for healthchecks. 
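+// The timer is a transient unit named after the container ID, created via
+// systemd-run with --on-unit-inactive set to the configured health interval,
+// and it invokes `nerdctl container healthcheck <id>` on each activation.
+// Creation is skipped (returning nil) when systemd or DBUS is unavailable or
+// the healthcheck is disabled.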
+func CreateTimer(ctx context.Context, container containerd.Container) error {
+	hc := extractHealthcheck(ctx, container)
+	if hc == nil {
+		log.G(ctx).Debugf("No healthcheck configuration found for container %s", container.ID())
+		return nil
+	}
+
+	if shouldSkipHealthCheckSystemd(ctx, hc) {
+		// Log the specific reason for skipping to help with troubleshooting
+		if !defaults.IsSystemdAvailable() {
+			log.G(ctx).Infof("Skipping healthcheck timer for container %s: systemd not available", container.ID())
+		} else if !IsDbusFunctional(ctx) {
+			log.G(ctx).Infof("Skipping healthcheck timer for container %s: DBUS connection unavailable (likely containerized environment)", container.ID())
+		} else if os.Getenv("DISABLE_HC_SYSTEMD") == "true" {
+			log.G(ctx).Infof("Skipping healthcheck timer for container %s: disabled by DISABLE_HC_SYSTEMD", container.ID())
+		} else if len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0 {
+			log.G(ctx).Debugf("Skipping healthcheck timer for container %s: invalid healthcheck configuration", container.ID())
+		}
+		return nil
+	}
+
+	containerID := container.ID()
+	hcName := hcUnitName(containerID, true)
+	log.G(ctx).Debugf("Creating healthcheck timer unit: %s", hcName)
+
+	cmd := []string{}
+	if rootlessutil.IsRootless() {
+		cmd = append(cmd, "--user")
+		cmd = append(cmd, fmt.Sprintf("--uid=%d", rootlessutil.ParentEUID()))
+	}
+
+	if path := os.Getenv("PATH"); path != "" {
+		cmd = append(cmd, "--setenv=PATH="+path)
+	}
+
+	// Always use health-interval for timer frequency
+	cmd = append(cmd, "--unit", hcName, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s")
+
+	cmd = append(cmd, "nerdctl", "container", "healthcheck", containerID)
+	if log.G(ctx).Logger.IsLevelEnabled(log.DebugLevel) {
+		cmd = append(cmd, "--debug")
+	}
+
+	log.G(ctx).Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " "))
+	run := exec.Command("systemd-run", cmd...)
+	if out, err := run.CombinedOutput(); err != nil {
+		return fmt.Errorf("systemd-run failed: %w\noutput: %s", err, strings.TrimSpace(string(out)))
+	}
+
+	return nil
+}
+
+// StartTimer starts the healthcheck timer unit.
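+// It connects to systemd over DBUS (the user bus when rootless, the system
+// bus otherwise) and issues a RestartUnit for <unit>.service, waiting for the
+// queued job to report "done".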
+func StartTimer(ctx context.Context, container containerd.Container) error { + log.G(ctx).Infof("DEBUG: StartTimer called for container %s", container.ID()) + + hc := extractHealthcheck(ctx, container) + if hc == nil { + log.G(ctx).Infof("DEBUG: No healthcheck found, skipping StartTimer") + return nil + } + if shouldSkipHealthCheckSystemd(ctx, hc) { + log.G(ctx).Infof("DEBUG: Skipping healthcheck systemd, shouldSkip=true") + return nil + } + + hcName := hcUnitName(container.ID(), true) + log.G(ctx).Infof("DEBUG: Starting timer for unit: %s, rootless=%v", hcName, rootlessutil.IsRootless()) + + var conn *dbus.Conn + var err error + if rootlessutil.IsRootless() { + log.G(ctx).Infof("DEBUG: Attempting user DBUS connection...") + conn, err = dbus.NewUserConnectionContext(ctx) + } else { + log.G(ctx).Infof("DEBUG: Attempting system DBUS connection...") + conn, err = dbus.NewSystemConnectionContext(ctx) + } + if err != nil { + log.G(ctx).Errorf("DEBUG: DBUS connection failed: %v", err) + return fmt.Errorf("systemd DBUS connect error: %w", err) + } + defer conn.Close() + log.G(ctx).Infof("DEBUG: DBUS connection successful") + + startChan := make(chan string) + unit := hcName + ".service" + log.G(ctx).Infof("DEBUG: About to restart unit: %s", unit) + + if _, err := conn.RestartUnitContext(context.Background(), unit, "fail", startChan); err != nil { + log.G(ctx).Errorf("DEBUG: RestartUnitContext failed: %v", err) + return err + } + + log.G(ctx).Infof("DEBUG: Waiting for restart confirmation...") + if msg := <-startChan; msg != "done" { + log.G(ctx).Errorf("DEBUG: Unexpected restart result: %s", msg) + return fmt.Errorf("unexpected systemd restart result: %s", msg) + } + + log.G(ctx).Infof("DEBUG: StartTimer completed successfully") + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + hc := extractHealthcheck(ctx, container) + if hc == nil { + return nil + } + + return ForceRemoveTransientHealthCheckFiles(ctx, container.ID()) +} + +// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service +// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging +// on systemd operations. It logs errors as warnings but continues cleanup attempts. 
+func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error { + log.G(ctx).Debugf("Force removing healthcheck timer unit: %s", containerID) + + // Create a timeout context for systemd operations (5 seconds default) + timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + unitName := hcUnitName(containerID, true) + timer := unitName + ".timer" + service := unitName + ".service" + + // Channel to collect any critical errors (though we'll continue cleanup regardless) + errChan := make(chan error, 3) + + // Goroutine for DBUS connection and cleanup operations + go func() { + defer close(errChan) + + var conn *dbus.Conn + var err error + if rootlessutil.IsRootless() { + conn, err = dbus.NewUserConnectionContext(ctx) + } else { + conn, err = dbus.NewSystemConnectionContext(ctx) + } + if err != nil { + log.G(ctx).Warnf("systemd DBUS connect error during force cleanup: %v", err) + errChan <- fmt.Errorf("systemd DBUS connect error: %w", err) + return + } + defer conn.Close() + + // Stop timer with timeout + go func() { + select { + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout stopping timer %s during force cleanup", timer) + return + default: + tChan := make(chan string, 1) + if _, err := conn.StopUnitContext(timeoutCtx, timer, "ignore-dependencies", tChan); err == nil { + select { + case msg := <-tChan: + if msg != "done" { + log.G(ctx).Warnf("timer stop message during force cleanup: %s", msg) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout waiting for timer stop confirmation: %s", timer) + } + } else { + log.G(ctx).Warnf("failed to stop timer %s during force cleanup: %v", timer, err) + } + } + }() + + // Stop service with timeout + go func() { + select { + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout stopping service %s during force cleanup", service) + return + default: + sChan := make(chan string, 1) + if _, err := conn.StopUnitContext(timeoutCtx, service, "ignore-dependencies", sChan); err == nil { + select { + case msg := <-sChan: + if msg != "done" { + log.G(ctx).Warnf("service stop message during force cleanup: %s", msg) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout waiting for service stop confirmation: %s", service) + } + } else { + log.G(ctx).Warnf("failed to stop service %s during force cleanup: %v", service, err) + } + } + }() + + // Reset failed units (best effort, non-blocking) + go func() { + select { + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout resetting failed unit %s during force cleanup", service) + return + default: + if err := conn.ResetFailedUnitContext(timeoutCtx, service); err != nil { + log.G(ctx).Warnf("failed to reset failed unit %s during force cleanup: %v", service, err) + } + } + }() + + // Wait a short time for operations to complete, but don't block indefinitely + select { + case <-time.After(3 * time.Second): + log.G(ctx).Debugf("force cleanup operations completed for container %s", containerID) + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("force cleanup timed out for container %s", containerID) + } + }() + + // Wait for the cleanup goroutine to finish or timeout + select { + case err := <-errChan: + if err != nil { + log.G(ctx).Warnf("force cleanup encountered errors but continuing: %v", err) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("force cleanup timed out for container %s, but cleanup may continue in background", containerID) + } + + // Always return nil - this function should never block the caller + // even if systemd operations fail or 
timeout + log.G(ctx).Debugf("force cleanup completed (non-blocking) for container %s", containerID) + return nil +} + +// hcUnitName returns a systemd unit name for a container healthcheck. +func hcUnitName(containerID string, bare bool) string { + unit := containerID + if !bare { + unit += fmt.Sprintf("-%x", rand.Int()) + } + return unit +} + +func extractHealthcheck(ctx context.Context, container containerd.Container) *Healthcheck { + l, err := container.Labels(ctx) + if err != nil { + log.G(ctx).WithError(err).Debugf("could not get labels for container %s", container.ID()) + return nil + } + hcStr, ok := l[labels.HealthCheck] + if !ok || hcStr == "" { + return nil + } + hc, err := HealthCheckFromJSON(hcStr) + if err != nil { + log.G(ctx).WithError(err).Debugf("invalid healthcheck config on container %s", container.ID()) + return nil + } + return hc +} + +// shouldSkipHealthCheckSystemd determines if healthcheck timers should be skipped. +func shouldSkipHealthCheckSystemd(ctx context.Context, hc *Healthcheck) bool { + // Don't proceed if systemd is unavailable or disabled + if !defaults.IsSystemdAvailable() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { + return true + } + + // Test actual DBUS connectivity - this is more reliable than checking for tools + if !IsDbusFunctional(ctx) { + return true + } + + // Don't proceed if health check is nil, empty, explicitly NONE or interval is 0. + if hc == nil || len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0 { + return true + } + return false +} diff --git a/pkg/healthcheck/healthcheck_manager_windows.go b/pkg/healthcheck/healthcheck_manager_windows.go new file mode 100644 index 00000000000..d18c3bce72b --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_windows.go @@ -0,0 +1,53 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer starts the healthcheck timer unit. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead. +/* +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} +*/ + +// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service +// using just the container ID. 
This function is non-blocking and uses timeouts to prevent hanging +// on systemd operations. On Windows, this is a no-op since systemd is not available. +func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/ocihook/ocihook.go b/pkg/ocihook/ocihook.go index 89b6c6b1410..86d4a826f7b 100644 --- a/pkg/ocihook/ocihook.go +++ b/pkg/ocihook/ocihook.go @@ -560,8 +560,11 @@ func onCreateRuntime(opts *handlerOpts) error { } func onPostStop(opts *handlerOpts) error { + log.L.Debugf("onPostStop hook triggered for container %s (namespace: %s)", opts.state.ID, opts.state.Annotations[labels.Namespace]) + lf, err := state.New(opts.state.Annotations[labels.StateDir]) if err != nil { + log.L.WithError(err).Errorf("failed to create state store for container %s", opts.state.ID) return err } @@ -585,6 +588,7 @@ func onPostStop(opts *handlerOpts) error { ctx := context.Background() ns := opts.state.Annotations[labels.Namespace] + if opts.cni != nil { var err error b4nnEnabled, b4nnBindEnabled, err := bypass4netnsutil.IsBypass4netnsEnabled(opts.state.Annotations) diff --git a/test-dbus-debug.md b/test-dbus-debug.md new file mode 100644 index 00000000000..d179cd827bb --- /dev/null +++ b/test-dbus-debug.md @@ -0,0 +1,86 @@ +# DBUS Debugging Test Plan + +## What We Added + +### 1. GitHub Workflow Debugging (.github/workflows/job-test-in-container.yml) +- Added a new step "Debug: DBUS availability in container" before running integration tests +- Checks DBUS tool availability, binary locations, systemd tools, environment variables, and package info +- Runs in the same container that will execute the tests + +### 2. Rootless Test Script Debugging (Dockerfile.d/test-integration-rootless.sh) +- Added debugging as ROOT user before switching to rootless +- Added debugging as ROOTLESS user after SSH switch +- Added debugging AFTER containerd-rootless-setuptool.sh setup +- Checks DBUS tools, environment variables, systemd status, and DBUS connectivity + +## Expected Output + +When the CI runs, we should see: + +### Container Level (from GitHub workflow): +``` +=== DBUS Debugging in Container === +--- DBUS Tools Availability --- +/usr/bin/dbus-launch +/usr/bin/dbus-daemon +/usr/bin/dbus-send +/usr/bin/dbus-monitor +/usr/bin/systemd-run +--- DBUS Binaries Location --- +-rwxr-xr-x 1 root root ... /usr/bin/dbus-launch +-rwxr-xr-x 1 root root ... /usr/bin/dbus-daemon +... +--- Package Info --- +dbus 1.14.10-4ubuntu4.1 +systemd 255.4-1ubuntu8.4 +... +``` + +### Root User Level: +``` +=== DBUS Debugging as ROOT === +--- DBUS Tools Availability (root) --- +/usr/bin/dbus-launch +/usr/bin/systemd-run +... +--- Environment (root) --- +PATH: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +USER: root +UID: 0 +DBUS_SESSION_BUS_ADDRESS: unset +XDG_RUNTIME_DIR: unset +``` + +### Rootless User Level: +``` +=== DBUS Debugging as ROOTLESS USER === +--- DBUS Tools Availability (rootless) --- +/usr/bin/dbus-launch +/usr/bin/systemd-run +... +--- Environment (rootless) --- +PATH: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +USER: rootless +UID: 1000 +DBUS_SESSION_BUS_ADDRESS: unset +XDG_RUNTIME_DIR: /run/user/1000 +``` + +## What This Will Tell Us + +1. **Are DBUS tools installed?** - We'll see if dbus-launch, dbus-daemon, etc. are available +2. **Are they in PATH?** - We'll see the exact paths and verify accessibility +3. **Environment differences** - Compare root vs rootless environment setup +4. 
**Systemd user session status** - See if systemd --user works properly +5. **DBUS connectivity** - Test if dbus-launch can establish connections +6. **Setup impact** - See how containerd-rootless-setuptool.sh affects DBUS environment + +## Next Steps After Getting Debug Output + +Based on the debug output, we can: +1. **If DBUS tools are missing**: Add them to the Dockerfile +2. **If DBUS tools exist but fail**: Implement graceful fallback in healthcheck code +3. **If environment is wrong**: Fix environment setup in test script +4. **If systemd user session fails**: Implement proper session initialization + +This debugging will give us the exact information needed to solve the healthcheck timer failures in CI.
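+
+## Reference: Programmatic DBUS Probe
+
+For context, the healthcheck manager gates timer creation on an actual DBUS connection attempt rather than on tool presence alone. A condensed sketch of that probe (mirroring `IsDbusFunctional` in `pkg/healthcheck/healthcheck_manager_linux.go`; the standalone `main` wrapper is only for illustration):
+
+```go
+package main
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/coreos/go-systemd/v22/dbus"
+)
+
+// isDbusFunctional reports whether a systemd DBUS connection can actually be
+// established, bounded by a short timeout so the probe never hangs in CI.
+func isDbusFunctional(ctx context.Context) bool {
+	ctx, cancel := context.WithTimeout(ctx, 2*time.Second)
+	defer cancel()
+
+	// Rootful path; rootless environments use dbus.NewUserConnectionContext.
+	conn, err := dbus.NewSystemConnectionContext(ctx)
+	if err != nil {
+		return false
+	}
+	defer conn.Close()
+	return true
+}
+
+func main() {
+	fmt.Println("DBUS functional:", isDbusFunctional(context.Background()))
+}
+```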