diff --git a/cmd/nerdctl/container/container_create.go b/cmd/nerdctl/container/container_create.go index fc5cbdd97c8..e30d529481a 100644 --- a/cmd/nerdctl/container/container_create.go +++ b/cmd/nerdctl/container/container_create.go @@ -279,10 +279,6 @@ func createOptions(cmd *cobra.Command) (types.ContainerCreateOptions, error) { if err != nil { return opt, err } - opt.HealthStartInterval, err = cmd.Flags().GetDuration("health-start-interval") - if err != nil { - return opt, err - } opt.NoHealthcheck, err = cmd.Flags().GetBool("no-healthcheck") if err != nil { return opt, err diff --git a/cmd/nerdctl/container/container_health_check_test.go b/cmd/nerdctl/container/container_health_check_linux_test.go similarity index 80% rename from cmd/nerdctl/container/container_health_check_test.go rename to cmd/nerdctl/container/container_health_check_linux_test.go index aa2d1603313..6c8044823b4 100644 --- a/cmd/nerdctl/container/container_health_check_test.go +++ b/cmd/nerdctl/container/container_health_check_linux_test.go @@ -19,7 +19,6 @@ package container import ( "encoding/json" "errors" - "fmt" "strings" "testing" "time" @@ -32,11 +31,16 @@ import ( "github.com/containerd/nerdctl/mod/tigron/tig" "github.com/containerd/nerdctl/v2/pkg/healthcheck" + "github.com/containerd/nerdctl/v2/pkg/rootlessutil" "github.com/containerd/nerdctl/v2/pkg/testutil" "github.com/containerd/nerdctl/v2/pkg/testutil/nerdtest" ) func TestContainerHealthCheckBasic(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } + testCase := nerdtest.Setup() // Docker CLI does not provide a standalone healthcheck command. @@ -134,6 +138,10 @@ func TestContainerHealthCheckBasic(t *testing.T) { } func TestContainerHealthCheckAdvance(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } + testCase := nerdtest.Setup() // Docker CLI does not provide a standalone healthcheck command. 
@@ -391,43 +399,6 @@ func TestContainerHealthCheckAdvance(t *testing.T) { } }, }, - { - Description: "Healthcheck emits large output repeatedly", - Setup: func(data test.Data, helpers test.Helpers) { - helpers.Ensure("run", "-d", "--name", data.Identifier(), - "--health-cmd", "yes X | head -c 60000", - "--health-interval", "1s", "--health-timeout", "2s", - testutil.CommonImage, "sleep", nerdtest.Infinity) - nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - }, - Cleanup: func(data test.Data, helpers test.Helpers) { - helpers.Anyhow("rm", "-f", data.Identifier()) - }, - Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { - for i := 0; i < 3; i++ { - helpers.Ensure("container", "healthcheck", data.Identifier()) - time.Sleep(2 * time.Second) - } - return helpers.Command("inspect", data.Identifier()) - }, - Expected: func(data test.Data, helpers test.Helpers) *test.Expected { - return &test.Expected{ - ExitCode: 0, - Output: expect.All(func(_ string, t tig.T) { - inspect := nerdtest.InspectContainer(helpers, data.Identifier()) - h := inspect.State.Health - debug, _ := json.MarshalIndent(h, "", " ") - t.Log(string(debug)) - assert.Assert(t, h != nil, "expected health state") - assert.Equal(t, h.Status, healthcheck.Healthy) - assert.Assert(t, len(h.Log) >= 3, "expected at least 3 health log entries") - for _, log := range h.Log { - assert.Assert(t, len(log.Output) >= 1024, fmt.Sprintf("each output should be >= 1024 bytes, was: %s", log.Output)) - } - }), - } - }, - }, { Description: "Health log in inspect keeps only the latest 5 entries", Setup: func(data test.Data, helpers test.Helpers) { @@ -602,3 +573,122 @@ func TestContainerHealthCheckAdvance(t *testing.T) { testCase.Run(t) } + +// func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) { +// testCase := nerdtest.Setup() +// testCase.Require = require.Not(nerdtest.Docker) + +// testCase.SubTests = []*test.Case{ +// //{ +// // Description: "Basic healthy container with 
systemd-triggered healthcheck", +// // Setup: func(data test.Data, helpers test.Helpers) { +// // helpers.Ensure("run", "-d", "--name", data.Identifier(), +// // "--health-cmd", "echo healthy", +// // "--health-interval", "2s", +// // testutil.CommonImage, "sleep", "30") +// // // Wait for a couple of healthchecks to execute +// // time.Sleep(5 * time.Second) +// // }, +// // Cleanup: func(data test.Data, helpers test.Helpers) { +// // helpers.Anyhow("rm", "-f", data.Identifier()) +// // }, +// // Expected: func(data test.Data, helpers test.Helpers) *test.Expected { +// // return &test.Expected{ +// // ExitCode: 0, +// // Output: expect.All(func(stdout, _ string, t *testing.T) { +// // inspect := nerdtest.InspectContainer(helpers, data.Identifier()) +// // h := inspect.State.Health +// // assert.Assert(t, h != nil, "expected health state to be present") +// // assert.Equal(t, h.Status, "healthy") +// // assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry") +// // }), +// // } +// // }, +// //}, +// //{ +// // Description: "Kill stops healthcheck execution", +// // Setup: func(data test.Data, helpers test.Helpers) { +// // helpers.Ensure("run", "-d", "--name", data.Identifier(), +// // "--health-cmd", "echo healthy", +// // "--health-interval", "1s", +// // testutil.CommonImage, "sleep", "30") +// // time.Sleep(5 * time.Second) // Wait for at least one health check to execute +// // helpers.Ensure("kill", data.Identifier()) // Kill the container +// // time.Sleep(3 * time.Second) // Wait to allow any potential extra healthchecks (shouldn't happen) +// // }, +// // Cleanup: func(data test.Data, helpers test.Helpers) { +// // helpers.Anyhow("rm", "-f", data.Identifier()) +// // }, +// // Expected: func(data test.Data, helpers test.Helpers) *test.Expected { +// // return &test.Expected{ +// // ExitCode: 0, +// // Output: expect.All(func(stdout, _ string, t *testing.T) { +// // inspect := nerdtest.InspectContainer(helpers, data.Identifier()) 
+// // h := inspect.State.Health +// // assert.Assert(t, h != nil, "expected health state to be present") +// // assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry") +// // +// // // Get container FinishedAt timestamp +// // containerEnd, err := time.Parse(time.RFC3339Nano, inspect.State.FinishedAt) +// // assert.NilError(t, err, "parsing container FinishedAt") +// // +// // // Assert all healthcheck log start times are before container finished +// // for _, entry := range h.Log { +// // assert.NilError(t, err, "parsing healthcheck Start time") +// // assert.Assert(t, entry.Start.Before(containerEnd), "healthcheck ran after container was killed") +// // } +// // }), +// // } +// // }, +// //}, + +// // { +// // Description: "Pause/unpause halts and resumes healthcheck execution", +// // Setup: func(data test.Data, helpers test.Helpers) { +// // data.Labels().Set("cID", data.Identifier()) +// // helpers.Ensure("run", "-d", "--name", data.Identifier(), +// // "--health-cmd", "echo healthy", +// // "--health-interval", "1s", +// // testutil.CommonImage, "sleep", "30") +// // time.Sleep(4 * time.Second) + +// // // Inspect using raw command +// // helpers.Command("container", "inspect", data.Labels().Get("cID")). 
+// // Run(&test.Expected{ +// // ExitCode: expect.ExitCodeNoCheck, +// // Output: func(stdout string, _ string, t *testing.T) { +// // var dc []dockercompat.Container +// // err := json.Unmarshal([]byte(stdout), &dc) +// // assert.NilError(t, err) +// // assert.Equal(t, len(dc), 1) +// // h := dc[0].State.Health +// // assert.Assert(t, h != nil, "expected health state to be present") +// // data.Labels().Set("healthStatus", h.Status) +// // data.Labels().Set("logCount", strconv.Itoa(len(h.Log))) +// // fmt.Printf("📋 Setup Inspect: Status=%s, LogCount=%s\n", h.Status, strconv.Itoa(len(h.Log))) +// // }, +// // }) +// // }, +// // Cleanup: func(data test.Data, helpers test.Helpers) { +// // helpers.Anyhow("rm", "-f", data.Identifier()) +// // }, +// // Expected: func(data test.Data, helpers test.Helpers) *test.Expected { +// // return &test.Expected{ +// // ExitCode: 0, +// // Output: expect.All(func(stdout, _ string, t *testing.T) { +// // before := data.Labels().Get("logCountBeforePause") +// // after := data.Labels().Get("logCountAfterUnpause") + +// // beforeCount, _ := strconv.Atoi(before) +// // afterCount, _ := strconv.Atoi(after) + +// // assert.Assert(t, afterCount > beforeCount, +// // "expected more healthchecks after unpause (got %d → %d)", beforeCount, afterCount) +// // }), +// // } +// // }, +// // }, +// } + +// testCase.Run(t) +// } diff --git a/cmd/nerdctl/container/container_run.go b/cmd/nerdctl/container/container_run.go index 67b797bdce9..0b746a7e397 100644 --- a/cmd/nerdctl/container/container_run.go +++ b/cmd/nerdctl/container/container_run.go @@ -22,12 +22,8 @@ import ( "runtime" "strings" - "github.com/spf13/cobra" - "golang.org/x/term" - "github.com/containerd/console" "github.com/containerd/log" - "github.com/containerd/nerdctl/v2/cmd/nerdctl/completion" "github.com/containerd/nerdctl/v2/pkg/annotations" "github.com/containerd/nerdctl/v2/pkg/api/types" @@ -37,11 +33,14 @@ import ( "github.com/containerd/nerdctl/v2/pkg/containerutil" 
"github.com/containerd/nerdctl/v2/pkg/defaults" "github.com/containerd/nerdctl/v2/pkg/errutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/logging" "github.com/containerd/nerdctl/v2/pkg/netutil" "github.com/containerd/nerdctl/v2/pkg/signalutil" "github.com/containerd/nerdctl/v2/pkg/taskutil" + "github.com/spf13/cobra" + "golang.org/x/term" ) const ( @@ -240,7 +239,6 @@ func setCreateFlags(cmd *cobra.Command) { cmd.Flags().Duration("health-timeout", 0, "Maximum time to allow one check to run (default: 30s)") cmd.Flags().Int("health-retries", 0, "Consecutive failures needed to report unhealthy (default: 3)") cmd.Flags().Duration("health-start-period", 0, "Start period for the container to initialize before starting health-retries countdown") - cmd.Flags().Duration("health-start-interval", 0, "Time between running the checks during the start period") cmd.Flags().Bool("no-healthcheck", false, "Disable any container-specified HEALTHCHECK") // #region env flags @@ -445,6 +443,14 @@ func runAction(cmd *cobra.Command, args []string) error { return err } + // Setup container healthchecks. 
+ if err := healthcheck.CreateTimer(ctx, c); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, c); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + if createOpt.Detach { fmt.Fprintln(createOpt.Stdout, id) return nil diff --git a/cmd/nerdctl/container/container_run_test.go b/cmd/nerdctl/container/container_run_test.go index 64692a3ead7..e3d50940f8b 100644 --- a/cmd/nerdctl/container/container_run_test.go +++ b/cmd/nerdctl/container/container_run_test.go @@ -841,6 +841,9 @@ func TestRunDomainname(t *testing.T) { } func TestRunHealthcheckFlags(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } testCase := nerdtest.Setup() testCases := []struct { @@ -990,6 +993,9 @@ func TestRunHealthcheckFlags(t *testing.T) { } func TestRunHealthcheckFromImage(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } nerdtest.Setup() dockerfile := fmt.Sprintf(`FROM %s diff --git a/cmd/nerdctl/helpers/flagutil.go b/cmd/nerdctl/helpers/flagutil.go index 32217ae95c6..22fc1acb1bf 100644 --- a/cmd/nerdctl/helpers/flagutil.go +++ b/cmd/nerdctl/helpers/flagutil.go @@ -52,8 +52,7 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error { options.HealthInterval != 0 || options.HealthTimeout != 0 || options.HealthRetries != 0 || - options.HealthStartPeriod != 0 || - options.HealthStartInterval != 0 + options.HealthStartPeriod != 0 if options.NoHealthcheck { if options.HealthCmd != "" || healthFlagsSet { @@ -74,9 +73,6 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error { if options.HealthStartPeriod < 0 { return fmt.Errorf("--health-start-period cannot be negative") } - if options.HealthStartInterval < 0 { - return fmt.Errorf("--health-start-interval cannot be negative") - } return nil } diff --git 
a/docs/healthchecks.md b/docs/healthchecks.md index b47c92748b5..5e4250b96a0 100644 --- a/docs/healthchecks.md +++ b/docs/healthchecks.md @@ -1,51 +1,74 @@ # Health Check Support in nerdctl -`nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command. +`nerdctl` supports Docker-compatible health checks for containers, allowing you to monitor container health through user-defined commands. -Currently, health checks can be triggered manually using the nerdctl container healthcheck command. Automatic orchestration (e.g., periodic checks) will be added in a future update. +## Configuration Options Health checks can be configured in multiple ways: -1. At container creation time using nerdctl run or nerdctl create with `--health-*` flags +1. At container creation time using `nerdctl run` or `nerdctl create` with these flags: + - `--health-cmd`: Command to run to check health + - `--health-interval`: Time between running the check (default: 30s) + - `--health-timeout`: Maximum time to allow one check to run (default: 30s) + - `--health-retries`: Consecutive failures needed to report unhealthy (default: 3) + - `--health-start-period`: Start period for the container to initialize before starting health-retries countdown + - `--no-healthcheck`: Disable any container-specified HEALTHCHECK + 2. At image build time using HEALTHCHECK in a Dockerfile -3. In docker-compose.yaml files, if using nerdctl compose +3. In docker-compose.yaml files when using nerdctl compose + +## Configuration Priority -When a container is created, nerdctl determines the health check configuration based on the following priority: +When a container is created, nerdctl determines the health check configuration based on this priority: -1. **CLI flags** take highest precedence (e.g., `--health-cmd`, etc.) -2. If no CLI flags are set, nerdctl will use any health check defined in the image. +1. 
CLI flags take highest precedence (e.g., `--health-cmd`, etc.) +2. If no CLI flags are set, nerdctl will use any health check defined in the image 3. If neither is present, no health check will be configured -Example: +## Automatic Health Checks with systemd -```bash -nerdctl run --name web --health-cmd="curl -f http://localhost || exit 1" --health-interval=30s --health-timeout=5s --health-retries=3 nginx -``` +On Linux systems with systemd, nerdctl automatically creates and manages systemd timer units to execute health checks at the configured intervals. This provides reliable scheduling and execution of health checks without requiring a persistent daemon. -### Disabling Health Checks +### Requirements for Automatic Health Checks -You can disable health checks using the following flag during container create/run: - -```bash ---no-healthcheck -``` +- systemd must be available on the system +- Container must not be running in rootless mode +- Environment variable `DISABLE_HC_SYSTEMD` must not be set to "true" -### Running Health Checks Manually +### How It Works -nerdctl provides a container healthcheck command that can be manually triggered by the user. This command runs the -configured health check inside the container and reports the result. It serves as the entry point for executing -health checks, especially in scenarios where external scheduling is used. +1. When a container with health checks is started, nerdctl: + - Creates a systemd timer unit for the container + - Configures the timer according to the health check interval + - Starts monitoring the container's health status -Example: +2. The health check status can be one of: + - `starting`: During container initialization + - `healthy`: When health checks are passing + - `unhealthy`: After specified number of consecutive failures +## Examples -``` -nerdctl container healthcheck +1.
Basic health check that verifies a web server: +```bash +nerdctl run -d --name web \ + --health-cmd="curl -f http://localhost/ || exit 1" \ + --health-interval=5s \ + --health-retries=3 \ + nginx ``` -### Future Work (WIP) +2. Health check with initialization period: +```bash +nerdctl run -d --name app \ + --health-cmd="./health-check.sh" \ + --health-interval=30s \ + --health-timeout=10s \ + --health-retries=3 \ + --health-start-period=60s \ + myapp +``` -Since nerdctl is daemonless and does not have a persistent background process, we rely on systemd(or external schedulers) -to invoke nerdctl container healthcheck at configured intervals. This allows periodic health checks for containers in a -systemd-based environment. We are actively working on automating health checks, where we will listen to container lifecycle -events and generate appropriate systemd service and timer units. This will enable nerdctl to support automated, -Docker-compatible health checks by leveraging systemd for scheduling and lifecycle integration. \ No newline at end of file +3. 
Disable health checks: +```bash +nerdctl run --no-healthcheck myapp +``` diff --git a/go.mod b/go.mod index 3b8062c33e9..282dd278c17 100644 --- a/go.mod +++ b/go.mod @@ -120,7 +120,7 @@ require ( github.com/santhosh-tekuri/jsonschema/v6 v6.0.1 // indirect github.com/sasha-s/go-deadlock v0.3.5 // indirect //gomodjail:unconfined - github.com/sirupsen/logrus v1.9.3 // indirect + github.com/sirupsen/logrus v1.9.3 github.com/smallstep/pkcs7 v0.1.1 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 // indirect diff --git a/pkg/api/types/container_types.go b/pkg/api/types/container_types.go index 3e157bb303d..a19fb5aea1f 100644 --- a/pkg/api/types/container_types.go +++ b/pkg/api/types/container_types.go @@ -292,13 +292,12 @@ type ContainerCreateOptions struct { ImagePullOpt ImagePullOptions // Healthcheck related fields - HealthCmd string - HealthInterval time.Duration - HealthTimeout time.Duration - HealthRetries int - HealthStartPeriod time.Duration - HealthStartInterval time.Duration - NoHealthcheck bool + HealthCmd string + HealthInterval time.Duration + HealthTimeout time.Duration + HealthRetries int + HealthStartPeriod time.Duration + NoHealthcheck bool // UserNS name for user namespace mapping of container UserNS string diff --git a/pkg/cmd/container/create.go b/pkg/cmd/container/create.go index a0c8fc2bf9b..232d8a27b77 100644 --- a/pkg/cmd/container/create.go +++ b/pkg/cmd/container/create.go @@ -891,9 +891,6 @@ func withHealthcheck(options types.ContainerCreateOptions, ensuredImage *imgutil if options.HealthStartPeriod != 0 { hc.StartPeriod = options.HealthStartPeriod } - if options.HealthStartInterval != 0 { - hc.StartInterval = options.HealthStartInterval - } // If no healthcheck config is set (via CLI or image), return empty string so we skip adding to container config. 
if reflect.DeepEqual(hc, &healthcheck.Healthcheck{}) { diff --git a/pkg/cmd/container/health_check.go b/pkg/cmd/container/health_check.go index 6fe31c8ebc3..e2646497c3f 100644 --- a/pkg/cmd/container/health_check.go +++ b/pkg/cmd/container/health_check.go @@ -59,7 +59,6 @@ func HealthCheck(ctx context.Context, client *containerd.Client, container conta hcConfig.Interval = timeoutWithDefault(hcConfig.Interval, healthcheck.DefaultProbeInterval) hcConfig.Timeout = timeoutWithDefault(hcConfig.Timeout, healthcheck.DefaultProbeTimeout) hcConfig.StartPeriod = timeoutWithDefault(hcConfig.StartPeriod, healthcheck.DefaultStartPeriod) - hcConfig.StartInterval = timeoutWithDefault(hcConfig.StartInterval, healthcheck.DefaultStartInterval) if hcConfig.Retries == 0 { hcConfig.Retries = healthcheck.DefaultProbeRetries } diff --git a/pkg/cmd/container/kill.go b/pkg/cmd/container/kill.go index 080336d9f87..d42a7cd8c82 100644 --- a/pkg/cmd/container/kill.go +++ b/pkg/cmd/container/kill.go @@ -35,6 +35,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/api/types" "github.com/containerd/nerdctl/v2/pkg/clientutil" "github.com/containerd/nerdctl/v2/pkg/containerutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/netutil" @@ -111,6 +112,11 @@ func killContainer(ctx context.Context, container containerd.Container, signal s return err } + // Clean up healthcheck systemd units + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + log.G(ctx).Warnf("failed to clean up healthcheck units for container %s: %s", container.ID(), err) + } + // signal will be sent once resume is finished if paused { if err := task.Resume(ctx); err != nil { diff --git a/pkg/cmd/container/remove.go b/pkg/cmd/container/remove.go index 28048a2f6a1..b9df2b2acaf 100644 --- a/pkg/cmd/container/remove.go +++ b/pkg/cmd/container/remove.go @@ 
-34,6 +34,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/clientutil" "github.com/containerd/nerdctl/v2/pkg/containerutil" "github.com/containerd/nerdctl/v2/pkg/dnsutil/hostsstore" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" "github.com/containerd/nerdctl/v2/pkg/ipcutil" "github.com/containerd/nerdctl/v2/pkg/labels" @@ -179,6 +180,11 @@ func RemoveContainer(ctx context.Context, c containerd.Container, globalOptions // Otherwise, nil the error so that we do not write the error label on the container retErr = nil + // Clean up healthcheck systemd units + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, c); err != nil { + log.G(ctx).WithError(err).Warnf("failed to clean up healthcheck units for container %q", id) + } + // Now, delete the actual container var delOpts []containerd.DeleteOpts if _, err := c.Image(ctx); err == nil { diff --git a/pkg/cmd/container/stop.go b/pkg/cmd/container/stop.go index e1f347b6b96..755686e4bd8 100644 --- a/pkg/cmd/container/stop.go +++ b/pkg/cmd/container/stop.go @@ -25,6 +25,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/api/types" "github.com/containerd/nerdctl/v2/pkg/containerutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" ) @@ -39,6 +40,9 @@ func Stop(ctx context.Context, client *containerd.Client, reqs []string, opt typ if err := cleanupNetwork(ctx, found.Container, opt.GOptions); err != nil { return fmt.Errorf("unable to cleanup network for container: %s", found.Req) } + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, found.Container); err != nil { + return fmt.Errorf("unable to cleanup healthcheck timer for container: %s: %w", found.Req, err) + } if err := containerutil.Stop(ctx, found.Container, opt.Timeout, opt.Signal); err != nil { if errdefs.IsNotFound(err) { fmt.Fprintf(opt.Stderr, "No such container: %s\n", found.Req) diff --git 
a/pkg/containerutil/containerutil.go b/pkg/containerutil/containerutil.go index 60da6f11895..65ce184c613 100644 --- a/pkg/containerutil/containerutil.go +++ b/pkg/containerutil/containerutil.go @@ -48,6 +48,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/consoleutil" "github.com/containerd/nerdctl/v2/pkg/errutil" "github.com/containerd/nerdctl/v2/pkg/formatter" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/ipcutil" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/labels/k8slabels" @@ -286,6 +287,15 @@ func Start(ctx context.Context, container containerd.Container, isAttach bool, i if err := task.Start(ctx); err != nil { return err } + + // If container has health checks configured, create and start systemd timer/service files. + if err := healthcheck.CreateTimer(ctx, container); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, container); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + if !isAttach { return nil } @@ -349,6 +359,11 @@ func Stop(ctx context.Context, container containerd.Container, timeout *time.Dur } }() + // Clean up healthcheck units if configured. + // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) + // } + if timeout == nil { t, ok := l[labels.StopTimeout] if !ok { @@ -487,6 +502,11 @@ func Pause(ctx context.Context, client *containerd.Client, id string) error { return err } + // Clean up healthcheck units if configured. 
+ // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) + // } + switch status.Status { case containerd.Paused: return fmt.Errorf("container %s is already paused", id) @@ -514,6 +534,14 @@ func Unpause(ctx context.Context, client *containerd.Client, id string) error { return err } + // Recreate healthcheck related systemd timer/service files. + if err := healthcheck.CreateTimer(ctx, container); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, container); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + switch status.Status { case containerd.Paused: return task.Resume(ctx) diff --git a/pkg/healthcheck/executor.go b/pkg/healthcheck/executor.go index f5c4216b302..d362d988cf8 100644 --- a/pkg/healthcheck/executor.go +++ b/pkg/healthcheck/executor.go @@ -128,26 +128,42 @@ func updateHealthStatus(ctx context.Context, container containerd.Container, hcC currentHealth = &HealthState{ Status: Starting, FailingStreak: 0, + StartPeriod: hcConfig.StartPeriod > 0, } } - // Check if still within start period - startPeriod := hcConfig.StartPeriod + // Get container info for start period check info, err := container.Info(ctx) if err != nil { return fmt.Errorf("failed to get container info: %w", err) } containerCreated := info.CreatedAt - stillInStartPeriod := hcResult.Start.Sub(containerCreated) < startPeriod - - // Update health status based on exit code - if hcResult.ExitCode == 0 { - currentHealth.Status = Healthy - currentHealth.FailingStreak = 0 - } else if !stillInStartPeriod { - currentHealth.FailingStreak++ - if currentHealth.FailingStreak >= hcConfig.Retries { - currentHealth.Status = Unhealthy + + // Check if we're in start period workflow + inStartPeriodTime := hcResult.Start.Sub(containerCreated) < hcConfig.StartPeriod + 
inStartPeriodState := currentHealth.StartPeriod + + if inStartPeriodTime && inStartPeriodState { + // Start Period Workflow + if hcResult.ExitCode == 0 { + // First healthy result transitions us out of start period + currentHealth.Status = Healthy + currentHealth.FailingStreak = 0 + currentHealth.StartPeriod = false + } + // Ignore unhealthy results during start period + } else { + // Health Interval Workflow + if hcResult.ExitCode == 0 { + if currentHealth.Status != Healthy { + currentHealth.Status = Healthy + currentHealth.FailingStreak = 0 + } + } else { + currentHealth.FailingStreak++ + if currentHealth.FailingStreak >= hcConfig.Retries && currentHealth.Status != Unhealthy { + currentHealth.Status = Unhealthy + } } } diff --git a/pkg/healthcheck/health.go b/pkg/healthcheck/health.go index 8e0301b492a..b38072e8562 100644 --- a/pkg/healthcheck/health.go +++ b/pkg/healthcheck/health.go @@ -43,7 +43,6 @@ const ( DefaultProbeInterval = 30 * time.Second // Default interval between probe runs. Also applies before the first probe. DefaultProbeTimeout = 30 * time.Second // Max duration a single probe run may take before it's considered failed. DefaultStartPeriod = 0 * time.Second // Grace period for container startup before health checks count as failures. - DefaultStartInterval = 5 * time.Second // Interval between checks during the start period. DefaultProbeRetries = 3 // Number of consecutive failures before marking container as unhealthy. MaxLogEntries = 5 // Maximum number of health check log entries to keep. MaxOutputLenForInspect = 4096 // Max output length (in bytes) stored in health check logs during inspect. Longer outputs are truncated. 
@@ -70,18 +69,18 @@ type HealthcheckResult struct { // Healthcheck represents the health check configuration type Healthcheck struct { - Test []string `json:"Test,omitempty"` // Test is the check to perform that the container is healthy - Interval time.Duration `json:"Interval,omitempty"` // Interval is the time to wait between checks - Timeout time.Duration `json:"Timeout,omitempty"` // Timeout is the time to wait before considering the check to have hung - Retries int `json:"Retries,omitempty"` // Retries is the number of consecutive failures needed to consider a container as unhealthy - StartPeriod time.Duration `json:"StartPeriod,omitempty"` // StartPeriod is the period for the container to initialize before the health check starts - StartInterval time.Duration `json:"StartInterval,omitempty"` // StartInterval is the time between health checks during the start period + Test []string `json:"Test,omitempty"` // Test is the check to perform that the container is healthy + Interval time.Duration `json:"Interval,omitempty"` // Interval is the time to wait between checks + Timeout time.Duration `json:"Timeout,omitempty"` // Timeout is the time to wait before considering the check to have hung + Retries int `json:"Retries,omitempty"` // Retries is the number of consecutive failures needed to consider a container as unhealthy + StartPeriod time.Duration `json:"StartPeriod,omitempty"` // StartPeriod is the period for the container to initialize before the health check starts } // HealthState stores the current health state of a container type HealthState struct { Status HealthStatus // Status is one of [Starting], [Healthy] or [Unhealthy] FailingStreak int // FailingStreak is the number of consecutive failures + StartPeriod bool // StartPeriod indicates if we're in the start period workflow } // ToJSONString serializes HealthState to a JSON string for label storage diff --git a/pkg/healthcheck/healthcheck_manager_darwin.go b/pkg/healthcheck/healthcheck_manager_darwin.go 
new file mode 100644 index 00000000000..63085d4feae --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_darwin.go @@ -0,0 +1,43 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer is a no-op in this darwin variant of the package: it ignores ctx and container, sets up no healthcheck timer or service, and always returns nil. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer is a no-op in this darwin variant of the package: it ignores ctx and container, starts nothing, and always returns nil. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles is a no-op in this darwin variant of the package: it ignores ctx and container, stops and removes nothing, and always returns nil. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID is a no-op in this darwin variant of the package: it ignores ctx and containerID, stops and removes nothing, and always returns nil. +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_freebsd.go b/pkg/healthcheck/healthcheck_manager_freebsd.go new file mode 100644 index 00000000000..63085d4feae --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_freebsd.go @@ -0,0 +1,43 @@ +/* + Copyright The containerd Authors.
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer is a no-op in this freebsd variant of the package: it ignores ctx and container, sets up no healthcheck timer or service, and always returns nil. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer is a no-op in this freebsd variant of the package: it ignores ctx and container, starts nothing, and always returns nil. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles is a no-op in this freebsd variant of the package: it ignores ctx and container, stops and removes nothing, and always returns nil. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID is a no-op in this freebsd variant of the package: it ignores ctx and containerID, stops and removes nothing, and always returns nil. +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go new file mode 100644 index 00000000000..49f5e8419bc --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -0,0 +1,204 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package healthcheck
+
+import (
+	"context"
+	"fmt"
+	"math/rand"
+	"os"
+	"os/exec"
+	"strings"
+
+	"github.com/coreos/go-systemd/v22/dbus"
+	"github.com/sirupsen/logrus"
+
+	containerd "github.com/containerd/containerd/v2/client"
+	"github.com/containerd/log"
+
+	"github.com/containerd/nerdctl/v2/pkg/defaults"
+	"github.com/containerd/nerdctl/v2/pkg/labels"
+	"github.com/containerd/nerdctl/v2/pkg/rootlessutil"
+)
+
+// CreateTimer sets up the transient systemd timer and service for healthchecks.
+//
+// It returns nil without doing anything when extractHealthcheck finds no
+// healthcheck on the container or when shouldSkipHealthCheckSystemd reports
+// that timers must not be used. Otherwise it runs systemd-run to create a
+// transient unit named after the container ID whose command is
+// "nerdctl container healthcheck <id>", passing the configured interval as
+// --on-unit-inactive. ctx bounds the systemd-run invocation. A non-nil error
+// means systemd-run failed; it includes the command's combined output.
+func CreateTimer(ctx context.Context, container containerd.Container) error {
+	hc := extractHealthcheck(ctx, container)
+	if hc == nil || shouldSkipHealthCheckSystemd(hc) {
+		return nil
+	}
+
+	containerID := container.ID()
+	hcName := hcUnitName(containerID, true)
+	logrus.Debugf("Creating healthcheck timer unit: %s", hcName)
+
+	var cmd []string
+	if rootlessutil.IsRootless() {
+		cmd = append(cmd, "--user")
+	}
+	// Forward the caller's PATH so the transient unit can locate nerdctl.
+	if path := os.Getenv("PATH"); path != "" {
+		cmd = append(cmd, "--setenv=PATH="+path)
+	}
+
+	// Always use health-interval for timer frequency
+	cmd = append(cmd, "--unit", hcName, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s")
+
+	cmd = append(cmd, "nerdctl", "container", "healthcheck", containerID)
+	if logrus.IsLevelEnabled(logrus.DebugLevel) {
+		cmd = append(cmd, "--debug")
+	}
+
+	logrus.Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " "))
+	// CommandContext ties the child process to ctx, so a cancelled or expired
+	// caller context does not leave systemd-run running.
+	run := exec.CommandContext(ctx, "systemd-run", cmd...)
+	if out, err := run.CombinedOutput(); err != nil {
+		return fmt.Errorf("systemd-run failed: %w\noutput: %s", err, strings.TrimSpace(string(out)))
+	}
+
+	return nil
+}
+
+// StartTimer starts the healthcheck timer unit.
+// TODO if we persist hcName to container state, pass that to this function.
+//
+// It returns nil without doing anything under the same conditions as
+// CreateTimer. Otherwise it connects to the systemd system bus, restarts
+// "<container ID>.service" in "fail" mode and waits for the job result.
+// It returns an error when the connection or the restart request fails, when
+// the reported job result is anything other than "done", or when ctx ends
+// before a result arrives.
+// NOTE(review): the system bus is used even when rootlessutil.IsRootless() made
+// CreateTimer pass --user to systemd-run; confirm the rootless behaviour.
+func StartTimer(ctx context.Context, container containerd.Container) error {
+	hc := extractHealthcheck(ctx, container)
+	if hc == nil || shouldSkipHealthCheckSystemd(hc) {
+		return nil
+	}
+
+	hcName := hcUnitName(container.ID(), true)
+	conn, err := dbus.NewSystemConnectionContext(ctx)
+	if err != nil {
+		return fmt.Errorf("systemd DBUS connect error: %w", err)
+	}
+	defer conn.Close()
+
+	// Capacity 1 lets the result be delivered without blocking the sender even
+	// if this function has already returned because ctx ended.
+	startChan := make(chan string, 1)
+	unit := hcName + ".service"
+	if _, err := conn.RestartUnitContext(ctx, unit, "fail", startChan); err != nil {
+		return err
+	}
+
+	select {
+	case msg := <-startChan:
+		if msg != "done" {
+			return fmt.Errorf("unexpected systemd restart result: %s", msg)
+		}
+	case <-ctx.Done():
+		return fmt.Errorf("waiting for systemd restart of %s: %w", unit, ctx.Err())
+	}
+	return nil
+}
+
+// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service.
+//
+// It returns nil without doing anything when extractHealthcheck finds no
+// healthcheck on the container or when shouldSkipHealthCheckSystemd reports
+// that timers must not be used; otherwise it delegates to
+// RemoveTransientHealthCheckFilesByID with the container's ID and returns its
+// result.
+func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error {
+	hc := extractHealthcheck(ctx, container)
+	if hc == nil || shouldSkipHealthCheckSystemd(hc) {
+		return nil
+	}
+
+	return RemoveTransientHealthCheckFilesByID(ctx, container.ID())
+}
+
+// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
+func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
+	// Nothing to clean up unless systemd is present and the healthcheck
+	// integration has not been switched off through the environment.
+	systemdEnabled := defaults.IsSystemdAvailable() && os.Getenv("DISABLE_HC_SYSTEMD") != "true"
+	if !systemdEnabled {
+		return nil
+	}
+
+	logrus.Debugf("Removing healthcheck timer unit: %s", containerID)
+
+	bg := context.Background()
+	conn, err := dbus.NewSystemConnectionContext(bg)
+	if err != nil {
+		return fmt.Errorf("systemd DBUS connect error: %w", err)
+	}
+	defer conn.Close()
+
+	base := hcUnitName(containerID, true)
+	serviceUnit := base + ".service"
+
+	// Stop the timer first, then the service. Each stop is best effort: a
+	// failed request is skipped, and a job result other than "done" is only
+	// logged as a warning.
+	stops := []struct {
+		unit       string
+		warnFormat string
+	}{
+		{unit: base + ".timer", warnFormat: "timer stop message: %s"},
+		{unit: serviceUnit, warnFormat: "service stop message: %s"},
+	}
+	for _, s := range stops {
+		result := make(chan string)
+		if _, stopErr := conn.StopUnitContext(bg, s.unit, "ignore-dependencies", result); stopErr != nil {
+			continue
+		}
+		if msg := <-result; msg != "done" {
+			logrus.Warnf(s.warnFormat, msg)
+		}
+	}
+
+	// Clear any failed state left on the service; the outcome is ignored.
+	_ = conn.ResetFailedUnitContext(bg, serviceUnit)
+	return nil
+}
+
+// hcUnitName returns a systemd unit name for a container healthcheck.
+func hcUnitName(containerID string, bare bool) string {
+	// A bare name is the container ID itself; otherwise a random hexadecimal
+	// suffix is appended after a dash.
+	if bare {
+		return containerID
+	}
+	return containerID + fmt.Sprintf("-%x", rand.Int())
+}
+
+// extractHealthcheck reads the healthcheck label from the container and decodes
+// it with HealthCheckFromJSON. It returns nil when the labels cannot be read,
+// when the label is missing or empty, or when decoding fails; the two error
+// cases are logged at debug level.
+func extractHealthcheck(ctx context.Context, container containerd.Container) *Healthcheck {
+	containerLabels, err := container.Labels(ctx)
+	if err != nil {
+		log.G(ctx).WithError(err).Debugf("could not get labels for container %s", container.ID())
+		return nil
+	}
+
+	// A missing key yields the empty string, so one comparison covers both
+	// "absent" and "present but empty".
+	raw := containerLabels[labels.HealthCheck]
+	if raw == "" {
+		return nil
+	}
+
+	parsed, err := HealthCheckFromJSON(raw)
+	if err != nil {
+		log.G(ctx).WithError(err).Debugf("invalid healthcheck config on container %s", container.ID())
+		return nil
+	}
+	return parsed
+}
+
+// shouldSkipHealthCheckSystemd determines if healthcheck timers should be skipped.
+func shouldSkipHealthCheckSystemd(hc *Healthcheck) bool {
+	// Skip when systemd is unavailable or the integration is disabled via
+	// DISABLE_HC_SYSTEMD=true.
+	if usable := defaults.IsSystemdAvailable() && os.Getenv("DISABLE_HC_SYSTEMD") != "true"; !usable {
+		return true
+	}
+
+	// Skip when there is no healthcheck, no test command, an explicit NONE
+	// test, or a zero interval.
+	return hc == nil || len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0
+}
diff --git a/pkg/healthcheck/healthcheck_manager_windows.go b/pkg/healthcheck/healthcheck_manager_windows.go
new file mode 100644
index 00000000000..63085d4feae
--- /dev/null
+++ b/pkg/healthcheck/healthcheck_manager_windows.go
@@ -0,0 +1,43 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer is a no-op in this windows variant of the package: it ignores ctx and container, sets up no healthcheck timer or service, and always returns nil. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer is a no-op in this windows variant of the package: it ignores ctx and container, starts nothing, and always returns nil. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles is a no-op in this windows variant of the package: it ignores ctx and container, stops and removes nothing, and always returns nil. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID is a no-op in this windows variant of the package: it ignores ctx and containerID, stops and removes nothing, and always returns nil. +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/ocihook/ocihook.go b/pkg/ocihook/ocihook.go index 89b6c6b1410..86d4a826f7b 100644 --- a/pkg/ocihook/ocihook.go +++ b/pkg/ocihook/ocihook.go @@ -560,8 +560,11 @@ func onCreateRuntime(opts *handlerOpts) error { } func onPostStop(opts *handlerOpts) error { + log.L.Debugf("onPostStop hook triggered for container %s (namespace: %s)", opts.state.ID, opts.state.Annotations[labels.Namespace]) + lf, err := state.New(opts.state.Annotations[labels.StateDir]) if err != nil { + log.L.WithError(err).Errorf("failed to create state store for container %s", opts.state.ID) return err } @@ -585,6 +588,7 @@ func onPostStop(opts *handlerOpts) error { ctx := context.Background() ns := opts.state.Annotations[labels.Namespace] + if opts.cni != nil { var err error b4nnEnabled, b4nnBindEnabled, err := bypass4netnsutil.IsBypass4netnsEnabled(opts.state.Annotations)