diff --git a/README.md b/README.md index 248a8c6..f5eb72e 100644 --- a/README.md +++ b/README.md @@ -80,21 +80,21 @@ Examples: --name bm-e2e-1731561 Flags: - --force Skip the destructive-action confirmation prompt - -h, --help help for check-bm-server - --image-path string Installimage IMAGE path for operating system inside the Hetzner rescue system (default "/root/.oldroot/nfs/images/Ubuntu-2404-noble-amd64-base.tar.gz") - --name string HetznerBareMetalHost metadata.name. Optional if YAML contains exactly one host - --poll-interval duration Polling interval for wait steps (default 10s) - --timeout-activate-rescue duration Timeout for activating rescue boot (default 45s) - --timeout-check-disk-rescue duration Timeout for checking target disks in rescue (default 1m0s) - --timeout-ensure-ssh-key duration Timeout for ensuring SSH key in Robot (default 1m0s) - --timeout-fetch-server duration Timeout for fetching server details from Robot (default 30s) - --timeout-install duration Timeout for one Ubuntu install step (default 9m0s) - --timeout-load-input duration Timeout for input parsing + env loading (default 30s) - --timeout-reboot-os duration Timeout for rebooting into installed OS (default 45s) - --timeout-reboot-rescue duration Timeout for requesting reboot to rescue (default 45s) - --timeout-wait-os duration Timeout for waiting until installed OS is reachable (default 6m0s) - --timeout-wait-rescue duration Timeout for waiting until rescue SSH is reachable (default 8m0s) + --force Skip the destructive-action confirmation prompt + -h, --help help for check-bm-server + --image-path string Installimage IMAGE path for operating system inside the Hetzner rescue system (default "/root/.oldroot/nfs/images/Ubuntu-2404-noble-amd64-base.tar.gz") + --name string HetznerBareMetalHost metadata.name. Optional if YAML contains exactly one host + --poll-interval duration Polling interval for wait steps (default 10s) + --timeout-activate-rescue duration Timeout for activating rescue boot (default 45s) + --timeout-check-all-disks duration Timeout for checking health of all disks in rescue (pass 1 only) (default 3m0s) + --timeout-ensure-ssh-key duration Timeout for ensuring SSH key in Robot (default 1m0s) + --timeout-fetch-server duration Timeout for fetching server details from Robot (default 30s) + --timeout-install duration Timeout for one Ubuntu install step (default 9m0s) + --timeout-load-input duration Timeout for input parsing + env loading (default 30s) + --timeout-reboot-os duration Timeout for rebooting into installed OS (default 45s) + --timeout-reboot-rescue duration Timeout for requesting reboot to rescue (default 45s) + --timeout-wait-os duration Timeout for waiting until installed OS is reachable (default 6m0s) + --timeout-wait-rescue duration Timeout for waiting until rescue SSH is reachable (default 8m0s) ``` ### `caphcli create-host-yaml --help` diff --git a/internal/cmd/check_bm_server.go b/internal/cmd/check_bm_server.go index c8a9a54..a77d549 100644 --- a/internal/cmd/check_bm_server.go +++ b/internal/cmd/check_bm_server.go @@ -53,7 +53,7 @@ target server.`, flags.DurationVar(&cfg.Timeouts.ActivateRescue, "timeout-activate-rescue", provisioncheck.DefaultActivateRescueTimeout, "Timeout for activating rescue boot") flags.DurationVar(&cfg.Timeouts.RebootToRescue, "timeout-reboot-rescue", provisioncheck.DefaultRebootToRescueTimeout, "Timeout for requesting reboot to rescue") flags.DurationVar(&cfg.Timeouts.WaitForRescue, "timeout-wait-rescue", provisioncheck.DefaultWaitForRescueTimeout, "Timeout for waiting until rescue SSH is reachable") - flags.DurationVar(&cfg.Timeouts.CheckDiskInRescue, "timeout-check-disk-rescue", provisioncheck.DefaultCheckDiskInRescueTimeout, "Timeout for checking target disks in rescue") + flags.DurationVar(&cfg.Timeouts.CheckAllDisksHealth, "timeout-check-all-disks", provisioncheck.DefaultCheckAllDisksHealthTimeout, "Timeout for checking health of all disks in rescue (pass 1 only)") flags.DurationVar(&cfg.Timeouts.InstallUbuntu, "timeout-install", provisioncheck.DefaultInstallUbuntuTimeout, "Timeout for one Ubuntu install step") flags.DurationVar(&cfg.Timeouts.RebootToOS, "timeout-reboot-os", provisioncheck.DefaultRebootToOSTimeout, "Timeout for rebooting into installed OS") flags.DurationVar(&cfg.Timeouts.WaitForOS, "timeout-wait-os", provisioncheck.DefaultWaitForOSTimeout, "Timeout for waiting until installed OS is reachable") diff --git a/internal/provisioncheck/provisioncheck.go b/internal/provisioncheck/provisioncheck.go index 27927c5..9dff180 100644 --- a/internal/provisioncheck/provisioncheck.go +++ b/internal/provisioncheck/provisioncheck.go @@ -63,8 +63,8 @@ const ( DefaultRebootToRescueTimeout = 45 * time.Second // DefaultWaitForRescueTimeout is the default timeout for waiting until rescue SSH is reachable. DefaultWaitForRescueTimeout = 8 * time.Minute - // DefaultCheckDiskInRescueTimeout is the default timeout for smartctl disk checks in rescue. - DefaultCheckDiskInRescueTimeout = 1 * time.Minute + // DefaultCheckAllDisksHealthTimeout is the default timeout for the all-disk health check in rescue. + DefaultCheckAllDisksHealthTimeout = 3 * time.Minute // DefaultInstallUbuntuTimeout is the default timeout for one installimage run. DefaultInstallUbuntuTimeout = 9 * time.Minute // DefaultRebootToOSTimeout is the default timeout for rebooting into the installed OS. @@ -79,16 +79,16 @@ const ( // Timeouts contains per-step timeouts for the provision check workflow. type Timeouts struct { - LoadInput time.Duration - EnsureSSHKey time.Duration - FetchServerDetails time.Duration - ActivateRescue time.Duration - RebootToRescue time.Duration - WaitForRescue time.Duration - CheckDiskInRescue time.Duration - InstallUbuntu time.Duration - RebootToOS time.Duration - WaitForOS time.Duration + LoadInput time.Duration + EnsureSSHKey time.Duration + FetchServerDetails time.Duration + ActivateRescue time.Duration + RebootToRescue time.Duration + WaitForRescue time.Duration + CheckAllDisksHealth time.Duration + InstallUbuntu time.Duration + RebootToOS time.Duration + WaitForOS time.Duration } // Config configures the provision check run. @@ -109,16 +109,16 @@ func DefaultConfig() Config { ImagePath: DefaultUbuntu2404ImagePath, PollInterval: DefaultPollInterval, Timeouts: Timeouts{ - LoadInput: DefaultLoadInputTimeout, - EnsureSSHKey: DefaultEnsureSSHKeyTimeout, - FetchServerDetails: DefaultFetchServerDetailsTimeout, - ActivateRescue: DefaultActivateRescueTimeout, - RebootToRescue: DefaultRebootToRescueTimeout, - WaitForRescue: DefaultWaitForRescueTimeout, - CheckDiskInRescue: DefaultCheckDiskInRescueTimeout, - InstallUbuntu: DefaultInstallUbuntuTimeout, - RebootToOS: DefaultRebootToOSTimeout, - WaitForOS: DefaultWaitForOSTimeout, + LoadInput: DefaultLoadInputTimeout, + EnsureSSHKey: DefaultEnsureSSHKeyTimeout, + FetchServerDetails: DefaultFetchServerDetailsTimeout, + ActivateRescue: DefaultActivateRescueTimeout, + RebootToRescue: DefaultRebootToRescueTimeout, + WaitForRescue: DefaultWaitForRescueTimeout, + CheckAllDisksHealth: DefaultCheckAllDisksHealthTimeout, + InstallUbuntu: DefaultInstallUbuntuTimeout, + RebootToOS: DefaultRebootToOSTimeout, + WaitForOS: DefaultWaitForOSTimeout, }, Input: os.Stdin, Output: os.Stdout, @@ -159,8 +159,8 @@ func (cfg Config) withDefaults() Config { if cfg.Timeouts.WaitForRescue == 0 { cfg.Timeouts.WaitForRescue = defaults.Timeouts.WaitForRescue } - if cfg.Timeouts.CheckDiskInRescue == 0 { - cfg.Timeouts.CheckDiskInRescue = defaults.Timeouts.CheckDiskInRescue + if cfg.Timeouts.CheckAllDisksHealth == 0 { + cfg.Timeouts.CheckAllDisksHealth = defaults.Timeouts.CheckAllDisksHealth } if cfg.Timeouts.InstallUbuntu == 0 { cfg.Timeouts.InstallUbuntu = defaults.Timeouts.InstallUbuntu @@ -210,7 +210,7 @@ func (cfg Config) Validate() error { if err := validateTimeout("--timeout-wait-rescue", cfg.Timeouts.WaitForRescue); err != nil { return err } - if err := validateTimeout("--timeout-check-disk-rescue", cfg.Timeouts.CheckDiskInRescue); err != nil { + if err := validateTimeout("--timeout-check-all-disks", cfg.Timeouts.CheckAllDisksHealth); err != nil { return err } if err := validateTimeout("--timeout-install", cfg.Timeouts.InstallUbuntu); err != nil { @@ -459,13 +459,15 @@ func (r *runner) cycle(ctx context.Context, pass int) error { return err } - err = r.runStep(ctx, fmt.Sprintf("pass-%d-check-disk-in-rescue", pass), r.cfg.Timeouts.CheckDiskInRescue, - func(stepCtx context.Context, progress stepProgress) error { - ssh := r.newRescueSSHClient() - return r.checkDiskInRescue(stepCtx, ssh, progress) - }) - if err != nil { - return err + if pass == 1 { + err = r.runStep(ctx, "pass-1-check-all-disks-health", r.cfg.Timeouts.CheckAllDisksHealth, + func(stepCtx context.Context, progress stepProgress) error { + ssh := r.newRescueSSHClient() + return r.checkAllDisksHealth(stepCtx, ssh, progress) + }) + if err != nil { + return err + } } err = r.runStep(ctx, fmt.Sprintf("pass-%d-install-ubuntu-24.04", pass), r.cfg.Timeouts.InstallUbuntu, @@ -625,18 +627,35 @@ func (r *runner) runInstall(ctx context.Context, ssh sshclient.Client, progress }) } -func (r *runner) checkDiskInRescue(ctx context.Context, ssh sshclient.Client, progress stepProgress) error { - rootWWNs := r.host.Spec.RootDeviceHints.ListOfWWN() - if len(rootWWNs) == 0 { - return errors.New("rootDeviceHints are required in the input HBMH") +func (r *runner) checkAllDisksHealth(ctx context.Context, ssh sshclient.Client, progress stepProgress) error { + out := ssh.GetHardwareDetailsStorage() + if out.Err != nil { + return fmt.Errorf("get storage details: %w", out.Err) + } + if strings.TrimSpace(out.StdOut) == "" { + return errors.New("storage output is empty") } - diskInfo, err := ssh.CheckDisk(ctx, rootWWNs) - if err != nil { - return fmt.Errorf("check-disk failed: %w", err) + var allWWNs []string + for _, line := range strings.Split(strings.TrimSpace(out.StdOut), "\n") { + var s storageDetails + if err := json.Unmarshal([]byte(validJSONFromSSHOutput(line)), &s); err != nil { + return fmt.Errorf("parse lsblk line %q: %w", line, err) + } + if s.Type == "disk" && normalizeWWN(s.WWN) != "" { + allWWNs = append(allWWNs, s.WWN) + } + } + + if len(allWWNs) == 0 { + return errors.New("no disk WWNs found — cannot run all-disk health check") } - progress("check-disk ok: %s", strings.TrimSpace(diskInfo)) + diskInfo, err := ssh.CheckDisk(ctx, allWWNs) + if err != nil { + return fmt.Errorf("check-all-disks failed: %w", err) + } + progress("check-all-disks ok:\n%s", strings.TrimSpace(diskInfo)) return nil }