Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,21 +80,21 @@ Examples:
--name bm-e2e-1731561

Flags:
--force Skip the destructive-action confirmation prompt
-h, --help help for check-bm-server
--image-path string Installimage IMAGE path for operating system inside the Hetzner rescue system (default "/root/.oldroot/nfs/images/Ubuntu-2404-noble-amd64-base.tar.gz")
--name string HetznerBareMetalHost metadata.name. Optional if YAML contains exactly one host
--poll-interval duration Polling interval for wait steps (default 10s)
--timeout-activate-rescue duration Timeout for activating rescue boot (default 45s)
--timeout-check-disk-rescue duration Timeout for checking target disks in rescue (default 1m0s)
--timeout-ensure-ssh-key duration Timeout for ensuring SSH key in Robot (default 1m0s)
--timeout-fetch-server duration Timeout for fetching server details from Robot (default 30s)
--timeout-install duration Timeout for one Ubuntu install step (default 9m0s)
--timeout-load-input duration Timeout for input parsing + env loading (default 30s)
--timeout-reboot-os duration Timeout for rebooting into installed OS (default 45s)
--timeout-reboot-rescue duration Timeout for requesting reboot to rescue (default 45s)
--timeout-wait-os duration Timeout for waiting until installed OS is reachable (default 6m0s)
--timeout-wait-rescue duration Timeout for waiting until rescue SSH is reachable (default 8m0s)
--force Skip the destructive-action confirmation prompt
-h, --help help for check-bm-server
--image-path string Installimage IMAGE path for operating system inside the Hetzner rescue system (default "/root/.oldroot/nfs/images/Ubuntu-2404-noble-amd64-base.tar.gz")
--name string HetznerBareMetalHost metadata.name. Optional if YAML contains exactly one host
--poll-interval duration Polling interval for wait steps (default 10s)
--timeout-activate-rescue duration Timeout for activating rescue boot (default 45s)
--timeout-check-all-disks duration Timeout for checking health of all disks in rescue (pass 1 only) (default 3m0s)
--timeout-ensure-ssh-key duration Timeout for ensuring SSH key in Robot (default 1m0s)
--timeout-fetch-server duration Timeout for fetching server details from Robot (default 30s)
--timeout-install duration Timeout for one Ubuntu install step (default 9m0s)
--timeout-load-input duration Timeout for input parsing + env loading (default 30s)
--timeout-reboot-os duration Timeout for rebooting into installed OS (default 45s)
--timeout-reboot-rescue duration Timeout for requesting reboot to rescue (default 45s)
--timeout-wait-os duration Timeout for waiting until installed OS is reachable (default 6m0s)
--timeout-wait-rescue duration Timeout for waiting until rescue SSH is reachable (default 8m0s)
```

### `caphcli create-host-yaml --help`
Expand Down
2 changes: 1 addition & 1 deletion internal/cmd/check_bm_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ target server.`,
flags.DurationVar(&cfg.Timeouts.ActivateRescue, "timeout-activate-rescue", provisioncheck.DefaultActivateRescueTimeout, "Timeout for activating rescue boot")
flags.DurationVar(&cfg.Timeouts.RebootToRescue, "timeout-reboot-rescue", provisioncheck.DefaultRebootToRescueTimeout, "Timeout for requesting reboot to rescue")
flags.DurationVar(&cfg.Timeouts.WaitForRescue, "timeout-wait-rescue", provisioncheck.DefaultWaitForRescueTimeout, "Timeout for waiting until rescue SSH is reachable")
flags.DurationVar(&cfg.Timeouts.CheckDiskInRescue, "timeout-check-disk-rescue", provisioncheck.DefaultCheckDiskInRescueTimeout, "Timeout for checking target disks in rescue")
flags.DurationVar(&cfg.Timeouts.CheckAllDisksHealth, "timeout-check-all-disks", provisioncheck.DefaultCheckAllDisksHealthTimeout, "Timeout for checking health of all disks in rescue (pass 1 only)")
flags.DurationVar(&cfg.Timeouts.InstallUbuntu, "timeout-install", provisioncheck.DefaultInstallUbuntuTimeout, "Timeout for one Ubuntu install step")
flags.DurationVar(&cfg.Timeouts.RebootToOS, "timeout-reboot-os", provisioncheck.DefaultRebootToOSTimeout, "Timeout for rebooting into installed OS")
flags.DurationVar(&cfg.Timeouts.WaitForOS, "timeout-wait-os", provisioncheck.DefaultWaitForOSTimeout, "Timeout for waiting until installed OS is reachable")
Expand Down
99 changes: 59 additions & 40 deletions internal/provisioncheck/provisioncheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ const (
DefaultRebootToRescueTimeout = 45 * time.Second
// DefaultWaitForRescueTimeout is the default timeout for waiting until rescue SSH is reachable.
DefaultWaitForRescueTimeout = 8 * time.Minute
// DefaultCheckDiskInRescueTimeout is the default timeout for smartctl disk checks in rescue.
DefaultCheckDiskInRescueTimeout = 1 * time.Minute
// DefaultCheckAllDisksHealthTimeout is the default timeout for the all-disk health check in rescue.
DefaultCheckAllDisksHealthTimeout = 3 * time.Minute
// DefaultInstallUbuntuTimeout is the default timeout for one installimage run.
DefaultInstallUbuntuTimeout = 9 * time.Minute
// DefaultRebootToOSTimeout is the default timeout for rebooting into the installed OS.
Expand All @@ -79,16 +79,16 @@ const (

// Timeouts contains per-step timeouts for the provision check workflow.
// The fields are exposed on the CLI as the --timeout-* flags.
type Timeouts struct {
// Previous field set (removed side of this diff): CheckDiskInRescue is
// replaced by CheckAllDisksHealth in the field set further down.
LoadInput time.Duration
EnsureSSHKey time.Duration
FetchServerDetails time.Duration
ActivateRescue time.Duration
RebootToRescue time.Duration
WaitForRescue time.Duration
CheckDiskInRescue time.Duration
InstallUbuntu time.Duration
RebootToOS time.Duration
WaitForOS time.Duration
// Current field set (added side of this diff).
LoadInput time.Duration // input parsing and env loading
EnsureSSHKey time.Duration // ensuring the SSH key in Robot
FetchServerDetails time.Duration // fetching server details from Robot
ActivateRescue time.Duration // activating rescue boot
RebootToRescue time.Duration // requesting the reboot to rescue
WaitForRescue time.Duration // waiting until rescue SSH is reachable
CheckAllDisksHealth time.Duration // health check of all disks in rescue; only used in pass 1
InstallUbuntu time.Duration // one installimage run
RebootToOS time.Duration // rebooting into the installed OS
WaitForOS time.Duration // waiting until the installed OS is reachable
}

// Config configures the provision check run.
Expand All @@ -109,16 +109,16 @@ func DefaultConfig() Config {
ImagePath: DefaultUbuntu2404ImagePath,
PollInterval: DefaultPollInterval,
Timeouts: Timeouts{
LoadInput: DefaultLoadInputTimeout,
EnsureSSHKey: DefaultEnsureSSHKeyTimeout,
FetchServerDetails: DefaultFetchServerDetailsTimeout,
ActivateRescue: DefaultActivateRescueTimeout,
RebootToRescue: DefaultRebootToRescueTimeout,
WaitForRescue: DefaultWaitForRescueTimeout,
CheckDiskInRescue: DefaultCheckDiskInRescueTimeout,
InstallUbuntu: DefaultInstallUbuntuTimeout,
RebootToOS: DefaultRebootToOSTimeout,
WaitForOS: DefaultWaitForOSTimeout,
LoadInput: DefaultLoadInputTimeout,
EnsureSSHKey: DefaultEnsureSSHKeyTimeout,
FetchServerDetails: DefaultFetchServerDetailsTimeout,
ActivateRescue: DefaultActivateRescueTimeout,
RebootToRescue: DefaultRebootToRescueTimeout,
WaitForRescue: DefaultWaitForRescueTimeout,
CheckAllDisksHealth: DefaultCheckAllDisksHealthTimeout,
InstallUbuntu: DefaultInstallUbuntuTimeout,
RebootToOS: DefaultRebootToOSTimeout,
WaitForOS: DefaultWaitForOSTimeout,
},
Input: os.Stdin,
Output: os.Stdout,
Expand Down Expand Up @@ -159,8 +159,8 @@ func (cfg Config) withDefaults() Config {
if cfg.Timeouts.WaitForRescue == 0 {
cfg.Timeouts.WaitForRescue = defaults.Timeouts.WaitForRescue
}
if cfg.Timeouts.CheckDiskInRescue == 0 {
cfg.Timeouts.CheckDiskInRescue = defaults.Timeouts.CheckDiskInRescue
if cfg.Timeouts.CheckAllDisksHealth == 0 {
cfg.Timeouts.CheckAllDisksHealth = defaults.Timeouts.CheckAllDisksHealth
}
if cfg.Timeouts.InstallUbuntu == 0 {
cfg.Timeouts.InstallUbuntu = defaults.Timeouts.InstallUbuntu
Expand Down Expand Up @@ -210,7 +210,7 @@ func (cfg Config) Validate() error {
if err := validateTimeout("--timeout-wait-rescue", cfg.Timeouts.WaitForRescue); err != nil {
return err
}
if err := validateTimeout("--timeout-check-disk-rescue", cfg.Timeouts.CheckDiskInRescue); err != nil {
if err := validateTimeout("--timeout-check-all-disks", cfg.Timeouts.CheckAllDisksHealth); err != nil {
return err
}
if err := validateTimeout("--timeout-install", cfg.Timeouts.InstallUbuntu); err != nil {
Expand Down Expand Up @@ -459,13 +459,15 @@ func (r *runner) cycle(ctx context.Context, pass int) error {
return err
}

err = r.runStep(ctx, fmt.Sprintf("pass-%d-check-disk-in-rescue", pass), r.cfg.Timeouts.CheckDiskInRescue,
func(stepCtx context.Context, progress stepProgress) error {
ssh := r.newRescueSSHClient()
return r.checkDiskInRescue(stepCtx, ssh, progress)
})
if err != nil {
return err
if pass == 1 {
err = r.runStep(ctx, "pass-1-check-all-disks-health", r.cfg.Timeouts.CheckAllDisksHealth,
func(stepCtx context.Context, progress stepProgress) error {
ssh := r.newRescueSSHClient()
return r.checkAllDisksHealth(stepCtx, ssh, progress)
})
if err != nil {
return err
}
}

err = r.runStep(ctx, fmt.Sprintf("pass-%d-install-ubuntu-24.04", pass), r.cfg.Timeouts.InstallUbuntu,
Expand Down Expand Up @@ -625,18 +627,35 @@ func (r *runner) runInstall(ctx context.Context, ssh sshclient.Client, progress
})
}

// checkDiskInRescue is the previous implementation (removed side of this
// diff). It ran ssh.CheckDisk only on the WWNs taken from the host's
// rootDeviceHints and failed when no hints were set.
func (r *runner) checkDiskInRescue(ctx context.Context, ssh sshclient.Client, progress stepProgress) error {
rootWWNs := r.host.Spec.RootDeviceHints.ListOfWWN()
if len(rootWWNs) == 0 {
return errors.New("rootDeviceHints are required in the input HBMH")
// checkAllDisksHealth is the new implementation (added side of this diff).
// It fetches the storage details over SSH, parses every output line into a
// storageDetails value, collects the WWN of each entry whose Type is "disk"
// and whose normalized WWN is non-empty, and runs ssh.CheckDisk on that list.
// The trimmed CheckDisk output is reported through progress.
//
// It returns an error when the storage details cannot be fetched, the output
// is empty, a line cannot be parsed, no disk WWN is found, or CheckDisk fails.
func (r *runner) checkAllDisksHealth(ctx context.Context, ssh sshclient.Client, progress stepProgress) error {
// NOTE(review): this call takes no ctx, so the step timeout presumably
// cannot interrupt it — confirm against the sshclient implementation.
out := ssh.GetHardwareDetailsStorage()
if out.Err != nil {
return fmt.Errorf("get storage details: %w", out.Err)
}
if strings.TrimSpace(out.StdOut) == "" {
return errors.New("storage output is empty")
}

diskInfo, err := ssh.CheckDisk(ctx, rootWWNs)
if err != nil {
return fmt.Errorf("check-disk failed: %w", err)
var allWWNs []string
// Only the outer whitespace is trimmed before splitting.
// NOTE(review): a blank line inside the output is passed to json.Unmarshal
// as-is and would presumably abort the whole check — confirm what
// validJSONFromSSHOutput returns for an empty line.
for _, line := range strings.Split(strings.TrimSpace(out.StdOut), "\n") {
var s storageDetails
if err := json.Unmarshal([]byte(validJSONFromSSHOutput(line)), &s); err != nil {
return fmt.Errorf("parse lsblk line %q: %w", line, err)
}
// NOTE(review): the filter tests the normalized WWN, but the raw s.WWN is
// what gets appended — confirm CheckDisk accepts the unnormalized form.
if s.Type == "disk" && normalizeWWN(s.WWN) != "" {
allWWNs = append(allWWNs, s.WWN)
}
}

// Fail instead of calling CheckDisk with an empty WWN list.
if len(allWWNs) == 0 {
return errors.New("no disk WWNs found — cannot run all-disk health check")
}

progress("check-disk ok: %s", strings.TrimSpace(diskInfo))
diskInfo, err := ssh.CheckDisk(ctx, allWWNs)
if err != nil {
return fmt.Errorf("check-all-disks failed: %w", err)
}
progress("check-all-disks ok:\n%s", strings.TrimSpace(diskInfo))
return nil
}

Expand Down
Loading