Skip to content
Open
10 changes: 2 additions & 8 deletions scripts/multinode/configure-node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,8 @@ function copy_bootstrap_kubeconfig() {
}

function run_healthcheck() {
if ! sudo systemctl start greenboot-healthcheck; then
echo "Error: Failed to start greenboot-healthcheck service"
exit 1
fi

greenboot_status=$(systemctl show -p Result --value greenboot-healthcheck)
if [ "${greenboot_status}" != "success" ]; then
echo "Error: greenboot-healthcheck did not complete successfully (Result: ${greenboot_status})"
if ! sudo microshift healthcheck -v=2 --timeout=600s; then
echo "Error: Failed to run the 'microshift healthcheck' command"
exit 1
fi
}
Expand Down
21 changes: 5 additions & 16 deletions test/assets/auto-recovery/microshift-auto-recovery
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,11 @@ set -xeuo pipefail
# boot_success=0 is set when deployment is staged or when grub boots the system.
# boot_success=1 is set when greenboot succeeds after deploying new image.

# At this time we cannot depend on missing boot_counter meaning system is done with the "deployment testing & rebooting"
# because of a bug in greenboot: set-success tries to clear boot_counter from wrong grub env file.
if grep -q "/boot/grubenv" /usr/libexec/greenboot/greenboot-grub2-set-success; then
if grub2-editenv - list | grep -q ^boot_success=0; then
echo "Greenboot didn't decide the system is healthy after staging new deployment."
echo "Quiting to not interfere with the process"
exit 0
fi
else
# greenboot-grub2-set-success uses correct path.
# When the deployment testing is done, boot_counter should be removed.
if grub2-editenv - list | grep -q ^boot_counter=; then
echo "Greenboot didn't decide the system is healthy after staging new deployment."
echo "Quiting to not interfere with the process"
exit 0
fi
# When the deployment testing is done, boot_counter should be removed.
if grub2-editenv - list | grep -q ^boot_counter=; then
echo "Greenboot didn't decide the system is healthy after staging new deployment."
echo "Quiting to not interfere with the process"
exit 0
fi

/usr/bin/microshift restore --auto-recovery /var/lib/microshift-auto-recovery
Expand Down
21 changes: 5 additions & 16 deletions test/assets/auto-recovery/red-script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,11 @@ set -xeuo pipefail
# boot_success=0 is set when deployment is staged or when grub boots the system.
# boot_success=1 is set when greenboot succeeds after deploying new image.

# At this time we cannot depend on missing boot_counter meaning system is done with the "deployment testing & rebooting"
# because of a bug in greenboot: set-success tries to clear boot_counter from wrong grub env file.
if grep -q "/boot/grubenv" /usr/libexec/greenboot/greenboot-grub2-set-success; then
if grub2-editenv - list | grep -q ^boot_success=0; then
echo "Greenboot didn't decide the system is healthy after staging new deployment."
echo "Quiting to not interfere with the process"
exit 0
fi
else
# greenboot-grub2-set-success uses correct path.
# When the deployment testing is done, boot_counter should be removed.
if grub2-editenv - list | grep -q ^boot_counter=; then
echo "Greenboot didn't decide the system is healthy after staging new deployment."
echo "Quiting to not interfere with the process"
exit 0
fi
# When the deployment testing is done, boot_counter should be removed.
if grub2-editenv - list | grep -q ^boot_counter=; then
echo "Greenboot didn't decide the system is healthy after staging new deployment."
echo "Quiting to not interfere with the process"
exit 0
fi

echo "System is unhealthy and greenboot's 'deployment testing' procedure is not active - running auto-recovery for MicroShift"
Expand Down
8 changes: 8 additions & 0 deletions test/resources/libostree.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,14 @@ def get_current_boot_id() -> str:
return boot_id.replace("-", "")


def get_last_reboots_count() -> int:
    """
    Count the 'reboot' records reported by the 'last reboot' command.

    The command is run through remote_sudo; the number printed by
    'grep -c' is converted to an integer and returned.
    """
    reboot_count = remote_sudo("last reboot | grep -c '^reboot'")
    return int(reboot_count)


def does_backup_exist(deploy_id: str, boot_id: str = "") -> bool:
prefix = get_deployment_backup_prefix_path(deploy_id)

Expand Down
27 changes: 20 additions & 7 deletions test/resources/microshift-host.resource
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,29 @@ SSH Connection To MicroShift Host Should Be Functional
Should Be Equal As Integers 0 ${rc}

Reboot MicroShift Host
[Documentation] Reboot the MicroShift host and wait until
... SSH connection is working again and boot identifier changes
[Documentation] Reboot the MicroShift host, waiting until SSH connection
... is working again and boot identifier changes. Exactly one reboot
... should have happened. Returns the new boot identifier.
...
... Expects that initial SSH connection to MicroShift host is active.

# Save the initial boot identifier and number of reboots
${bootid}= Get Current Boot Id
${reboots}= Get Last Reboots Count

# Reboot the system
SSHLibrary.Start Command reboot sudo=True
Sleep 30s

Wait Until Keyword Succeeds 5m 15s
# Wait until the system reboots and the boot identifier changes
${cur_bootid}= Wait Until Keyword Succeeds 5m 5s
... System Should Be Rebooted ${bootid}

# Verify that there was exactly one reboot
${cur_reboots}= Get Last Reboots Count
Should Be Equal As Integers ${cur_reboots} ${reboots + 1}

RETURN ${cur_bootid}

Create Thin Storage Pool
[Documentation] Create a new thin storage pool
${lvmd_vg}= Set Variable If '${LVMD_VG_OVERRIDE}' != '' ${LVMD_VG_OVERRIDE} rhel
Expand Down Expand Up @@ -107,9 +118,11 @@ System Should Not Be Ostree
System Should Be Rebooted
[Documentation] Assert if the system rebooted comparing the current and provided boot identifier
[Arguments] ${old_bootid}
${rebooted}= Is System Rebooted ${old_bootid}
${rebooted} ${cur_bootid}= Is System Rebooted ${old_bootid}
Should Be True ${rebooted}

RETURN ${cur_bootid}

Is System Rebooted
[Documentation] Check if the system rebooted comparing the current and provided boot identifier
[Arguments] ${old_bootid}
Expand All @@ -118,10 +131,10 @@ Is System Rebooted
${cur_bootid}= Get Current Boot Id
${len}= Get Length ${cur_bootid}
IF ${len} == 0
RETURN False
RETURN False ${old_bootid}
ELSE
${system_rebooted}= Evaluate '${old_bootid}' != '${cur_bootid}'
RETURN ${system_rebooted}
RETURN ${system_rebooted} ${cur_bootid}
END

Change Hostname
Expand Down
18 changes: 7 additions & 11 deletions test/resources/ostree-health.resource
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,14 @@ Greenboot Health Check Exited
Systemctl Check Service SubState greenboot-healthcheck.service exited

Restart Greenboot And Wait For Success
[Documentation] Restart the greenboot-healthcheck service and check its status
[Documentation] Verify the health of the deployments using the MicroShift healthcheck command
[Arguments] ${wait_timeout}=600s

VAR ${unit_name} greenboot-healthcheck.service

# Note that the Systemctl keyword from systemd.resource cannot be used to
# restart the greenboot-healthcheck service due to the keyword expecting
# the 'running' state after the restart. This condition does not apply on
# services like greenboot that exit after their startup finishes.
# Note that the newer implementation of greenboot-healthcheck service no
# longer allows service restart. Using the MicroShift healthcheck command
# to verify the health of the deployments.
${stdout} ${stderr} ${rc} Execute Command
... systemctl restart ${unit_name}
... microshift healthcheck -v=2 --timeout="${wait_timeout}"
... sudo=True return_stdout=True return_stderr=True return_rc=True
IF ${rc} != 0 Systemctl Print Service Status And Logs ${unit_name}
IF ${rc} != 0 Log Many ${stdout} ${stderr}
Should Be Equal As Integers 0 ${rc}

Wait Until Greenboot Health Check Exited
18 changes: 7 additions & 11 deletions test/suites/backup/auto-recovery-extra.robot
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ Using Systemd Dropin To React On Failure Of MicroShift
Auto Recovery On Red Boot
[Documentation] Verify greenboot integration to start auto-recovery procedure.

Greenboot Workaround For Boot Counter
Command Should Work rm -rf ${SAFETY_BACKUP}

Stop MicroShift
Expand All @@ -55,10 +54,13 @@ Auto Recovery On Red Boot
Corrupt Etcd Database
Start MicroShift Expecting Failure

${bootid}= Get Current Boot Id
Command Should Fail systemctl restart greenboot-healthcheck
Wait Until Keyword Succeeds 5m 15s
... System Should Be Rebooted ${bootid}
# Reboot to trigger greenboot auto-recovery, saving the boot identifier
${cur_bootid}= Reboot MicroShift Host
# The auto-recovery procedure will reboot the system after restoring the backup
${new_bootid}= Wait Until Keyword Succeeds 5m 15s
... System Should Be Rebooted ${cur_bootid}
Should Not Be Equal As Strings ${cur_bootid} ${new_bootid}
# Verify the system is healthy after auto-recovery
Wait Until Greenboot Health Check Exited

[Teardown] Run Keywords
Expand Down Expand Up @@ -157,9 +159,3 @@ Set Up Greenboot Red Script
Command Should Work mkdir -p /etc/greenboot/red.d
${drop_in}= Operating System.Get File ./assets/auto-recovery/red-script.sh
Upload String To File ${drop_in} /etc/greenboot/red.d/99-auto-recovery.sh

Greenboot Workaround For Boot Counter
[Documentation] If the grub's boot_success is 1, clears boot_counter.
# Because of greenboot's bug, we need to do it here, so the system doesn't reboot after red boot.
Command Should Work
... bash -c "grub2-editenv list | grep -q boot_success=1 && /usr/bin/grub2-editenv /boot/grub2/grubenv unset boot_counter || true"
3 changes: 2 additions & 1 deletion test/suites/fault-tests/outages.robot
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ Network Disconnection
Local Command Should Work ${STRESS_TESTING_SCRIPT} -d network_outage ${STRESS_TESTING_REMOTE_FLAGS}

# Check results
${system_rebooted}= Is System Rebooted ${old_bootid}
${system_rebooted} ${cur_bootid}= Is System Rebooted ${old_bootid}
Should Not Be True ${system_rebooted}
# No reboot is expected here, so the boot identifier must be unchanged
Should Be Equal As Strings ${cur_bootid} ${old_bootid}
Wait For MicroShift
All Pods Should Be Running timeout=600s

Expand Down
36 changes: 18 additions & 18 deletions test/suites/greenboot/greenboot.robot
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,16 @@ Test Tags restart slow
${LOCAL_WORKLOAD_FILE} ../docs/config/busybox_running_check.sh
${GREENBOOT_WORKLOAD_FILE} /etc/greenboot/check/required.d/50_busybox_running_check.sh
${GREENBOOT_CONFIG_FILE} /etc/greenboot/greenboot.conf
${GREENBOOT_CONFIG_CONTENT} MICROSHIFT_WAIT_TIMEOUT_SEC=180
${WAIT_TIMEOUT} 180
${GREENBOOT_CONFIG_CONTENT} MICROSHIFT_WAIT_TIMEOUT_SEC=${WAIT_TIMEOUT}
${MANIFEST_SOURCE_DIR} ./assets/kustomizations/greenboot/
${MANIFEST_DIR} /etc/microshift/manifests
${HOSTNAME_BIN_PATH} ""
${HOSTNAME_BIN_PATH} /usr/bin/hostname


*** Test Cases ***
Run with User Workload
[Documentation] Add user's workload and verify that the workload starts and greenboot is successful.
[Documentation] Add a user workload, verify that it starts and greenboot is successful
Restart Greenboot And Wait For Success
Add User Workload
Cleanup And Start
Expand All @@ -38,11 +39,12 @@ Simulate Service Failure
Restart Greenboot And Wait For Success
Disrupt Service
Cleanup MicroShift --all --keep-images
# not using 'Start MicroShift' kw because it retries
# Not using the 'Start MicroShift' keyword because it retries
Run Keyword And Expect Error 0 != 1
... Systemctl start microshift
# Lower the default wait timeout to fail-fast tests
Run Keyword And Expect Error 0 != 1
... Restart Greenboot And Wait For Success
... Restart Greenboot And Wait For Success ${WAIT_TIMEOUT}s

[Teardown] Run Keywords
... Restore Service
Expand All @@ -53,8 +55,9 @@ Simulate Pod Failure
Restart Greenboot And Wait For Success
Disrupt Pod Network
Restart MicroShift
# Lower the default wait timeout to fail-fast tests
Run Keyword And Expect Error 0 != 1
... Restart Greenboot And Wait For Success
... Restart Greenboot And Wait For Success ${WAIT_TIMEOUT}s

[Teardown] Run Keywords
... Remove Drop In MicroShift Config 10-svcNetwork
Expand All @@ -68,7 +71,8 @@ Setup Suite
Check Required Env Variables
Login MicroShift Host
Setup Kubeconfig
Upload String To File ${GREENBOOT_CONFIG_CONTENT} ${GREENBOOT_CONFIG_FILE} # change the default timeout
# Change the default timeout to fail-fast tests
Upload String To File ${GREENBOOT_CONFIG_CONTENT} ${GREENBOOT_CONFIG_FILE}

Add User Workload
[Documentation] Upload User workload files to the MicroShift host
Expand All @@ -92,7 +96,7 @@ Cleanup User Workload
Should Be Equal As Integers 0 ${rc}

Disrupt Service
[Documentation] Prevent Microshift service from starting correctly.
[Documentation] Prevent MicroShift service from starting correctly

${stdout} ${rc}= Execute Command
... which hostname
Expand All @@ -109,18 +113,14 @@ Disrupt Service
Should Be Equal As Integers 0 ${rc}

Restore Service
[Documentation] Restore Microshift service to the correct form.
[Documentation] Restore MicroShift service to the correct form
${stdout} ${rc}= Execute Command
... chmod 755 ${HOSTNAME_BIN_PATH}
... sudo=True return_rc=True
Should Be Equal As Integers 0 ${rc}

# Reboot to regain ostree deployment (revert usroverlay)
${is_ostree}= Is System OSTree
IF ${is_ostree} Reboot MicroShift Host

Disrupt Pod Network
[Documentation] Prevent Microshift pods From starting correctly
[Documentation] Prevent MicroShift pods from starting correctly
${configuration}= Catenate SEPARATOR=\n
... network:
... \ clusterNetwork:
Expand All @@ -131,8 +131,8 @@ Disrupt Pod Network
Drop In MicroShift Config ${configuration} 10-svcNetwork

Cleanup And Start
[Documentation] Wipe Microshift data and start it.
[Documentation] Wipe MicroShift data and restart the system
Cleanup MicroShift --all --keep-images
Start MicroShift
Setup Kubeconfig
Restart Greenboot And Wait For Success
Enable MicroShift
Reboot MicroShift Host
Wait Until Greenboot Health Check Exited
3 changes: 2 additions & 1 deletion test/suites/tuned/microshift-tuned.robot
Original file line number Diff line number Diff line change
Expand Up @@ -98,5 +98,6 @@ Restart MicroShift-Tuned Not Expecting Reboot
Command Should Work systemctl restart microshift-tuned.service
Wait Until Keyword Succeeds 1m 10s
... Systemctl Check Service SubState microshift-tuned.service dead
${rebooted}= Is System Rebooted ${bootid}
${rebooted} ${cur_bootid}= Is System Rebooted ${bootid}
Should Not Be True ${rebooted}
Should Be Equal As Strings ${cur_bootid} ${bootid}