diff --git a/scripts/multinode/configure-node.sh b/scripts/multinode/configure-node.sh index 7aef43ebf6..e55d133be4 100755 --- a/scripts/multinode/configure-node.sh +++ b/scripts/multinode/configure-node.sh @@ -97,14 +97,8 @@ function copy_bootstrap_kubeconfig() { } function run_healthcheck() { - if ! sudo systemctl start greenboot-healthcheck; then - echo "Error: Failed to start greenboot-healthcheck service" - exit 1 - fi - - greenboot_status=$(systemctl show -p Result --value greenboot-healthcheck) - if [ "${greenboot_status}" != "success" ]; then - echo "Error: greenboot-healthcheck did not complete successfully (Result: ${greenboot_status})" + if ! sudo microshift healthcheck -v=2 --timeout=600s; then + echo "Error: Failed to run the 'microshift healthcheck' command" exit 1 fi } diff --git a/test/assets/auto-recovery/microshift-auto-recovery b/test/assets/auto-recovery/microshift-auto-recovery index b0c2b84eba..2301d187a5 100644 --- a/test/assets/auto-recovery/microshift-auto-recovery +++ b/test/assets/auto-recovery/microshift-auto-recovery @@ -5,22 +5,11 @@ set -xeuo pipefail # boot_success=0 is set when deployment is staged or when grub boots the system. # boot_success=1 is set when greenboot succeeds after deploying new image. -# At this time we cannot depend on missing boot_counter meaning system is done with the "deployment testing & rebooting" -# because of a bug in greenboot: set-success tries to clear boot_counter from wrong grub env file. -if grep -q "/boot/grubenv" /usr/libexec/greenboot/greenboot-grub2-set-success; then - if grub2-editenv - list | grep -q ^boot_success=0; then - echo "Greenboot didn't decide the system is healthy after staging new deployment." - echo "Quiting to not interfere with the process" - exit 0 - fi -else - # greenboot-grub2-set-success uses correct path. - # When the deployment testing is done, boot_counter should be removed. 
- if grub2-editenv - list | grep -q ^boot_counter=; then - echo "Greenboot didn't decide the system is healthy after staging new deployment." - echo "Quiting to not interfere with the process" - exit 0 - fi +# When the deployment testing is done, boot_counter should be removed. +if grub2-editenv - list | grep -q ^boot_counter=; then + echo "Greenboot didn't decide the system is healthy after staging new deployment." + echo "Quiting to not interfere with the process" + exit 0 fi /usr/bin/microshift restore --auto-recovery /var/lib/microshift-auto-recovery diff --git a/test/assets/auto-recovery/red-script.sh b/test/assets/auto-recovery/red-script.sh index 26d7a8e28b..28d4c5466c 100644 --- a/test/assets/auto-recovery/red-script.sh +++ b/test/assets/auto-recovery/red-script.sh @@ -5,22 +5,11 @@ set -xeuo pipefail # boot_success=0 is set when deployment is staged or when grub boots the system. # boot_success=1 is set when greenboot succeeds after deploying new image. -# At this time we cannot depend on missing boot_counter meaning system is done with the "deployment testing & rebooting" -# because of a bug in greenboot: set-success tries to clear boot_counter from wrong grub env file. -if grep -q "/boot/grubenv" /usr/libexec/greenboot/greenboot-grub2-set-success; then - if grub2-editenv - list | grep -q ^boot_success=0; then - echo "Greenboot didn't decide the system is healthy after staging new deployment." - echo "Quiting to not interfere with the process" - exit 0 - fi -else - # greenboot-grub2-set-success uses correct path. - # When the deployment testing is done, boot_counter should be removed. - if grub2-editenv - list | grep -q ^boot_counter=; then - echo "Greenboot didn't decide the system is healthy after staging new deployment." - echo "Quiting to not interfere with the process" - exit 0 - fi +# When the deployment testing is done, boot_counter should be removed. 
+if grub2-editenv - list | grep -q ^boot_counter=; then + echo "Greenboot didn't decide the system is healthy after staging new deployment." + echo "Quiting to not interfere with the process" + exit 0 fi echo "System is unhealthy and greenboot's 'deployment testing' procedure is not active - running auto-recovery for MicroShift" diff --git a/test/resources/libostree.py b/test/resources/libostree.py index e777d0cbf7..0cfb5bbe50 100644 --- a/test/resources/libostree.py +++ b/test/resources/libostree.py @@ -138,6 +138,14 @@ def get_current_boot_id() -> str: return boot_id.replace("-", "") +def get_last_reboots_count() -> int: + """ + Get number of system reboots using 'last reboot' command + """ + stdout = remote_sudo("last reboot | grep -c '^reboot'") + return int(stdout) + + def does_backup_exist(deploy_id: str, boot_id: str = "") -> bool: prefix = get_deployment_backup_prefix_path(deploy_id) diff --git a/test/resources/microshift-host.resource b/test/resources/microshift-host.resource index a61b5fa3e4..c81ee956a6 100644 --- a/test/resources/microshift-host.resource +++ b/test/resources/microshift-host.resource @@ -48,18 +48,29 @@ SSH Connection To MicroShift Host Should Be Functional Should Be Equal As Integers 0 ${rc} Reboot MicroShift Host - [Documentation] Reboot the MicroShift host and wait until - ... SSH connection is working again and boot identifier changes + [Documentation] Reboot the MicroShift host, waiting until SSH connection + ... is working again and boot identifier changes. Exactly one reboot + ... should have happened. Returns the new boot identifier. ... ... Expects that initial SSH connection to MicroShift host is active. 
+ # Save the initial boot identifier and number of reboots ${bootid}= Get Current Boot Id + ${reboots}= Get Last Reboots Count + + # Reboot the system SSHLibrary.Start Command reboot sudo=True - Sleep 30s - Wait Until Keyword Succeeds 5m 15s + # Wait until the system reboots and the boot identifier changes + ${cur_bootid}= Wait Until Keyword Succeeds 5m 5s ... System Should Be Rebooted ${bootid} + # Verify that there was exactly one reboot + ${cur_reboots}= Get Last Reboots Count + Should Be Equal As Integers ${cur_reboots} ${reboots + 1} + + RETURN ${cur_bootid} + Create Thin Storage Pool [Documentation] Create a new thin storage pool ${lvmd_vg}= Set Variable If '${LVMD_VG_OVERRIDE}' != '' ${LVMD_VG_OVERRIDE} rhel @@ -107,9 +118,11 @@ System Should Not Be Ostree System Should Be Rebooted [Documentation] Assert if the system rebooted comparing the current and provided boot identifier [Arguments] ${old_bootid} - ${rebooted}= Is System Rebooted ${old_bootid} + ${rebooted} ${cur_bootid}= Is System Rebooted ${old_bootid} Should Be True ${rebooted} + RETURN ${cur_bootid} + Is System Rebooted [Documentation] Check if the system rebooted comparing the current and provided boot identifier [Arguments] ${old_bootid} @@ -118,10 +131,10 @@ Is System Rebooted ${cur_bootid}= Get Current Boot Id ${len}= Get Length ${cur_bootid} IF ${len} == 0 - RETURN False + RETURN False ${old_bootid} ELSE ${system_rebooted}= Evaluate '${old_bootid}' != '${cur_bootid}' - RETURN ${system_rebooted} + RETURN ${system_rebooted} ${cur_bootid} END Change Hostname diff --git a/test/resources/ostree-health.resource b/test/resources/ostree-health.resource index 5d431bc7b9..1b82249ad6 100644 --- a/test/resources/ostree-health.resource +++ b/test/resources/ostree-health.resource @@ -19,18 +19,14 @@ Greenboot Health Check Exited Systemctl Check Service SubState greenboot-healthcheck.service exited Restart Greenboot And Wait For Success - [Documentation] Restart the greenboot-healthcheck service and check 
its status + [Documentation] Verify the health of the deployments using the MicroShift healthcheck command + [Arguments] ${wait_timeout}=600s - VAR ${unit_name} greenboot-healthcheck.service - - # Note that the Systemctl keyword from systemd.resource cannot be used to - # restart the greenboot-healthcheck service due to the keyword expecting - # the 'running' state after the restart. This condition does not apply on - # services like greenboot that exit after their startup finishes. + # Note that the newer implementation of greenboot-healthcheck service no + # longer allows service restart. Using the MicroShift healthcheck command + # to verify the health of the deployments. ${stdout} ${stderr} ${rc} Execute Command - ... systemctl restart ${unit_name} + ... microshift healthcheck -v=2 --timeout="${wait_timeout}" ... sudo=True return_stdout=True return_stderr=True return_rc=True - IF ${rc} != 0 Systemctl Print Service Status And Logs ${unit_name} + IF ${rc} != 0 Log Many ${stdout} ${stderr} Should Be Equal As Integers 0 ${rc} - - Wait Until Greenboot Health Check Exited diff --git a/test/scenarios-bootc/upstream/cos10-src@greenboot.sh.disabled b/test/scenarios-bootc/upstream/cos10-src@greenboot.sh similarity index 100% rename from test/scenarios-bootc/upstream/cos10-src@greenboot.sh.disabled rename to test/scenarios-bootc/upstream/cos10-src@greenboot.sh diff --git a/test/scenarios-bootc/upstream/cos9-src@greenboot.sh.disabled b/test/scenarios-bootc/upstream/cos9-src@greenboot.sh similarity index 100% rename from test/scenarios-bootc/upstream/cos9-src@greenboot.sh.disabled rename to test/scenarios-bootc/upstream/cos9-src@greenboot.sh diff --git a/test/suites/backup/auto-recovery-extra.robot b/test/suites/backup/auto-recovery-extra.robot index c77c2622cc..22e7e3e1c9 100644 --- a/test/suites/backup/auto-recovery-extra.robot +++ b/test/suites/backup/auto-recovery-extra.robot @@ -44,7 +44,6 @@ Using Systemd Dropin To React On Failure Of MicroShift Auto Recovery On Red 
Boot [Documentation] Verify greenboot integration to start auto-recovery procedure. - Greenboot Workaround For Boot Counter Command Should Work rm -rf ${SAFETY_BACKUP} Stop MicroShift @@ -55,10 +54,13 @@ Auto Recovery On Red Boot Corrupt Etcd Database Start MicroShift Expecting Failure - ${bootid}= Get Current Boot Id - Command Should Fail systemctl restart greenboot-healthcheck - Wait Until Keyword Succeeds 5m 15s - ... System Should Be Rebooted ${bootid} + # Reboot to trigger greenboot auto-recovery, saving the boot identifier + ${cur_bootid}= Reboot MicroShift Host + # The auto-recovery procedure will reboot the system after restoring the backup + ${new_bootid}= Wait Until Keyword Succeeds 5m 15s + ... System Should Be Rebooted ${cur_bootid} + Should Not Be Equal As Strings ${cur_bootid} ${new_bootid} + # Verify the system is healthy after auto-recovery Wait Until Greenboot Health Check Exited [Teardown] Run Keywords @@ -157,9 +159,3 @@ Set Up Greenboot Red Script Command Should Work mkdir -p /etc/greenboot/red.d ${drop_in}= Operating System.Get File ./assets/auto-recovery/red-script.sh Upload String To File ${drop_in} /etc/greenboot/red.d/99-auto-recovery.sh - -Greenboot Workaround For Boot Counter - [Documentation] If the grub's boot_success is 1, clears boot_counter. - # Because of greenboot's bug, we need to it here, so the system doesn't reboot after red boot. - Command Should Work - ... 
bash -c "grub2-editenv list | grep -q boot_success=1 && /usr/bin/grub2-editenv /boot/grub2/grubenv unset boot_counter || true" diff --git a/test/suites/fault-tests/outages.robot b/test/suites/fault-tests/outages.robot index 0961d76868..c7b9cb5994 100644 --- a/test/suites/fault-tests/outages.robot +++ b/test/suites/fault-tests/outages.robot @@ -39,8 +39,9 @@ Network Disconnection Local Command Should Work ${STRESS_TESTING_SCRIPT} -d network_outage ${STRESS_TESTING_REMOTE_FLAGS} # Check results - ${system_rebooted}= Is System Rebooted ${old_bootid} + ${system_rebooted} ${cur_bootid}= Is System Rebooted ${old_bootid} Should Not Be True ${system_rebooted} + Should Be Equal As Strings ${cur_bootid} ${old_bootid} Wait For MicroShift All Pods Should Be Running timeout=600s diff --git a/test/suites/greenboot/greenboot.robot b/test/suites/greenboot/greenboot.robot index d0fff9c0ed..da0b575d19 100644 --- a/test/suites/greenboot/greenboot.robot +++ b/test/suites/greenboot/greenboot.robot @@ -16,15 +16,16 @@ Test Tags restart slow ${LOCAL_WORKLOAD_FILE} ../docs/config/busybox_running_check.sh ${GREENBOOT_WORKLOAD_FILE} /etc/greenboot/check/required.d/50_busybox_running_check.sh ${GREENBOOT_CONFIG_FILE} /etc/greenboot/greenboot.conf -${GREENBOOT_CONFIG_CONTENT} MICROSHIFT_WAIT_TIMEOUT_SEC=180 +${WAIT_TIMEOUT} 180 +${GREENBOOT_CONFIG_CONTENT} MICROSHIFT_WAIT_TIMEOUT_SEC=${WAIT_TIMEOUT} ${MANIFEST_SOURCE_DIR} ./assets/kustomizations/greenboot/ ${MANIFEST_DIR} /etc/microshift/manifests -${HOSTNAME_BIN_PATH} "" +${HOSTNAME_BIN_PATH} /usr/bin/hostname *** Test Cases *** Run with User Workload - [Documentation] Add user's workload and verify that the workload starts and greenboot is successful.
+ [Documentation] Add a user workload, verify that it starts and greenboot is successful Restart Greenboot And Wait For Success Add User Workload Cleanup And Start @@ -38,11 +39,12 @@ Simulate Service Failure Restart Greenboot And Wait For Success Disrupt Service Cleanup MicroShift --all --keep-images - # not using 'Start MicroShift' kw because it retries + # Not using the 'Start MicroShift' keyword because it retries Run Keyword And Expect Error 0 != 1 ... Systemctl start microshift + # Lower the default wait timeout to fail-fast tests Run Keyword And Expect Error 0 != 1 - ... Restart Greenboot And Wait For Success + ... Restart Greenboot And Wait For Success ${WAIT_TIMEOUT}s [Teardown] Run Keywords ... Restore Service @@ -53,8 +55,9 @@ Simulate Pod Failure Restart Greenboot And Wait For Success Disrupt Pod Network Restart MicroShift + # Lower the default wait timeout to fail-fast tests Run Keyword And Expect Error 0 != 1 - ... Restart Greenboot And Wait For Success + ... Restart Greenboot And Wait For Success ${WAIT_TIMEOUT}s [Teardown] Run Keywords ... Remove Drop In MicroShift Config 10-svcNetwork @@ -68,7 +71,8 @@ Setup Suite Check Required Env Variables Login MicroShift Host Setup Kubeconfig - Upload String To File ${GREENBOOT_CONFIG_CONTENT} ${GREENBOOT_CONFIG_FILE} # change the default timeout + # Change the default timeout to fail-fast tests + Upload String To File ${GREENBOOT_CONFIG_CONTENT} ${GREENBOOT_CONFIG_FILE} Add User Workload [Documentation] Upload User workload files to the MicroShift host @@ -92,7 +96,7 @@ Cleanup User Workload Should Be Equal As Integers 0 ${rc} Disrupt Service - [Documentation] Prevent Microshift service from starting correctly. + [Documentation] Prevent MicroShift service from starting correctly ${stdout} ${rc}= Execute Command ... which hostname @@ -109,18 +113,14 @@ Disrupt Service Should Be Equal As Integers 0 ${rc} Restore Service - [Documentation] Restore Microshift service to the correct form. 
+ [Documentation] Restore MicroShift service to the correct form ${stdout} ${rc}= Execute Command ... chmod 755 ${HOSTNAME_BIN_PATH} ... sudo=True return_rc=True Should Be Equal As Integers 0 ${rc} - # Reboot to regain ostree deployment (revert usroverlay) - ${is_ostree}= Is System OSTree - IF ${is_ostree} Reboot MicroShift Host - Disrupt Pod Network - [Documentation] Prevent Microshift pods From starting correctly + [Documentation] Prevent MicroShift pods from starting correctly ${configuration}= Catenate SEPARATOR=\n ... network: ... \ clusterNetwork: @@ -131,8 +131,8 @@ Disrupt Pod Network Drop In MicroShift Config ${configuration} 10-svcNetwork Cleanup And Start - [Documentation] Wipe Microshift data and start it. + [Documentation] Wipe MicroShift data and restart the system Cleanup MicroShift --all --keep-images - Start MicroShift - Setup Kubeconfig - Restart Greenboot And Wait For Success + Enable MicroShift + Reboot MicroShift Host + Wait Until Greenboot Health Check Exited diff --git a/test/suites/tuned/microshift-tuned.robot b/test/suites/tuned/microshift-tuned.robot index dd975fb8c5..c2cc1e05bc 100644 --- a/test/suites/tuned/microshift-tuned.robot +++ b/test/suites/tuned/microshift-tuned.robot @@ -98,5 +98,6 @@ Restart MicroShift-Tuned Not Expecting Reboot Command Should Work systemctl restart microshift-tuned.service Wait Until Keyword Succeeds 1m 10s ... Systemctl Check Service SubState microshift-tuned.service dead - ${rebooted}= Is System Rebooted ${bootid} + ${rebooted} ${cur_bootid}= Is System Rebooted ${bootid} Should Not Be True ${rebooted} + Should Be Equal As Strings ${cur_bootid} ${bootid}