2 changes: 2 additions & 0 deletions .github/workflows/stackhpc.yml
@@ -154,6 +154,7 @@ jobs:
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook --limit login,control ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
cd "$STACKHPC_TF_DIR"
tofu init
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
@@ -237,6 +238,7 @@ jobs:
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
cd "$STACKHPC_TF_DIR"
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR"
if: ${{ success() || cancelled() }}
4 changes: 3 additions & 1 deletion README.md
@@ -142,7 +142,9 @@ To configure the appliance, ensure the venv and the environment are [activated](
ansible-playbook ansible/site.yml
```

Once it completes you can log in to the cluster using:
To prevent the cluster instances from being modified or destroyed (e.g. by `tofu destroy`), this playbook begins by locking the OpenStack instances. Any subsequent changes to the OpenTofu state therefore require first running the unlock playbook, as described in the adhoc commands section of [docs/operations.md](docs/operations.md).
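
For example, to unlock all cluster instances before making OpenTofu changes:

```shell
ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
```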

Once `site.yml` completes you can log in to the cluster using:

```shell
ssh rocky@$login_ip
27 changes: 27 additions & 0 deletions ansible/adhoc/lock-unlock-instances.yml
@@ -0,0 +1,27 @@
---
# Lock or unlock cluster instances

# Imported by site.yml (to lock all cluster instances) and rebuild-via-slurm.yml (to unlock compute nodes).
# Must be run manually with lock_unlock_action=unlock before rebuild.yml, `tofu destroy`, or other changes to the tofu state.

# Examples:

# ansible-playbook --limit login,control ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"

# ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock" -e "lock_unlock_hosts=compute"
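
# To (re-)lock all cluster instances (lock is the default action, and hosts default to the 'cluster' group):
# ansible-playbook ansible/adhoc/lock-unlock-instances.yml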

# - name: Unlock compute instances
# vars:
# lock_unlock_action: unlock
# lock_unlock_hosts: compute
# ansible.builtin.import_playbook: lock-unlock-instances.yml

- hosts: "{{ lock_unlock_hosts | default('cluster') }}"
gather_facts: false
become: false
tasks:
- name: Lock/Unlock instances
openstack.cloud.server_action:
action: "{{ lock_unlock_action | default('lock') }}"
server: "{{ inventory_hostname }}"
delegate_to: localhost
6 changes: 6 additions & 0 deletions ansible/adhoc/rebuild-via-slurm.yml
@@ -8,6 +8,12 @@

# See docs/slurm-controlled-rebuild.md.

- name: Unlock compute instances for rebuild
vars:
lock_unlock_action: unlock
lock_unlock_hosts: compute
ansible.builtin.import_playbook: lock-unlock-instances.yml

- hosts: login
run_once: true
gather_facts: false
4 changes: 4 additions & 0 deletions ansible/adhoc/rebuild.yml
@@ -5,6 +5,10 @@
# Use --limit to control which hosts to rebuild (either specific hosts or the <cluster_name>_<partition_name> groups defining partitions).
# Optionally, supply `-e rebuild_image=<image_name_or_id>` to define a specific image, otherwise the current image is reused.
#
# After running site.yml all cluster instances are locked, so before running rebuild.yml the unlock playbook must be run:
# ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
# As with this playbook, --limit can be used to control which hosts to unlock, for example:
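# ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock" --limit compute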
#
# NOTE: If a hostvar `instance_id` is defined this is used to select hosts.
# Otherwise the hostname is used and this must be unique, which may not be the case e.g. if using identically-named staging and production hosts.
#
19 changes: 19 additions & 0 deletions ansible/safe-env.yml
@@ -0,0 +1,19 @@
---
- hosts: localhost
gather_facts: false
become: false
tasks:
- name: Confirm continuing if using a protected environment
ansible.builtin.pause:
prompt: |
*************************************
* WARNING: PROTECTED ENVIRONMENT! *
*************************************

Current environment: {{ appliances_environment_name }}
Do you really want to continue (yes/no)?
register: env_confirm_safe
when:
- appliances_environment_name in appliances_protected_environments
- not (appliances_protected_environment_autoapprove | default(false) | bool)
failed_when: not (env_confirm_safe.user_input | bool)
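
A minimal usage sketch: when the active environment is listed in `appliances_protected_environments`, `site.yml` pauses for confirmation; the prompt can be skipped non-interactively (e.g. in CI) via the autoapprove variable checked above:

```shell
# assumes the venv and environment are already activated
ansible-playbook ansible/site.yml -e appliances_protected_environment_autoapprove=true
```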
6 changes: 6 additions & 0 deletions ansible/site.yml
@@ -1,4 +1,10 @@
---

- name: Check for protected environment
  ansible.builtin.import_playbook: safe-env.yml

- name: Lock cluster instances
ansible.builtin.import_playbook: adhoc/lock-unlock-instances.yml

- name: Run pre.yml hook
vars:
# hostvars not available here, so have to recalculate environment root:
2 changes: 1 addition & 1 deletion docs/experimental/compute-init.md
@@ -22,7 +22,7 @@ login and control nodes. The process follows
1. Compute nodes are reimaged:

```shell
ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml
```

2. Ansible-init runs against newly reimaged compute nodes
28 changes: 15 additions & 13 deletions docs/experimental/slurm-controlled-rebuild.md
@@ -12,34 +12,36 @@ In summary, the way this functionality works is as follows:

1. The image references(s) are manually updated in the OpenTofu configuration
in the normal way.
2. `tofu apply` is run which rebuilds the login and control nodes to the new
2. The adhoc playbook `lock-unlock-instances.yml` is run, limited to the control and login
   nodes, with `lock_unlock_action=unlock`, so that those nodes can be rebuilt (the full
   command sequence is sketched after this list).
3. `tofu apply` is run which rebuilds the login and control nodes to the new
image(s). The new image reference for compute nodes is ignored, but is
written into the hosts inventory file (and is therefore available as an
Ansible hostvar).
3. The `site.yml` playbook is run which reconfigures the cluster as normal. At
this point the cluster is functional, but using a new image for the login
and control nodes and the old image for the compute nodes. This playbook
also:
4. The `site.yml` playbook is run, which locks the instances again and reconfigures
the cluster as normal. At this point the cluster is functional, but using a new
image for the login and control nodes and the old image for the compute nodes.
This playbook also:
- Writes cluster configuration to the control node, using the
[compute_init](../../ansible/roles/compute_init/README.md) role.
- Configures an application credential and helper programs on the control
node, using the [rebuild](../../ansible/roles/rebuild/README.md) role.
4. An admin submits Slurm jobs, one for each node, to a special "rebuild"
partition using an Ansible playbook. Because this partition has higher
priority than the partitions normal users can use, these rebuild jobs become
the next job in the queue for every node (although any jobs currently
running will complete as normal).
5. Because these rebuild jobs have the `--reboot` flag set, before launching them
5. An admin submits Slurm jobs, one for each node, to a special "rebuild"
partition using the adhoc playbook `rebuild-via-slurm.yml`. Because this partition
has higher priority than the partitions normal users can use, these rebuild jobs
become the next job in the queue for every node (although any jobs currently running
will complete as normal).
6. Because these rebuild jobs have the `--reboot` flag set, before launching them
the Slurm control node runs a [RebootProgram](https://slurm.schedmd.com/slurm.conf.html#OPT_RebootProgram)
which compares the current image for the node to the one in the cluster
configuration, and if it does not match, uses OpenStack to rebuild the
node to the desired (updated) image.
TODO: Describe the logic if they DO match
6. After a rebuild, the compute node runs various Ansible tasks during boot,
7. After a rebuild, the compute node runs various Ansible tasks during boot,
controlled by the [compute_init](../../ansible/roles/compute_init/README.md)
role, to fully configure the node again. It retrieves the required cluster
configuration information from the control node via an NFS mount.
7. Once the `slurmd` daemon starts on a compute node, the slurm controller
8. Once the `slurmd` daemon starts on a compute node, the slurm controller
registers the node as having finished rebooting. It then launches the actual
job, which does not do anything.
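
The steps above correspond roughly to the following command sequence (a sketch only; working directories, `--limit` patterns and any OpenTofu variable files are site-specific):

```shell
# 1. Unlock the control and login nodes so OpenTofu can rebuild them
ansible-playbook --limit login,control ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
# 2. Apply the updated image references (rebuilds login/control; compute image written to inventory)
tofu apply
# 3. Reconfigure the cluster; this also re-locks the instances
ansible-playbook ansible/site.yml
# 4. Submit the Slurm-controlled rebuild jobs for the compute nodes
ansible-playbook ansible/adhoc/rebuild-via-slurm.yml
```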

2 changes: 2 additions & 0 deletions docs/operations.md
@@ -212,7 +212,9 @@ ansible-playbook ansible/adhoc/$PLAYBOOK
Currently they include the following (see each playbook for links to documentation):

- `hpctests.yml`: MPI-based cluster tests for latency, bandwidth and floating point performance.
- `lock-unlock-instances.yml`: Lock cluster instances to prevent OpenTofu changes to them, or unlock them to allow such changes (see the example after this list).
- `rebuild.yml`: Rebuild nodes with existing or new images (NB: this is intended for development not for re-imaging nodes on an in-production cluster).
  Requires `lock-unlock-instances.yml` to be run first with `lock_unlock_action=unlock`.
- `restart-slurm.yml`: Restart all Slurm daemons in the correct order.
- `update-packages.yml`: Update specified packages on cluster nodes (NB: not recommended for routine use).
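
For example, to unlock the cluster instances before destroying or otherwise changing infrastructure with OpenTofu, and to re-lock them afterwards:

```shell
# unlock (add --limit to restrict to particular hosts or groups)
ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
# lock again (lock is the default action)
ansible-playbook ansible/adhoc/lock-unlock-instances.yml
```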

1 change: 1 addition & 0 deletions docs/sequence.md
@@ -100,6 +100,7 @@ sequenceDiagram
participant cloud as Cloud
participant nodes as Cluster Instances
note over ansible: Update OpenTofu cluster_image variable [1]
ansible->>cloud: Unlock control and login nodes
rect rgb(204, 232, 250)
note over ansible: $ tofu apply ....
ansible<<->>cloud: Check login/compute current vs desired images
2 changes: 2 additions & 0 deletions environments/common/inventory/group_vars/all/defaults.yml
@@ -4,6 +4,8 @@ ansible_user: rocky
appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}"
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only
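# environments listed in appliances_protected_environments require interactive confirmation before site.yml runs against them (see ansible/safe-env.yml), unless appliances_protected_environment_autoapprove is true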
appliances_protected_environments:
- production
appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it
# appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform
appliances_mode: configure