diff --git a/.github/workflows/rolling-update.yml b/.github/workflows/rolling-update.yml new file mode 100644 index 00000000..626044a8 --- /dev/null +++ b/.github/workflows/rolling-update.yml @@ -0,0 +1,256 @@ +name: Rolling update + +# Manually-triggered production rollout. Joins the Tailnet, SSHes over +# MagicDNS into each node, and invokes scripts/rolling-update.sh. +# See docs/design/2026_04_24_proposed_deploy_via_tailscale.md. + +on: + workflow_dispatch: + inputs: + ref: + description: Git ref (tag or sha) to deploy. Also used as the image tag unless image_tag is set. + required: true + type: string + image_tag: + description: Override the image tag (default = ref). Used for rollbacks. + required: false + type: string + default: "" + nodes: + description: Comma-separated raft IDs to roll (e.g. "n1,n2"). Empty = all nodes in NODES_RAFT_MAP. + required: false + type: string + default: "" + dry_run: + description: Render the plan and run a reachability check only; do NOT touch containers. + required: true + type: boolean + default: true + +permissions: + contents: read + id-token: write # required by tailscale/github-action OIDC flow + packages: read # required by `docker manifest inspect` on ghcr.io private images + +concurrency: + group: rolling-update + cancel-in-progress: false + +jobs: + deploy: + runs-on: ubuntu-latest + # Approval gate — see GitHub environment settings for required reviewers. + # Dry-runs also use this environment so the secret wiring is identical; + # the environment's approval rule should be configured to auto-approve + # dry-runs if that distinction is desired (GitHub UI: "Deployment + # protection rules"). + environment: production + timeout-minutes: 60 + + steps: + # The deploy script (scripts/rolling-update.sh) is executed from the + # checkout below, after the tailnet join and SSH key load. 
If `ref` + # were unvalidated, anyone with workflow_dispatch permission could + # point it at a fork commit containing a modified script that + # harvests the SSH key / Tailscale OAuth secret. Validate that + # `ref` resolves to (a) the repository's default branch, or (b) a + # tag on the repo, before we hand it any secret. Branches other + # than the default are rejected so review-gated default is the only + # entry point besides immutable tags. + - name: Validate ref is default branch or a tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REF: ${{ inputs.ref }} + run: | + set -euo pipefail + default_branch=$(gh api "repos/${{ github.repository }}" --jq '.default_branch') + default_sha=$(gh api "repos/${{ github.repository }}/commits/$default_branch" --jq '.sha') + if [[ "$REF" == "$default_branch" || "$REF" == "$default_sha" ]]; then + echo "ref is the default branch ($default_branch / $default_sha)" + exit 0 + fi + if gh api "repos/${{ github.repository }}/git/refs/tags/$REF" >/dev/null 2>&1; then + echo "ref is a tag" + exit 0 + fi + # Reject a sha that is currently the tip of any ref: the default + # branch's tip and tags were already accepted above, so a match here + # means a non-default branch head. Any other sha is provisionally + # accepted; the checkout step fails if it is unreachable. + if git -c "http.https://github.com/.extraheader=" ls-remote "https://github.com/${{ github.repository }}.git" | grep -q "^$REF"; then + echo "::error::ref '$REF' is not the default branch or a tag. Branches other than '$default_branch' are disallowed to prevent arbitrary-code execution with production secrets." + exit 1 + fi + echo "ref '$REF' treated as a sha; checkout will fail if it is not reachable." 
+ + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ inputs.ref }} + persist-credentials: false + + - name: Install jq + run: sudo apt-get install -y --no-install-recommends jq + + - name: Verify image exists on ghcr.io + env: + IMAGE_BASE: ${{ vars.IMAGE_BASE }} + IMAGE_TAG: ${{ inputs.image_tag || inputs.ref }} + GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + if [[ -z "$IMAGE_BASE" ]]; then + echo "::error::IMAGE_BASE repository variable is not set" + exit 1 + fi + echo "Checking $IMAGE_BASE:$IMAGE_TAG" + echo "$GHCR_TOKEN" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin >/dev/null + if ! docker manifest inspect "$IMAGE_BASE:$IMAGE_TAG" >/dev/null; then + echo "::error::image $IMAGE_BASE:$IMAGE_TAG not found on ghcr.io" + exit 1 + fi + + - name: Join Tailnet (ephemeral) + uses: tailscale/github-action@v3 + with: + oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }} + oauth-secret: ${{ secrets.TS_OAUTH_SECRET }} + tags: tag:ci-deploy + + - name: Configure SSH + env: + SSH_KEY: ${{ secrets.DEPLOY_SSH_PRIVATE_KEY }} + KNOWN_HOSTS: ${{ secrets.DEPLOY_KNOWN_HOSTS }} + run: | + set -euo pipefail + mkdir -p ~/.ssh + chmod 700 ~/.ssh + printf '%s\n' "$SSH_KEY" > ~/.ssh/id_ed25519 + chmod 600 ~/.ssh/id_ed25519 + printf '%s\n' "$KNOWN_HOSTS" > ~/.ssh/known_hosts + chmod 644 ~/.ssh/known_hosts + # Sanity: no stray CRLF in the key, no empty file. 
+ test -s ~/.ssh/id_ed25519 || { echo "::error::DEPLOY_SSH_PRIVATE_KEY is empty"; exit 1; } + ssh-keygen -lf ~/.ssh/id_ed25519 >/dev/null + + - name: Render NODES and SSH_TARGETS + id: render + env: + NODES_RAFT_MAP: ${{ vars.NODES_RAFT_MAP }} + SSH_TARGETS_MAP: ${{ vars.SSH_TARGETS_MAP }} + NODES_FILTER: ${{ inputs.nodes }} + run: | + set -euo pipefail + if [[ -z "$NODES_RAFT_MAP" || -z "$SSH_TARGETS_MAP" ]]; then + echo "::error::NODES_RAFT_MAP or SSH_TARGETS_MAP is not set in the production environment variables" + exit 1 + fi + if [[ -n "$NODES_FILTER" ]]; then + # Filter NODES_RAFT_MAP and SSH_TARGETS_MAP to the requested subset. + # Reject any filter ID that does not appear in the map: silently + # dropping unknown IDs would let a typo like "n1,n9" proceed as + # a one-node rollout of n1 alone, which is a staged-deploy + # footgun. + IFS=',' read -r -a wanted <<< "$NODES_FILTER" + IFS=',' read -r -a entries <<< "$NODES_RAFT_MAP" + declare -a known_ids=() + for e in "${entries[@]}"; do + known_ids+=("${e%%=*}") + done + unknown="" + for w in "${wanted[@]}"; do + found=0 + for k in "${known_ids[@]}"; do + if [[ "$k" == "$w" ]]; then found=1; break; fi + done + if [[ $found -eq 0 ]]; then unknown+="${unknown:+, }$w"; fi + done + if [[ -n "$unknown" ]]; then + echo "::error::nodes filter '$NODES_FILTER' references unknown raft IDs: $unknown. 
Known IDs: ${known_ids[*]}" + exit 1 + fi + filter_csv() { + local all="$1" + local filter="$2" + local out="" + IFS=',' read -r -a list_entries <<< "$all" + IFS=',' read -r -a list_wanted <<< "$filter" + for e in "${list_entries[@]}"; do + key="${e%%=*}" + for w in "${list_wanted[@]}"; do + if [[ "$key" == "$w" ]]; then + out+="${e}," + break + fi + done + done + echo "${out%,}" + } + NODES_RAFT_MAP="$(filter_csv "$NODES_RAFT_MAP" "$NODES_FILTER")" + SSH_TARGETS_MAP="$(filter_csv "$SSH_TARGETS_MAP" "$NODES_FILTER")" + if [[ -z "$NODES_RAFT_MAP" ]]; then + echo "::error::nodes filter '$NODES_FILTER' matches nothing in NODES_RAFT_MAP" + exit 1 + fi + fi + { + echo "NODES=$NODES_RAFT_MAP" + echo "SSH_TARGETS=$SSH_TARGETS_MAP" + } >> "$GITHUB_OUTPUT" + echo "::group::Deploy plan" + echo "NODES=$NODES_RAFT_MAP" + echo "SSH_TARGETS=$SSH_TARGETS_MAP" + echo "::endgroup::" + + - name: Tailscale reachability check + env: + SSH_TARGETS: ${{ steps.render.outputs.SSH_TARGETS }} + run: | + set -euo pipefail + IFS=',' read -r -a entries <<< "$SSH_TARGETS" + failed=0 + for e in "${entries[@]}"; do + host="${e##*=}" + host="${host%%:*}" + # strip user@ if present + host="${host##*@}" + if tailscale ping --c 2 --timeout 3s "$host" >/dev/null 2>&1; then + echo " ok $host" + else + echo "::error::$host not reachable over tailnet" + failed=1 + fi + done + if [[ "$failed" -ne 0 ]]; then + exit 1 + fi + + - name: Dry-run summary + if: ${{ inputs.dry_run }} + env: + NODES: ${{ steps.render.outputs.NODES }} + SSH_TARGETS: ${{ steps.render.outputs.SSH_TARGETS }} + IMAGE_BASE: ${{ vars.IMAGE_BASE }} + IMAGE_TAG: ${{ inputs.image_tag || inputs.ref }} + SSH_USER: ${{ vars.SSH_USER }} + run: | + set -euo pipefail + cat <.ts.net +``` + +## 2. 
Tailscale ACL + +In the Tailscale admin console, add the deploy rule to the tailnet ACL: + +```jsonc +"tagOwners": { + "tag:ci-deploy": ["autogroup:admin"], + "tag:elastickv-node": ["autogroup:admin"], +}, +"acls": [ + { + "action": "accept", + "src": ["tag:ci-deploy"], + "dst": ["tag:elastickv-node:22"], + }, +], +``` + +`tag:ci-deploy` must NOT have access to any other port on the tailnet. The +deploy workflow only needs SSH. + +## 3. Tailscale OAuth client + +Admin console → Settings → OAuth clients → New client: + +- Description: `elastickv GitHub Actions deploy` +- Scopes: `auth_keys` (write). Recent `tailscale/github-action` versions + may additionally require `devices:write` (to register and clean up + the ephemeral node); enable that if the join step fails with an + authorization error. The action's README is the definitive source + for current scope requirements. `devices:core` is NOT a valid + Tailscale OAuth scope — earlier drafts of this runbook named it and + would have produced an auth failure. +- Tags: `tag:ci-deploy` + +Copy the client ID and secret; they go into GitHub in the next step. + +## 4. GitHub environment: `production` + +Repo → Settings → Environments → New environment: `production`. + +### Required reviewers +Configure "Required reviewers" on the environment. **Every run that targets +this environment pauses for approval** — including dry-runs, because +GitHub's native environment-protection rules cannot be made conditional on +workflow inputs. Three ways to handle the dry-run-approval friction: + +1. **Accept the prompt for dry-runs too.** A dry-run requires one approver + click before it proceeds; still cheap and keeps the policy simple. +2. **Add a second environment `production-dry-run` without required + reviewers** and change the workflow to pick the environment via + `environment: ${{ inputs.dry_run && 'production-dry-run' || 'production' }}`. + Cleanest but doubles the secrets/vars you must keep in sync. +3. 
**Install a deployment-protection-rule GitHub App** (custom or + marketplace) that approves runs whose inputs show `dry_run == true`. + Most flexible; most setup. + +v1 ships with approach 1 (single environment, prompt on every run). +Approach 2 is the recommended upgrade once the friction becomes annoying. + +### Environment secrets + +| Name | Value | +|------|-------| +| `TS_OAUTH_CLIENT_ID` | Tailscale OAuth client ID from step 3 | +| `TS_OAUTH_SECRET` | Tailscale OAuth secret from step 3 | +| `DEPLOY_SSH_PRIVATE_KEY` | OpenSSH private key, authorized on every node under the deploy user | +| `DEPLOY_KNOWN_HOSTS` | `ssh-keyscan -H kv01.<tailnet>.ts.net kv02.<tailnet>.ts.net …` output. Use `-H` to hash hostnames so the secret's contents don't leak the tailnet topology if the runner environment is compromised. | + +The SSH key should be ed25519, dedicated to CI (not a reused developer key). +Regenerate on operator rotation. + +### Environment variables + +| Name | Value | Example | +|------|-------|---------| +| `IMAGE_BASE` | Container image path (no tag) | `ghcr.io/bootjp/elastickv` | +| `SSH_USER` | SSH login on every node | `bootjp` | +| `NODES_RAFT_MAP` | Comma-separated `raftId=host` (no port — the script appends `RAFT_PORT`). Use full MagicDNS FQDNs so every node can resolve the advertised address regardless of local DNS search domains. The workflow renders this into the script's `NODES` env var. | `n1=kv01.<tailnet>.ts.net,n2=kv02.<tailnet>.ts.net,n3=kv03.<tailnet>.ts.net,n4=kv04.<tailnet>.ts.net,n5=kv05.<tailnet>.ts.net` | +| `SSH_TARGETS_MAP` | Comma-separated `raftId=ssh-host`. The workflow renders this into the script's `SSH_TARGETS` env var. Usually identical to `NODES_RAFT_MAP` unless SSH access uses a different hostname. | `n1=kv01.<tailnet>.ts.net,n2=kv02.<tailnet>.ts.net,...` | + +**Why two names?** The workflow uses `NODES_RAFT_MAP` / `SSH_TARGETS_MAP` +in the `production` environment to keep the GitHub-side names +distinct from the script-side env var names it hands to +`rolling-update.sh`. 
If you run the script by hand from a workstation +you must export `NODES` and `SSH_TARGETS` directly — the workflow-side +names are only understood by the workflow's render step. + +## 5. Running a deploy + +Actions tab → "Rolling update" → Run workflow. + +Inputs: + +- `ref` — the git tag or sha to deploy (also used as the container image tag) +- `image_tag` — override only for rollbacks (e.g., deploy tag `v1.2.3` of a + commit that was also `v1.2.3`) +- `nodes` — subset of raft IDs, e.g., `n1,n2`. Empty rolls all nodes. +- `dry_run` — default `true`. Renders the plan and checks reachability without + touching containers. + +Recommended first-run sequence: + +1. `dry_run: true`, `nodes: n1`, `ref: <sha>` — confirms tailnet join, + SSH config, image availability, target mapping. No production impact. +2. `dry_run: false`, `nodes: n1` — roll a single node, verify the cluster + stays healthy and the image is correct. +3. `dry_run: false`, `nodes:` (empty) — full roll. + +## 6. Rollback + +Re-run the workflow with `image_tag` set to the previous-known-good sha. The +`nodes` input can target specific nodes if only some carry the bad image. + +### If a running workflow is cancelled mid-rollout + +GitHub cancelling the job between node steps is the one operational +hazard that needs manual cleanup. + +1. **Look at the last log line from the `Roll cluster` step.** The + script emits `==> [<node>@<host>] start` at the beginning of + each per-node recreate (see `scripts/rolling-update.sh:398`). + Whichever `<node>` appears in the last such line is the one in + flight when the cancel signal landed. +2. **SSH into that node** over Tailscale and run `docker ps`. If the + container is absent or `Exited`, finish the recreate by hand. The + `docker run` invocation itself is redirected to `/dev/null` by the + script, so the workflow log does NOT contain the full argv. 
To + reconstruct it, read the `Roll cluster` step's rendered + environment — the workflow exports `IMAGE`, `DATA_DIR`, + `RAFT_PORT`, `REDIS_PORT`, `S3_PORT`, `ENABLE_S3`, `NODES`, + `SSH_TARGETS`, and the merged `EXTRA_ENV` before invoking the + script. Anything not explicitly set (e.g., `RAFT_PORT` in a + minimally-overridden deploy) falls back to the script's default + (`RAFT_PORT=50051`, `REDIS_PORT=6379`, `S3_PORT=9000`, + `ENABLE_S3=true`). GOMEMLIMIT / CONTAINER_MEMORY_LIMIT (PR #617) + are propagated via `EXTRA_ENV` once that PR lands. Together the + rendered env + the node's `deploy.env` is enough to reconstruct + the same `docker run` you would see if you re-ran with the same + inputs. +3. **Confirm the new leader via `raftadmin` or metrics** before re-running + the workflow with `nodes:` scoped to the remaining untouched IDs. Do + NOT re-run the full rollout if the partial one is still in flight — + it will stop the same node you are trying to recover. +4. **File a ticket** with the log excerpt so we can eventually teach the + workflow to set a start-marker on each node and fast-skip completed + nodes on re-run. + +The script is idempotent. `scripts/rolling-update.sh:794-798` skips a +node when its running image id equals the target image and its gRPC +endpoint is healthy — an already-rolled node is a no-op, not a +redundant stop/recreate. Re-running the workflow with the same +`ref` after confirming the interrupted node is healthy is therefore +safe: nodes that already match the target image are passed over, +and only the still-stale one gets recreated. + +## 7. What the workflow does NOT do (yet) + +- **No post-deploy health verification beyond tailnet reachability.** The + script itself blocks on `raftadmin` leadership transfer and health-gate + timeouts, but the workflow does not independently probe Prometheus or + Redis after the roll. Add this when we have a canonical post-deploy + assertion suite. 
+- **No auto-rollback on failure.** If the script exits non-zero mid-roll, + the cluster is left in whatever state the script reached. The operator + must inspect and either re-roll or roll back manually. +- **No Jepsen gate.** The deploy does not require a green Jepsen run on + `ref` before proceeding. +- **No image-signature check.** `cosign verify` on the image is a follow-up. + +## 8. Troubleshooting + +### Job pauses indefinitely at "Waiting for approval" +Expected for **every** run in v1 — `.github/workflows/rolling-update.yml` +sets `environment: production` unconditionally, so both dry-run and +non-dry-run executions pause for approval. A reviewer from the +`production` environment must click Approve. Check the "Required +reviewers" list in the environment settings. See §4 "GitHub +environment" for the dry-run-approval alternatives (approach 2: add a +second `production-dry-run` environment without required reviewers) +if the friction becomes intolerable. + +### `tailscale ping` fails for a node +The node may not be running `tailscaled`, not tagged `tag:elastickv-node`, or +the tailnet ACL may have drifted. `tailscale status` on the node should show +the tag; the admin console should show the IP in the `tag:elastickv-node` +group. + +### `image ... not found on ghcr.io` +The verification step hit the ghcr manifest API and got a 404. Either the +image tag was not pushed (check the `Docker Image CI` workflow for `ref`) or +the tag is a moving tag (`latest`) that the verification step can't +distinguish from stale. Specify an immutable tag. + +### SSH `Host key verification failed` +`DEPLOY_KNOWN_HOSTS` is stale. Re-run `ssh-keyscan -H` against every node and +update the secret. 
diff --git a/docs/design/2026_04_24_proposed_deploy_via_tailscale.md b/docs/design/2026_04_24_proposed_deploy_via_tailscale.md new file mode 100644 index 00000000..2eb0168f --- /dev/null +++ b/docs/design/2026_04_24_proposed_deploy_via_tailscale.md @@ -0,0 +1,208 @@ +# Deploy via Tailscale + GitHub Actions + +**Status:** Proposed +**Author:** bootjp +**Date:** 2026-04-24 + +--- + +## 1. Background + +Today the rolling-update flow is manual: an operator SSHes to their workstation, +exports the required env vars (`NODES`, `SSH_TARGETS`, image tag, etc.), +invokes `scripts/rolling-update.sh`, and watches it roll the cluster. + +Problems: + +- **No audit trail.** Who rolled what, when, and from which commit is only + visible in each operator's local shell history. +- **Manual secret handling.** SSH keys, Tailscale auth, and S3 creds live on + operator workstations. Joining and leaving the ops rotation requires key + shuffling. +- **No approval gate.** The production cluster is rolled by whoever types the + command. A typo can take out the cluster before anyone else sees it. +- **No dry-run.** The script supports neither `--dry-run` nor a preview mode; + operators who want to verify targeting have to read the script. + +The 2026-04-24 incident compounds the risk: the cluster is fragile enough that +a rolling update executed against the wrong `NODES` list could cascade into an +election storm. + +## 2. Proposal + +Move rolling deploys to a GitHub Actions workflow that joins the Tailnet via +`tailscale/github-action`, SSHes into each node over Tailscale MagicDNS, and +invokes the existing `scripts/rolling-update.sh`. All secrets live in GitHub +environments; every deploy becomes a PR-linked, reviewable event. + +**Precondition (operator responsibility):** Tailscale is already installed and +logged in on every node, with SSH access enabled over the tailnet. 
+ +### 2.1 Workflow shape + +```yaml +name: Rolling update +on: + workflow_dispatch: + inputs: + ref: # git sha/tag of the image to deploy + image_tag: # defaults to $ref; override only for rollbacks + nodes: # subset of raft IDs; empty = full roll + dry_run: # bool, default TRUE — renders plan but doesn't roll + +jobs: + deploy: + environment: production # requires approval + concurrency: + group: rolling-update + cancel-in-progress: false + runs-on: ubuntu-latest + steps: + - checkout + - join tailnet (tailscale/github-action, ephemeral) + - configure SSH (add DEPLOY_SSH_PRIVATE_KEY to agent) + - render NODES + SSH_TARGETS from repo config + - if dry_run: print the derived env and exit + - else: ./scripts/rolling-update.sh +``` + +### 2.2 Secrets and variables + +Stored in a GitHub `production` environment (not repo-wide): + +**Secrets:** +- `TS_OAUTH_CLIENT_ID`, `TS_OAUTH_SECRET` — Tailscale OAuth client scoped to + "devices:write" on a single tag (e.g., `tag:ci-deploy`). Ephemeral nodes; + cleaned up automatically after the job. +- `DEPLOY_SSH_PRIVATE_KEY` — SSH key authorized on every node. Restricted to + the `deploy` user (if we split it out) or `bootjp` (initial). +- `DEPLOY_KNOWN_HOSTS` — pre-populated `known_hosts` with the Tailnet MagicDNS + entries. Prevents the first-connect TOFU prompt. + +**Variables (non-secret):** +- `NODES_RAFT_MAP` — `n1=kv01.tailnet.ts.net,n2=kv02.tailnet.ts.net,...` + (full MagicDNS FQDNs; bare short names can resolve differently + depending on each node's search-domain configuration). The script + appends `RAFT_PORT` automatically, so do NOT include a port here. + The runbook (`docs/deploy_via_tailscale_runbook.md`) carries the + same FQDN convention; keep the two in sync if either changes. +- `SSH_TARGETS_MAP` — `n1=kv01.tailnet.ts.net,...` (MagicDNS FQDN). +- `IMAGE_BASE` — `ghcr.io/bootjp/elastickv` (tag is appended from the input). +- `SSH_USER` — e.g., `bootjp`. 
+ +### 2.3 Tailscale authentication + +Use OAuth ephemeral nodes (not a long-lived auth key): + +- Create an OAuth client in Tailscale admin console with scope + `auth_keys` (write) on tag `tag:ci-deploy`. (`tailscale/github-action` + uses the OAuth client to mint a short-lived auth key on each run; + recent action versions may also require `devices:write` so the + ephemeral node can register and be cleaned up — consult the action's + README for the current scope list. Earlier drafts of this doc named + `devices:core`, which is not a supported Tailscale OAuth scope and + would fail authentication.) +- Store client ID + secret in GitHub env secrets. +- `tailscale/github-action@v3` joins the tailnet for the duration of the job + as an ephemeral tagged node; disconnects automatically on job exit. + +ACLs on the Tailnet side should limit `tag:ci-deploy` to SSH (tcp/22) on +`tag:elastickv-node` only. No other ports, no other tags. + +### 2.4 SSH + +Two options: + +- **A. Tailscale SSH.** Lets CI SSH in without managing an SSH keypair: the + Tailnet ACL is the authorization model. Requires the nodes to have + `--ssh` flag on `tailscaled` (or `tailscale up --ssh`) and the Tailnet ACL + to grant `tag:ci-deploy` SSH access to node tag + user. No SSH keys in + GitHub at all. +- **B. Plain SSH over Tailscale.** CI brings an SSH key; nodes continue to + use `~/.ssh/authorized_keys`. Tailscale is just the network layer. + +**Recommendation for v1: B** (plain SSH). Nodes already have `authorized_keys` +for the current manual flow; nothing to change on the node side. Tailscale +SSH (A) can be a follow-up once the key-rotation story is written up. + +### 2.5 Dry-run semantics + +With `dry_run: true` (the default): + +- Everything up to script invocation runs (checkout, tailnet join, SSH agent + load, `NODES`/`SSH_TARGETS` render). +- The script is invoked with `--help` + the rendered env is printed as a + collapsed log group. 
+- `tailscale ping` is run against each SSH target to confirm reachability. +- The actual `docker stop/rm/run` loop does NOT execute. + +This catches the common failure modes (bad secret, bad env mapping, a node +unreachable over the tailnet) before touching any live container. + +### 2.6 Production environment approval + +Mark the `production` GitHub environment as requiring approval from a list +of reviewers. GitHub's native environment-protection rules do NOT support +conditioning approval on workflow inputs, so **both** dry-run and non- +dry-run runs will pause for approval when `environment: production` is +declared unconditionally on the job. That is the v1 policy — simpler, +one environment, one approver list; see runbook §4 for the dry-run- +approval alternatives (a second `production-dry-run` environment without +required reviewers, or a deployment-protection-rule GitHub App). + +Alternative: require approval unconditionally and treat the dry-run as a +"preview" that an approver must ack. This is the v1 shape by default. + +**Recommendation:** approval required for every run in v1 (one +environment). Add the second environment only when the dry-run friction +becomes annoying. + +### 2.7 Rollback + +Rolling back uses the same workflow with `image_tag: `. The +script already supports the rollout order env var (`ROLLING_ORDER`) so an +operator can force-roll only the affected nodes. + +**Gap:** there is no "stop mid-rollout" control today. If the workflow is +cancelled via GitHub UI during a roll, the in-flight node may be mid-recreate. +`rolling-update.sh` is supposed to be idempotent and crash-safe, but this +should be verified before we call the workflow production-ready. + +## 3. Open questions + +- **SSH user.** Continue using `bootjp` (personal) or provision a shared + `deploy` user on each node? v1 sticks with `bootjp` to keep scope tight; + follow-up can introduce `deploy` with a limited sudo rule for `docker`. 
+- **Secret scope.** Environment-scoped secrets (as proposed) vs. + repository-scoped. Environment-scoped wins on blast radius but requires + the GitHub environment to be pre-created. Assume pre-created. +- **Image availability check.** Should the workflow verify the image tag + exists on ghcr.io before starting the roll? Cheap to add (`docker manifest + inspect` in a pre-step) and prevents a half-rolled cluster when the tag is + typo'd. +- **Jepsen gating.** The existing `jepsen-test.yml` workflow exists. + Option: require a green Jepsen run on `ref` within the last N hours before + allowing deploy. Skipped for v1; worth revisiting before rolling this out + to high-traffic periods. + +## 4. Out of scope for v1 + +- Automatic deploys on merge to main (needs more test coverage before we'd + trust it). +- Blue-green or canary strategies (we don't have the traffic-routing layer + for it). +- Metrics-based rollback trigger (watch p99, auto-revert if it jumps). +- Tailscale SSH (option A above). +- A shared `deploy` user with restricted sudo. + +## 5. Implementation plan + +1. Write `.github/workflows/rolling-update.yml` implementing §2.1. +2. Document the secrets/variables setup in + `docs/operations/deploy_runbook.md` (new). +3. Run once with `dry_run: true` on a feature branch to validate secrets + wiring without touching prod. +4. Run once with `dry_run: false` targeting a single node (via the `nodes` + input) to prove the happy path. +5. Cut over: archive the operator-local rolling flow, document the new one + as the canonical path.