diff --git a/.github/workflows/rolling-update.yml b/.github/workflows/rolling-update.yml new file mode 100644 index 00000000..626044a8 --- /dev/null +++ b/.github/workflows/rolling-update.yml @@ -0,0 +1,256 @@ +name: Rolling update + +# Manually-triggered production rollout. Joins the Tailnet, SSHes over +# MagicDNS into each node, and invokes scripts/rolling-update.sh. +# See docs/design/2026_04_24_proposed_deploy_via_tailscale.md. + +on: + workflow_dispatch: + inputs: + ref: + description: Git ref (tag or sha) to deploy. Also used as the image tag unless image_tag is set. + required: true + type: string + image_tag: + description: Override the image tag (default = ref). Used for rollbacks. + required: false + type: string + default: "" + nodes: + description: Comma-separated raft IDs to roll (e.g. "n1,n2"). Empty = all nodes in NODES_RAFT_MAP. + required: false + type: string + default: "" + dry_run: + description: Render the plan and run a reachability check only; do NOT touch containers. + required: true + type: boolean + default: true + +permissions: + contents: read + id-token: write # required by tailscale/github-action OIDC flow + packages: read # required by `docker manifest inspect` on ghcr.io private images + +concurrency: + group: rolling-update + cancel-in-progress: false + +jobs: + deploy: + runs-on: ubuntu-latest + # Approval gate — see GitHub environment settings for required reviewers. + # Dry-runs also use this environment so the secret wiring is identical; + # the environment's approval rule should be configured to auto-approve + # dry-runs if that distinction is desired (GitHub UI: "Deployment + # protection rules"). + environment: production + timeout-minutes: 60 + + steps: + # The deploy script (scripts/rolling-update.sh) is executed from the + # checkout below, after the tailnet join and SSH key load. 
If `ref` + # were unvalidated, anyone with workflow_dispatch permission could + # point it at a fork commit containing a modified script that + # harvests the SSH key / Tailscale OAuth secret. Validate that + # `ref` resolves to (a) the repository's default branch, or (b) a + # tag on the repo, before we hand it any secret. Branches other + # than the default are rejected so review-gated default is the only + # entry point besides immutable tags. + - name: Validate ref is default branch or a tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REF: ${{ inputs.ref }} + run: | + set -euo pipefail + default_branch=$(gh api "repos/${{ github.repository }}" --jq '.default_branch') + default_sha=$(gh api "repos/${{ github.repository }}/commits/$default_branch" --jq '.sha') + if [[ "$REF" == "$default_branch" || "$REF" == "$default_sha" ]]; then + echo "ref is the default branch ($default_branch / $default_sha)" + exit 0 + fi + if gh api "repos/${{ github.repository }}/git/refs/tags/$REF" >/dev/null 2>&1; then + echo "ref is a tag" + exit 0 + fi + # Reject a sha that is currently the tip of any ref: the default + # branch's tip and tags were already accepted above, so a match here + # means a non-default branch head. Any other sha is provisionally + # accepted; the checkout step fails if it is unreachable. + if git -c "http.https://github.com/.extraheader=" ls-remote "https://github.com/${{ github.repository }}.git" | grep -q "^$REF"; then + echo "::error::ref '$REF' is not the default branch or a tag. Branches other than '$default_branch' are disallowed to prevent arbitrary-code execution with production secrets." + exit 1 + fi + echo "ref '$REF' treated as a sha; checkout will fail if it is not reachable." 
+ + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ inputs.ref }} + persist-credentials: false + + - name: Install jq + run: sudo apt-get install -y --no-install-recommends jq + + - name: Verify image exists on ghcr.io + env: + IMAGE_BASE: ${{ vars.IMAGE_BASE }} + IMAGE_TAG: ${{ inputs.image_tag || inputs.ref }} + GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + if [[ -z "$IMAGE_BASE" ]]; then + echo "::error::IMAGE_BASE repository variable is not set" + exit 1 + fi + echo "Checking $IMAGE_BASE:$IMAGE_TAG" + echo "$GHCR_TOKEN" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin >/dev/null + if ! docker manifest inspect "$IMAGE_BASE:$IMAGE_TAG" >/dev/null; then + echo "::error::image $IMAGE_BASE:$IMAGE_TAG not found on ghcr.io" + exit 1 + fi + + - name: Join Tailnet (ephemeral) + uses: tailscale/github-action@v3 + with: + oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }} + oauth-secret: ${{ secrets.TS_OAUTH_SECRET }} + tags: tag:ci-deploy + + - name: Configure SSH + env: + SSH_KEY: ${{ secrets.DEPLOY_SSH_PRIVATE_KEY }} + KNOWN_HOSTS: ${{ secrets.DEPLOY_KNOWN_HOSTS }} + run: | + set -euo pipefail + mkdir -p ~/.ssh + chmod 700 ~/.ssh + printf '%s\n' "$SSH_KEY" > ~/.ssh/id_ed25519 + chmod 600 ~/.ssh/id_ed25519 + printf '%s\n' "$KNOWN_HOSTS" > ~/.ssh/known_hosts + chmod 644 ~/.ssh/known_hosts + # Sanity: no stray CRLF in the key, no empty file. 
+ test -s ~/.ssh/id_ed25519 || { echo "::error::DEPLOY_SSH_PRIVATE_KEY is empty"; exit 1; } + ssh-keygen -lf ~/.ssh/id_ed25519 >/dev/null + + - name: Render NODES and SSH_TARGETS + id: render + env: + NODES_RAFT_MAP: ${{ vars.NODES_RAFT_MAP }} + SSH_TARGETS_MAP: ${{ vars.SSH_TARGETS_MAP }} + NODES_FILTER: ${{ inputs.nodes }} + run: | + set -euo pipefail + if [[ -z "$NODES_RAFT_MAP" || -z "$SSH_TARGETS_MAP" ]]; then + echo "::error::NODES_RAFT_MAP or SSH_TARGETS_MAP is not set in the production environment variables" + exit 1 + fi + if [[ -n "$NODES_FILTER" ]]; then + # Filter NODES_RAFT_MAP and SSH_TARGETS_MAP to the requested subset. + # Reject any filter ID that does not appear in the map: silently + # dropping unknown IDs would let a typo like "n1,n9" proceed as + # a one-node rollout of n1 alone, which is a staged-deploy + # footgun. + IFS=',' read -r -a wanted <<< "$NODES_FILTER" + IFS=',' read -r -a entries <<< "$NODES_RAFT_MAP" + declare -a known_ids=() + for e in "${entries[@]}"; do + known_ids+=("${e%%=*}") + done + unknown="" + for w in "${wanted[@]}"; do + found=0 + for k in "${known_ids[@]}"; do + if [[ "$k" == "$w" ]]; then found=1; break; fi + done + if [[ $found -eq 0 ]]; then unknown+="${unknown:+, }$w"; fi + done + if [[ -n "$unknown" ]]; then + echo "::error::nodes filter '$NODES_FILTER' references unknown raft IDs: $unknown. 
Known IDs: ${known_ids[*]}" + exit 1 + fi + filter_csv() { + local all="$1" + local filter="$2" + local out="" + IFS=',' read -r -a list_entries <<< "$all" + IFS=',' read -r -a list_wanted <<< "$filter" + for e in "${list_entries[@]}"; do + key="${e%%=*}" + for w in "${list_wanted[@]}"; do + if [[ "$key" == "$w" ]]; then + out+="${e}," + break + fi + done + done + echo "${out%,}" + } + NODES_RAFT_MAP="$(filter_csv "$NODES_RAFT_MAP" "$NODES_FILTER")" + SSH_TARGETS_MAP="$(filter_csv "$SSH_TARGETS_MAP" "$NODES_FILTER")" + if [[ -z "$NODES_RAFT_MAP" ]]; then + echo "::error::nodes filter '$NODES_FILTER' matches nothing in NODES_RAFT_MAP" + exit 1 + fi + fi + { + echo "NODES=$NODES_RAFT_MAP" + echo "SSH_TARGETS=$SSH_TARGETS_MAP" + } >> "$GITHUB_OUTPUT" + echo "::group::Deploy plan" + echo "NODES=$NODES_RAFT_MAP" + echo "SSH_TARGETS=$SSH_TARGETS_MAP" + echo "::endgroup::" + + - name: Tailscale reachability check + env: + SSH_TARGETS: ${{ steps.render.outputs.SSH_TARGETS }} + run: | + set -euo pipefail + IFS=',' read -r -a entries <<< "$SSH_TARGETS" + failed=0 + for e in "${entries[@]}"; do + host="${e##*=}" + host="${host%%:*}" + # strip user@ if present + host="${host##*@}" + if tailscale ping --c 2 --timeout 3s "$host" >/dev/null 2>&1; then + echo " ok $host" + else + echo "::error::$host not reachable over tailnet" + failed=1 + fi + done + if [[ "$failed" -ne 0 ]]; then + exit 1 + fi + + - name: Dry-run summary + if: ${{ inputs.dry_run }} + env: + NODES: ${{ steps.render.outputs.NODES }} + SSH_TARGETS: ${{ steps.render.outputs.SSH_TARGETS }} + IMAGE_BASE: ${{ vars.IMAGE_BASE }} + IMAGE_TAG: ${{ inputs.image_tag || inputs.ref }} + SSH_USER: ${{ vars.SSH_USER }} + run: | + set -euo pipefail + cat <.ts.net +``` + +## 2. 
Tailscale ACL + +In the Tailscale admin console, add the deploy rule to the tailnet ACL: + +```jsonc +"tagOwners": { + "tag:ci-deploy": ["autogroup:admin"], + "tag:elastickv-node": ["autogroup:admin"], +}, +"acls": [ + { + "action": "accept", + "src": ["tag:ci-deploy"], + "dst": ["tag:elastickv-node:22"], + }, +], +``` + +`tag:ci-deploy` must NOT have access to any other port on the tailnet. The +deploy workflow only needs SSH. + +## 3. Tailscale OAuth client + +Admin console → Settings → OAuth clients → New client: + +- Description: `elastickv GitHub Actions deploy` +- Scopes: `auth_keys` (write). Recent `tailscale/github-action` versions + may additionally require `devices:write` (to register and clean up + the ephemeral node); enable that if the join step fails with an + authorization error. The action's README is the definitive source + for current scope requirements. `devices:core` is NOT a valid + Tailscale OAuth scope — earlier drafts of this runbook named it and + would have produced an auth failure. +- Tags: `tag:ci-deploy` + +Copy the client ID and secret; they go into GitHub in the next step. + +## 4. GitHub environment: `production` + +Repo → Settings → Environments → New environment: `production`. + +### Required reviewers +Configure "Required reviewers" on the environment. **Every run that targets +this environment pauses for approval** — including dry-runs, because +GitHub's native environment-protection rules cannot be made conditional on +workflow inputs. Three ways to handle the dry-run-approval friction: + +1. **Accept the prompt for dry-runs too.** A dry-run requires one approver + click before it proceeds; still cheap and keeps the policy simple. +2. **Add a second environment `production-dry-run` without required + reviewers** and change the workflow to pick the environment via + `environment: ${{ inputs.dry_run && 'production-dry-run' || 'production' }}`. + Cleanest but doubles the secrets/vars you must keep in sync. +3. 
**Install a deployment-protection-rule GitHub App** (custom or + marketplace) that approves runs whose inputs show `dry_run == true`. + Most flexible; most setup. + +v1 ships with approach 1 (single environment, prompt on every run). +Approach 2 is the recommended upgrade once the friction becomes annoying. + +### Environment secrets + +| Name | Value | +|------|-------| +| `TS_OAUTH_CLIENT_ID` | Tailscale OAuth client ID from step 3 | +| `TS_OAUTH_SECRET` | Tailscale OAuth secret from step 3 | +| `DEPLOY_SSH_PRIVATE_KEY` | OpenSSH private key, authorized on every node under the deploy user | +| `DEPLOY_KNOWN_HOSTS` | `ssh-keyscan -H kv01.<tailnet>.ts.net kv02.<tailnet>.ts.net …` output. Use `-H` to hash hostnames so the secret's contents don't leak the tailnet topology if the runner environment is compromised. | + +The SSH key should be ed25519, dedicated to CI (not a reused developer key). +Regenerate on operator rotation. + +### Environment variables + +| Name | Value | Example | +|------|-------|---------| +| `IMAGE_BASE` | Container image path (no tag) | `ghcr.io/bootjp/elastickv` | +| `SSH_USER` | SSH login on every node | `bootjp` | +| `NODES_RAFT_MAP` | Comma-separated `raftId=host` (no port — the script appends `RAFT_PORT`). Use full MagicDNS FQDNs so every node can resolve the advertised address regardless of local DNS search domains. The workflow renders this into the script's `NODES` env var. | `n1=kv01.<tailnet>.ts.net,n2=kv02.<tailnet>.ts.net,n3=kv03.<tailnet>.ts.net,n4=kv04.<tailnet>.ts.net,n5=kv05.<tailnet>.ts.net` | +| `SSH_TARGETS_MAP` | Comma-separated `raftId=ssh-host`. The workflow renders this into the script's `SSH_TARGETS` env var. Usually identical to `NODES_RAFT_MAP` unless SSH access uses a different hostname. | `n1=kv01.<tailnet>.ts.net,n2=kv02.<tailnet>.ts.net,...` | + +**Why two names?** The workflow uses `NODES_RAFT_MAP` / `SSH_TARGETS_MAP` +in the `production` environment to keep the GitHub-side names +distinct from the script-side env var names it hands to +`rolling-update.sh`. 
If you run the script by hand from a workstation +you must export `NODES` and `SSH_TARGETS` directly — the workflow-side +names are only understood by the workflow's render step. + +## 5. Running a deploy + +Actions tab → "Rolling update" → Run workflow. + +Inputs: + +- `ref` — the git tag or sha to deploy (also used as the container image tag) +- `image_tag` — override only for rollbacks (e.g., deploy tag `v1.2.3` of a + commit that was also `v1.2.3`) +- `nodes` — subset of raft IDs, e.g., `n1,n2`. Empty rolls all nodes. +- `dry_run` — default `true`. Renders the plan and checks reachability without + touching containers. + +Recommended first-run sequence: + +1. `dry_run: true`, `nodes: n1`, `ref: <sha>` — confirms tailnet join, + SSH config, image availability, target mapping. No production impact. +2. `dry_run: false`, `nodes: n1` — roll a single node, verify the cluster + stays healthy and the image is correct. +3. `dry_run: false`, `nodes:` (empty) — full roll. + +## 6. Rollback + +Re-run the workflow with `image_tag` set to the previous-known-good sha. The +`nodes` input can target specific nodes if only some carry the bad image. + +### If a running workflow is cancelled mid-rollout + +GitHub cancelling the job between node steps is the one operational +hazard that needs manual cleanup. + +1. **Look at the last log line from the `Roll cluster` step.** The + script emits `==> [<node>@<host>] start` at the beginning of + each per-node recreate (see `scripts/rolling-update.sh:398`). + Whichever `<node>` appears in the last such line is the one in + flight when the cancel signal landed. +2. **SSH into that node** over Tailscale and run `docker ps`. If the + container is absent or `Exited`, finish the recreate by hand. The + `docker run` invocation itself is redirected to `/dev/null` by the + script, so the workflow log does NOT contain the full argv. 
To + reconstruct it, read the `Roll cluster` step's rendered + environment — the workflow exports `IMAGE`, `DATA_DIR`, + `RAFT_PORT`, `REDIS_PORT`, `S3_PORT`, `ENABLE_S3`, `NODES`, + `SSH_TARGETS`, and the merged `EXTRA_ENV` before invoking the + script. Anything not explicitly set (e.g., `RAFT_PORT` in a + minimally-overridden deploy) falls back to the script's default + (`RAFT_PORT=50051`, `REDIS_PORT=6379`, `S3_PORT=9000`, + `ENABLE_S3=true`). GOMEMLIMIT / CONTAINER_MEMORY_LIMIT (PR #617) + are propagated via `EXTRA_ENV` once that PR lands. Together the + rendered env + the node's `deploy.env` is enough to reconstruct + the same `docker run` you would see if you re-ran with the same + inputs. +3. **Confirm the new leader via `raftadmin` or metrics** before re-running + the workflow with `nodes:` scoped to the remaining untouched IDs. Do + NOT re-run the full rollout if the partial one is still in flight — + it will stop the same node you are trying to recover. +4. **File a ticket** with the log excerpt so we can eventually teach the + workflow to set a start-marker on each node and fast-skip completed + nodes on re-run. + +The script is idempotent. `scripts/rolling-update.sh:794-798` skips a +node when its running image id equals the target image and its gRPC +endpoint is healthy — an already-rolled node is a no-op, not a +redundant stop/recreate. Re-running the workflow with the same +`ref` after confirming the interrupted node is healthy is therefore +safe: nodes that already match the target image are passed over, +and only the still-stale one gets recreated. + +## 7. What the workflow does NOT do (yet) + +- **No post-deploy health verification beyond tailnet reachability.** The + script itself blocks on `raftadmin` leadership transfer and health-gate + timeouts, but the workflow does not independently probe Prometheus or + Redis after the roll. Add this when we have a canonical post-deploy + assertion suite. 
+- **No auto-rollback on failure.** If the script exits non-zero mid-roll, + the cluster is left in whatever state the script reached. The operator + must inspect and either re-roll or roll back manually. +- **No Jepsen gate.** The deploy does not require a green Jepsen run on + `ref` before proceeding. +- **No image-signature check.** `cosign verify` on the image is a follow-up. + +## 8. Troubleshooting + +### Job pauses indefinitely at "Waiting for approval" +Expected for **every** run in v1 — `.github/workflows/rolling-update.yml` +sets `environment: production` unconditionally, so both dry-run and +non-dry-run executions pause for approval. A reviewer from the +`production` environment must click Approve. Check the "Required +reviewers" list in the environment settings. See §4 "GitHub +environment" for the dry-run-approval alternatives (approach 2: add a +second `production-dry-run` environment without required reviewers) +if the friction becomes intolerable. + +### `tailscale ping` fails for a node +The node may not be running `tailscaled`, not tagged `tag:elastickv-node`, or +the tailnet ACL may have drifted. `tailscale status` on the node should show +the tag; the admin console should show the IP in the `tag:elastickv-node` +group. + +### `image ... not found on ghcr.io` +The verification step hit the ghcr manifest API and got a 404. Either the +image tag was not pushed (check the `Docker Image CI` workflow for `ref`) or +the tag is a moving tag (`latest`) that the verification step can't +distinguish from stale. Specify an immutable tag. + +### SSH `Host key verification failed` +`DEPLOY_KNOWN_HOSTS` is stale. Re-run `ssh-keyscan -H` against every node and +update the secret. 
diff --git a/docs/design/2026_04_24_proposed_deploy_via_tailscale.md b/docs/design/2026_04_24_proposed_deploy_via_tailscale.md new file mode 100644 index 00000000..2eb0168f --- /dev/null +++ b/docs/design/2026_04_24_proposed_deploy_via_tailscale.md @@ -0,0 +1,208 @@ +# Deploy via Tailscale + GitHub Actions + +**Status:** Proposed +**Author:** bootjp +**Date:** 2026-04-24 + +--- + +## 1. Background + +Today the rolling-update flow is manual: an operator SSHes to their workstation, +exports the required env vars (`NODES`, `SSH_TARGETS`, image tag, etc.), +invokes `scripts/rolling-update.sh`, and watches it roll the cluster. + +Problems: + +- **No audit trail.** Who rolled what, when, and from which commit is only + visible in each operator's local shell history. +- **Manual secret handling.** SSH keys, Tailscale auth, and S3 creds live on + operator workstations. Joining and leaving the ops rotation requires key + shuffling. +- **No approval gate.** The production cluster is rolled by whoever types the + command. A typo can take out the cluster before anyone else sees it. +- **No dry-run.** The script supports neither `--dry-run` nor a preview mode; + operators who want to verify targeting have to read the script. + +The 2026-04-24 incident compounds the risk: the cluster is fragile enough that +a rolling update executed against the wrong `NODES` list could cascade into an +election storm. + +## 2. Proposal + +Move rolling deploys to a GitHub Actions workflow that joins the Tailnet via +`tailscale/github-action`, SSHes into each node over Tailscale MagicDNS, and +invokes the existing `scripts/rolling-update.sh`. All secrets live in GitHub +environments; every deploy becomes a PR-linked, reviewable event. + +**Precondition (operator responsibility):** Tailscale is already installed and +logged in on every node, with SSH access enabled over the tailnet. 
+ +### 2.1 Workflow shape + +```yaml +name: Rolling update +on: + workflow_dispatch: + inputs: + ref: # git sha/tag of the image to deploy + image_tag: # defaults to $ref; override only for rollbacks + nodes: # subset of raft IDs; empty = full roll + dry_run: # bool, default TRUE — renders plan but doesn't roll + +jobs: + deploy: + environment: production # requires approval + concurrency: + group: rolling-update + cancel-in-progress: false + runs-on: ubuntu-latest + steps: + - checkout + - join tailnet (tailscale/github-action, ephemeral) + - configure SSH (add DEPLOY_SSH_PRIVATE_KEY to agent) + - render NODES + SSH_TARGETS from repo config + - if dry_run: print the derived env and exit + - else: ./scripts/rolling-update.sh +``` + +### 2.2 Secrets and variables + +Stored in a GitHub `production` environment (not repo-wide): + +**Secrets:** +- `TS_OAUTH_CLIENT_ID`, `TS_OAUTH_SECRET` — Tailscale OAuth client scoped to + "devices:write" on a single tag (e.g., `tag:ci-deploy`). Ephemeral nodes; + cleaned up automatically after the job. +- `DEPLOY_SSH_PRIVATE_KEY` — SSH key authorized on every node. Restricted to + the `deploy` user (if we split it out) or `bootjp` (initial). +- `DEPLOY_KNOWN_HOSTS` — pre-populated `known_hosts` with the Tailnet MagicDNS + entries. Prevents the first-connect TOFU prompt. + +**Variables (non-secret):** +- `NODES_RAFT_MAP` — `n1=kv01.tailnet.ts.net,n2=kv02.tailnet.ts.net,...` + (full MagicDNS FQDNs; bare short names can resolve differently + depending on each node's search-domain configuration). The script + appends `RAFT_PORT` automatically, so do NOT include a port here. + The runbook (`docs/deploy_via_tailscale_runbook.md`) carries the + same FQDN convention; keep the two in sync if either changes. +- `SSH_TARGETS_MAP` — `n1=kv01.tailnet.ts.net,...` (MagicDNS FQDN). +- `IMAGE_BASE` — `ghcr.io/bootjp/elastickv` (tag is appended from the input). +- `SSH_USER` — e.g., `bootjp`. 
+ +### 2.3 Tailscale authentication + +Use OAuth ephemeral nodes (not a long-lived auth key): + +- Create an OAuth client in Tailscale admin console with scope + `auth_keys` (write) on tag `tag:ci-deploy`. (`tailscale/github-action` + uses the OAuth client to mint a short-lived auth key on each run; + recent action versions may also require `devices:write` so the + ephemeral node can register and be cleaned up — consult the action's + README for the current scope list. Earlier drafts of this doc named + `devices:core`, which is not a supported Tailscale OAuth scope and + would fail authentication.) +- Store client ID + secret in GitHub env secrets. +- `tailscale/github-action@v3` joins the tailnet for the duration of the job + as an ephemeral tagged node; disconnects automatically on job exit. + +ACLs on the Tailnet side should limit `tag:ci-deploy` to SSH (tcp/22) on +`tag:elastickv-node` only. No other ports, no other tags. + +### 2.4 SSH + +Two options: + +- **A. Tailscale SSH.** Lets CI SSH in without managing an SSH keypair: the + Tailnet ACL is the authorization model. Requires the nodes to have + `--ssh` flag on `tailscaled` (or `tailscale up --ssh`) and the Tailnet ACL + to grant `tag:ci-deploy` SSH access to node tag + user. No SSH keys in + GitHub at all. +- **B. Plain SSH over Tailscale.** CI brings an SSH key; nodes continue to + use `~/.ssh/authorized_keys`. Tailscale is just the network layer. + +**Recommendation for v1: B** (plain SSH). Nodes already have `authorized_keys` +for the current manual flow; nothing to change on the node side. Tailscale +SSH (A) can be a follow-up once the key-rotation story is written up. + +### 2.5 Dry-run semantics + +With `dry_run: true` (the default): + +- Everything up to script invocation runs (checkout, tailnet join, SSH agent + load, `NODES`/`SSH_TARGETS` render). +- The script is invoked with `--help` + the rendered env is printed as a + collapsed log group. 
+- `tailscale ping` is run against each SSH target to confirm reachability. +- The actual `docker stop/rm/run` loop does NOT execute. + +This catches the common failure modes (bad secret, bad env mapping, a node +unreachable over the tailnet) before touching any live container. + +### 2.6 Production environment approval + +Mark the `production` GitHub environment as requiring approval from a list +of reviewers. GitHub's native environment-protection rules do NOT support +conditioning approval on workflow inputs, so **both** dry-run and non- +dry-run runs will pause for approval when `environment: production` is +declared unconditionally on the job. That is the v1 policy — simpler, +one environment, one approver list; see runbook §4 for the dry-run- +approval alternatives (a second `production-dry-run` environment without +required reviewers, or a deployment-protection-rule GitHub App). + +Alternative: require approval unconditionally and treat the dry-run as a +"preview" that an approver must ack. This is the v1 shape by default. + +**Recommendation:** approval required for every run in v1 (one +environment). Add the second environment only when the dry-run friction +becomes annoying. + +### 2.7 Rollback + +Rolling back uses the same workflow with `image_tag: `. The +script already supports the rollout order env var (`ROLLING_ORDER`) so an +operator can force-roll only the affected nodes. + +**Gap:** there is no "stop mid-rollout" control today. If the workflow is +cancelled via GitHub UI during a roll, the in-flight node may be mid-recreate. +`rolling-update.sh` is supposed to be idempotent and crash-safe, but this +should be verified before we call the workflow production-ready. + +## 3. Open questions + +- **SSH user.** Continue using `bootjp` (personal) or provision a shared + `deploy` user on each node? v1 sticks with `bootjp` to keep scope tight; + follow-up can introduce `deploy` with a limited sudo rule for `docker`. 
+- **Secret scope.** Environment-scoped secrets (as proposed) vs. + repository-scoped. Environment-scoped wins on blast radius but requires + the GitHub environment to be pre-created. Assume pre-created. +- **Image availability check.** Should the workflow verify the image tag + exists on ghcr.io before starting the roll? Cheap to add (`docker manifest + inspect` in a pre-step) and prevents a half-rolled cluster when the tag is + typo'd. +- **Jepsen gating.** The existing `jepsen-test.yml` workflow exists. + Option: require a green Jepsen run on `ref` within the last N hours before + allowing deploy. Skipped for v1; worth revisiting before rolling this out + to high-traffic periods. + +## 4. Out of scope for v1 + +- Automatic deploys on merge to main (needs more test coverage before we'd + trust it). +- Blue-green or canary strategies (we don't have the traffic-routing layer + for it). +- Metrics-based rollback trigger (watch p99, auto-revert if it jumps). +- Tailscale SSH (option A above). +- A shared `deploy` user with restricted sudo. + +## 5. Implementation plan + +1. Write `.github/workflows/rolling-update.yml` implementing §2.1. +2. Document the secrets/variables setup in + `docs/operations/deploy_runbook.md` (new). +3. Run once with `dry_run: true` on a feature branch to validate secrets + wiring without touching prod. +4. Run once with `dry_run: false` targeting a single node (via the `nodes` + input) to prove the happy path. +5. Cut over: archive the operator-local rolling flow, document the new one + as the canonical path.