From e7d1185836b79a8ccfdcb0afa844ba056e7002a6 Mon Sep 17 00:00:00 2001 From: "John R. D'Orazio" Date: Wed, 29 Apr 2026 19:48:27 +0200 Subject: [PATCH] fix(deploy): make verify step tolerate transient curl errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verify step exited with code 56 (CURLE_RECV_ERROR) during the first big-batch poll, killing the entire workflow run on a single transient blip. Three robustness fixes: - Append `|| echo '{}'` to each curl call so `set -e` doesn't kill the script when curl fails for any reason — the empty JSON makes jq return empty strings, which keeps the page in PENDING for the next attempt (correct semantics: "not yet translated"). - Add `--max-time 15 --retry 3 --retry-delay 5 --retry-all-errors` so curl handles short network blips at the transport layer. - Append `2>/dev/null || echo ""` to jq calls for the same reason. Also bump MAX_ATTEMPTS from 20 to 60 (10 min → 30 min). The current deploy enqueues up to 50 translations at a time (10 docs × 5 langs on workflow_dispatch / first deploy), and observed throughput is ~1 translation every 5–8 seconds — 50 jobs need ~5 minutes minimum, plus retry budget for OpenAI capacity issues. Progress messages now show "X/N done" so the run status is readable at a glance. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/deploy-docs.yml | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 3650e89..89362e5 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -308,20 +308,26 @@ jobs: # without this step a silent worker failure (OpenAI timeout, missing # API key, dead Redis worker) lets the deploy report success while # leaving translations stale. 
- MAX_ATTEMPTS=20 # 20 x 30s = 10 minutes + MAX_ATTEMPTS=60 # 60 x 30s = 30 minutes SLEEP_SECONDS=30 declare -A PENDING for ID in $TRANSLATED_IDS; do PENDING[$ID]=1; done + INITIAL_COUNT=${#PENDING[@]} for attempt in $(seq 1 "$MAX_ATTEMPTS"); do REMAINING=() for ID in "${!PENDING[@]}"; do - RESPONSE=$(curl -s \ + # `|| echo '{}'` keeps `set -e` from killing the whole step on + # transient curl failures (we observed exit 56 / CURLE_RECV_ERROR + # mid-poll). `--retry` handles short network blips at the curl + # layer; the `|| echo` is the safety net for everything else. + RESPONSE=$(curl -s --max-time 15 --retry 3 --retry-delay 5 --retry-all-errors \ -u "$WP_APP_USERNAME:$WP_APP_PASSWORD" \ - "$WP_REST_URL/wp/v2/pages/${ID}?_fields=id,modified_gmt,status&context=edit") - MODIFIED=$(echo "$RESPONSE" | jq -r '.modified_gmt // empty') - STATUS=$(echo "$RESPONSE" | jq -r '.status // empty') + "$WP_REST_URL/wp/v2/pages/${ID}?_fields=id,modified_gmt,status&context=edit" \ + || echo '{}') + MODIFIED=$(echo "$RESPONSE" | jq -r '.modified_gmt // empty' 2>/dev/null || echo "") + STATUS=$(echo "$RESPONSE" | jq -r '.status // empty' 2>/dev/null || echo "") # modified_gmt is ISO 8601 with no offset; lexicographic compare # against DEPLOY_START_GMT (also UTC, no offset) is correct. @@ -343,7 +349,8 @@ jobs: exit 0 fi - echo "Attempt ${attempt}/${MAX_ATTEMPTS}: ${#PENDING[@]} pending - sleeping ${SLEEP_SECONDS}s..." + COMPLETED=$((INITIAL_COUNT - ${#PENDING[@]})) + echo "Attempt ${attempt}/${MAX_ATTEMPTS}: ${COMPLETED}/${INITIAL_COUNT} done, ${#PENDING[@]} pending - sleeping ${SLEEP_SECONDS}s..." sleep "$SLEEP_SECONDS" done