From 7066cbdda95aca2cbce6b91ca6a7d7d50ebd2f68 Mon Sep 17 00:00:00 2001 From: Stefan de Vogelaere Date: Sun, 3 May 2026 12:30:07 +0200 Subject: [PATCH] ci(update_versions): skip agents with broken upstream URLs instead of failing When an upstream release stops publishing one of an agent's declared binary archives (e.g. vtcode 0.105.5 dropped its Windows zip), the hourly auto-update workflow currently dies in the registry-build URL-validation step. That blocks updates for *every* other agent until a human edits the offending agent.json. Make the apply step resilient instead: - Hoist url_exists() and validate_distribution_urls() into registry_utils.py so update_versions and build_registry share the same check. - After update_versions writes a new agent.json, re-validate its distribution URLs; if any of them return 404, restore the original file content byte-for-byte and report the agent as "skipped" rather than failed. - Track skipped updates separately from real I/O failures so the workflow exits 0 when the only problem is a broken upstream binary. - Surface skipped agents in both the JSON output and the human summary so it's obvious which agents are stuck and why. - Add unit tests covering the revert path and the moved helpers. 
Refs: failing run https://github.com/agentclientprotocol/registry/actions/runs/25275167557 --- .github/workflows/build_registry.py | 77 +-------- .github/workflows/registry_utils.py | 78 +++++++++ .../workflows/tests/test_registry_utils.py | 68 ++++++++ .../workflows/tests/test_update_versions.py | 102 +++++++++++- .github/workflows/update_versions.py | 152 ++++++++++++++---- 5 files changed, 368 insertions(+), 109 deletions(-) diff --git a/.github/workflows/build_registry.py b/.github/workflows/build_registry.py index 8469b1df..1b596f43 100644 --- a/.github/workflows/build_registry.py +++ b/.github/workflows/build_registry.py @@ -7,17 +7,14 @@ import os import re import sys -import urllib.error -import urllib.request import xml.etree.ElementTree as ET from pathlib import Path from registry_utils import ( - extract_npm_package_name, extract_npm_package_version, - extract_pypi_package_name, normalize_version, should_skip_dir, + validate_distribution_urls, ) try: @@ -48,40 +45,6 @@ PREFERRED_ICON_SIZE = 16 ALLOWED_FILL_STROKE_VALUES = {"currentcolor", "none", "inherit"} -# URL validation -SKIP_URL_VALIDATION = os.environ.get("SKIP_URL_VALIDATION", "").lower() in ( - "1", - "true", - "yes", -) - - -def url_exists(url: str, method: str = "HEAD", retries: int = 3) -> bool: - """Check if a URL exists using HEAD or GET request with retries.""" - import time - - for attempt in range(retries): - try: - req = urllib.request.Request(url, method=method) - req.add_header("User-Agent", "ACP-Registry-Validator/1.0") - with urllib.request.urlopen(req, timeout=15) as response: - return response.status in (200, 301, 302) - except urllib.error.HTTPError as e: - # Some servers don't support HEAD, try GET - if method == "HEAD" and e.code in (403, 405): - return url_exists(url, method="GET", retries=retries - attempt) - if attempt < retries - 1 and e.code in (429, 500, 502, 503, 504): - time.sleep(2**attempt) - continue - return False - except (urllib.error.URLError, TimeoutError, 
OSError): - if attempt < retries - 1: - time.sleep(2**attempt) - continue - return False - return False - - def extract_version_from_url(url: str) -> str | None: """Extract version from binary archive URL.""" # GitHub releases: /download/v1.0.0/ or /releases/v1.0.0/ @@ -147,44 +110,6 @@ def validate_distribution_versions(agent_version: str, distribution: dict) -> li return errors -def validate_distribution_urls(distribution: dict) -> list[str]: - """Validate that distribution URLs exist.""" - if SKIP_URL_VALIDATION: - return [] - - errors = [] - - # Check binary archive URLs - if "binary" in distribution: - for platform, target in distribution["binary"].items(): - if "archive" in target: - url = target["archive"] - if not url_exists(url): - errors.append(f"Binary archive URL not accessible for {platform}: {url}") - - # Check npm package URLs (registry.npmjs.org) - seen_npm = set() - for dist_type in ("npx",): - if dist_type in distribution: - package = distribution[dist_type].get("package", "") - pkg_name = extract_npm_package_name(package) - if pkg_name and pkg_name not in seen_npm: - seen_npm.add(pkg_name) - npm_url = f"https://registry.npmjs.org/{pkg_name}" - if not url_exists(npm_url): - errors.append(f"npm package not found: {pkg_name}") - - # Check PyPI package URLs - if "uvx" in distribution: - package = distribution["uvx"].get("package", "") - pkg_name = extract_pypi_package_name(package) - pypi_url = f"https://pypi.org/pypi/{pkg_name}/json" - if not url_exists(pypi_url): - errors.append(f"PyPI package not found: {pkg_name}") - - return errors - - def validate_icon_monochrome(root: ET.Element) -> list[str]: """Validate that icon uses currentColor and no hardcoded colors. 
diff --git a/.github/workflows/registry_utils.py b/.github/workflows/registry_utils.py index 5b26904e..7ac7a67f 100644 --- a/.github/workflows/registry_utils.py +++ b/.github/workflows/registry_utils.py @@ -1,8 +1,12 @@ """Shared utilities for ACP registry scripts.""" import json +import os import re import sys +import time +import urllib.error +import urllib.request from pathlib import Path SKIP_DIRS = { @@ -57,6 +61,80 @@ def normalize_version(version: str) -> str: return ".".join(parts[:3]) +# URL validation: env var lets CI / local runs disable network calls. +SKIP_URL_VALIDATION = os.environ.get("SKIP_URL_VALIDATION", "").lower() in ( + "1", + "true", + "yes", +) + + +def url_exists(url: str, method: str = "HEAD", retries: int = 3) -> bool: + """Check if a URL exists using HEAD or GET request with retries.""" + for attempt in range(retries): + try: + req = urllib.request.Request(url, method=method) + req.add_header("User-Agent", "ACP-Registry-Validator/1.0") + with urllib.request.urlopen(req, timeout=15) as response: + return response.status in (200, 301, 302) + except urllib.error.HTTPError as e: + # Some servers don't support HEAD, try GET + if method == "HEAD" and e.code in (403, 405): + return url_exists(url, method="GET", retries=retries - attempt) + if attempt < retries - 1 and e.code in (429, 500, 502, 503, 504): + time.sleep(2**attempt) + continue + return False + except (urllib.error.URLError, TimeoutError, OSError): + if attempt < retries - 1: + time.sleep(2**attempt) + continue + return False + return False + + +def validate_distribution_urls(distribution: dict) -> list[str]: + """Validate that distribution URLs exist (binary archives, npm, PyPI). + + Returns a list of human-readable error strings (empty on success). + Honors SKIP_URL_VALIDATION env var. 
+ """ + if SKIP_URL_VALIDATION: + return [] + + errors = [] + + # Check binary archive URLs + if "binary" in distribution: + for platform, target in distribution["binary"].items(): + if "archive" in target: + url = target["archive"] + if not url_exists(url): + errors.append(f"Binary archive URL not accessible for {platform}: {url}") + + # Check npm package URLs (registry.npmjs.org) + seen_npm = set() + for dist_type in ("npx",): + if dist_type in distribution: + package = distribution[dist_type].get("package", "") + pkg_name = extract_npm_package_name(package) + if pkg_name and pkg_name not in seen_npm: + seen_npm.add(pkg_name) + npm_url = f"https://registry.npmjs.org/{pkg_name}" + if not url_exists(npm_url): + errors.append(f"npm package not found: {pkg_name}") + + # Check PyPI package URLs + if "uvx" in distribution: + package = distribution["uvx"].get("package", "") + pkg_name = extract_pypi_package_name(package) + pypi_url = f"https://pypi.org/pypi/{pkg_name}/json" + if not url_exists(pypi_url): + errors.append(f"PyPI package not found: {pkg_name}") + + return errors + + def load_quarantine(registry_dir: Path) -> dict[str, str]: """Load quarantine list from registry directory. 
diff --git a/.github/workflows/tests/test_registry_utils.py b/.github/workflows/tests/test_registry_utils.py index 988b249a..22a5699f 100644 --- a/.github/workflows/tests/test_registry_utils.py +++ b/.github/workflows/tests/test_registry_utils.py @@ -3,6 +3,7 @@ import json import tempfile from pathlib import Path +from unittest.mock import patch from registry_utils import ( extract_npm_package_name, @@ -11,6 +12,7 @@ load_quarantine, normalize_version, should_skip_dir, + validate_distribution_urls, ) @@ -98,6 +100,72 @@ def test_invalid_json(self): assert load_quarantine(Path(d)) == {} +class TestValidateDistributionUrls: + """Distribution URL validation (moved from build_registry.py).""" + + def test_returns_no_errors_when_all_urls_reachable(self): + distribution = { + "binary": { + "darwin-aarch64": { + "archive": "https://example.com/agent-darwin.tar.gz", + "cmd": "./agent", + }, + "linux-x86_64": { + "archive": "https://example.com/agent-linux.tar.gz", + "cmd": "./agent", + }, + } + } + with patch("registry_utils.url_exists", return_value=True): + assert validate_distribution_urls(distribution) == [] + + def test_reports_unreachable_binary_archive(self): + distribution = { + "binary": { + "darwin-aarch64": { + "archive": "https://example.com/ok.tar.gz", + "cmd": "./agent", + }, + "windows-x86_64": { + "archive": "https://example.com/missing-windows.zip", + "cmd": "agent.exe", + }, + } + } + + def fake_url_exists(url, *_args, **_kwargs): + return "missing" not in url + + with patch("registry_utils.url_exists", side_effect=fake_url_exists): + errors = validate_distribution_urls(distribution) + + assert len(errors) == 1 + assert "windows-x86_64" in errors[0] + assert "missing-windows.zip" in errors[0] + + def test_skip_url_validation_env_returns_empty(self, monkeypatch): + monkeypatch.setenv("SKIP_URL_VALIDATION", "1") + # Re-import to pick up the patched env var. 
+ import importlib + + import registry_utils + + importlib.reload(registry_utils) + try: + distribution = { + "binary": { + "darwin-aarch64": { + "archive": "https://example.com/anything.tar.gz", + "cmd": "./agent", + } + } + } + assert registry_utils.validate_distribution_urls(distribution) == [] + finally: + monkeypatch.delenv("SKIP_URL_VALIDATION", raising=False) + importlib.reload(registry_utils) + + class TestShouldSkipDir: def test_skips_hidden_runtime_dirs(self): assert should_skip_dir(".sandbox") diff --git a/.github/workflows/tests/test_update_versions.py b/.github/workflows/tests/test_update_versions.py index a6b17181..4346a7a7 100644 --- a/.github/workflows/tests/test_update_versions.py +++ b/.github/workflows/tests/test_update_versions.py @@ -1,12 +1,20 @@ """Tests for update_versions.py.""" +import json +import tempfile import urllib.error from pathlib import Path from unittest.mock import patch import pytest -from update_versions import check_agent_version, is_prerelease, make_request +from update_versions import ( + VersionUpdate, + apply_update, + check_agent_version, + is_prerelease, + make_request, +) class TestMakeRequestServerErrors: @@ -266,3 +274,95 @@ def test_returns_error_when_sources_have_no_common_version( assert update is None assert error is not None assert error.error == "Version mismatch across distributions: binary=7.2.1, npx=7.2.4" + + +class TestApplyUpdateUrlValidation: + """apply_update reverts and reports skipped when new URLs aren't reachable. + + Regression test for the failure mode where an upstream stops publishing one + platform's binary at a new version (e.g. vtcode 0.105.5 dropped Windows zip), + which previously caused the entire hourly auto-update workflow to fail. 
+ """ + + def _write_agent(self, tmpdir: Path, agent_data: dict) -> Path: + agent_dir = tmpdir / agent_data["id"] + agent_dir.mkdir() + path = agent_dir / "agent.json" + path.write_text(json.dumps(agent_data, indent=2) + "\n") + return path + + def _build_update(self, path: Path, agent_data: dict, new_version: str) -> VersionUpdate: + return VersionUpdate( + agent_id=agent_data["id"], + agent_path=path, + current_version=agent_data["version"], + latest_version=new_version, + distribution_type="binary", + source_url="https://github.com/owner/repo", + ) + + def test_reverts_when_new_binary_url_missing(self): + agent_data = { + "id": "vtlike", + "version": "0.96.14", + "distribution": { + "binary": { + "darwin-aarch64": { + "archive": "https://github.com/owner/repo/releases/download/0.96.14/agent-0.96.14-aarch64-apple-darwin.tar.gz", + "cmd": "./agent", + }, + "windows-x86_64": { + "archive": "https://github.com/owner/repo/releases/download/0.96.14/agent-0.96.14-x86_64-pc-windows-msvc.zip", + "cmd": "agent.exe", + }, + } + }, + } + with tempfile.TemporaryDirectory() as d: + tmpdir = Path(d) + path = self._write_agent(tmpdir, agent_data) + original = path.read_text() + update = self._build_update(path, agent_data, "0.105.5") + + # New version's Windows zip 404s; macOS exists. + def fake_url_exists(url, *_args, **_kwargs): + return "windows" not in url + + with patch("registry_utils.url_exists", side_effect=fake_url_exists): + ok, reason = apply_update(update) + + assert ok is False + assert reason is not None + assert "windows-x86_64" in reason + # File must be byte-identical to its pre-update state. 
+ assert path.read_text() == original + + def test_succeeds_when_all_new_urls_reachable(self): + agent_data = { + "id": "okagent", + "version": "1.0.0", + "distribution": { + "binary": { + "darwin-aarch64": { + "archive": "https://github.com/owner/repo/releases/download/1.0.0/agent-1.0.0-aarch64-apple-darwin.tar.gz", + "cmd": "./agent", + }, + } + }, + } + with tempfile.TemporaryDirectory() as d: + tmpdir = Path(d) + path = self._write_agent(tmpdir, agent_data) + update = self._build_update(path, agent_data, "1.1.0") + + with patch("registry_utils.url_exists", return_value=True): + ok, reason = apply_update(update) + + assert ok is True + assert reason is None + written = json.loads(path.read_text()) + assert written["version"] == "1.1.0" + assert ( + written["distribution"]["binary"]["darwin-aarch64"]["archive"] + == "https://github.com/owner/repo/releases/download/1.1.0/agent-1.1.0-aarch64-apple-darwin.tar.gz" + ) diff --git a/.github/workflows/update_versions.py b/.github/workflows/update_versions.py index 141d1b85..88cb3678 100755 --- a/.github/workflows/update_versions.py +++ b/.github/workflows/update_versions.py @@ -31,6 +31,7 @@ extract_pypi_package_name, load_quarantine, should_skip_dir, + validate_distribution_urls, ) @@ -52,6 +53,20 @@ class UpdateError(NamedTuple): error: str +class SkippedUpdate(NamedTuple): + """A planned update that was reverted because its new URLs were not reachable. + + These are not fatal: the workflow keeps the agent at its current version and + continues applying the rest of the updates so a single broken upstream doesn't + block the whole hourly auto-update cycle. 
+ """ + + agent_id: str + current_version: str + latest_version: str + reason: str + + # Directories to scan for agents AGENT_DIRS = [ ".", # Root directory (active agents) @@ -368,14 +383,23 @@ def check_agent_version( ), None -def apply_update(update: VersionUpdate) -> bool: - """Apply a version update to an agent, updating all distribution types.""" +def apply_update(update: VersionUpdate) -> tuple[bool, str | None]: + """Apply a version update to an agent, updating all distribution types. + + Returns: + (True, None): update was applied and validated. + (False, reason): update was applied then reverted because the new + distribution URLs were unreachable. The agent.json is restored to its + pre-update content. ``reason`` describes which URLs failed. + (False, None): hard I/O / parse failure (caller should treat as fatal). + """ try: with open(update.agent_path) as f: - agent_data = json.load(f) + original_content = f.read() + agent_data = json.loads(original_content) except (json.JSONDecodeError, OSError) as e: print(f"Error reading {update.agent_path}: {e}", file=sys.stderr) - return False + return False, None old_version = agent_data["version"] new_version = update.latest_version @@ -428,10 +452,28 @@ def apply_update(update: VersionUpdate) -> bool: with open(update.agent_path, "w") as f: json.dump(agent_data, f, indent=2) f.write("\n") - return True except OSError as e: print(f"Error writing {update.agent_path}: {e}", file=sys.stderr) - return False + return False, None + + # Validate that the new distribution URLs are reachable. If any URL fails + # (e.g. upstream stopped publishing a Windows binary at this version), revert + # to the original content and report the agent as skipped — the caller will + # carry on with the remaining updates. 
+ url_errors = validate_distribution_urls(distribution) + if url_errors: + try: + with open(update.agent_path, "w") as f: + f.write(original_content) + except OSError as e: + print( + f"Error reverting {update.agent_path} after URL check failed: {e}", + file=sys.stderr, + ) + return False, None + return False, "; ".join(url_errors) + + return True, None def main(): @@ -495,6 +537,51 @@ def main(): if not args.json: print(f"OK ({agent_data.get('version', 'unknown')})") + skipped: list[SkippedUpdate] = [] + + # Apply updates if requested. We do this before assembling the JSON output so + # the JSON can report agents that were reverted because their new URLs 404'd. + if args.apply and updates: + print() + print("Applying updates...") + applied = 0 + failed = 0 + kept_updates: list[VersionUpdate] = [] + for update in updates: + if not args.json: + print(f" Updating {update.agent_id}...", end=" ", flush=True) + ok, skip_reason = apply_update(update) + if ok: + applied += 1 + kept_updates.append(update) + if not args.json: + print("OK") + elif skip_reason: + skipped.append( + SkippedUpdate( + agent_id=update.agent_id, + current_version=update.current_version, + latest_version=update.latest_version, + reason=skip_reason, + ) + ) + if not args.json: + print(f"SKIPPED ({skip_reason})") + else: + failed += 1 + if not args.json: + print("FAILED") + + # Drop reverted agents from the canonical update list so downstream + # consumers (commit message, verify-agents) only see what actually changed. 
+ updates = kept_updates + + print() + summary = f"Applied {applied} updates, {failed} failed" + if skipped: + summary += f", {len(skipped)} skipped (broken upstream URLs)" + print(summary) + # Output results if args.json: result = { @@ -510,15 +597,28 @@ def main(): for u in updates ], "errors": [{"agent_id": e.agent_id, "error": e.error} for e in errors], + "skipped": [ + { + "agent_id": s.agent_id, + "current_version": s.current_version, + "latest_version": s.latest_version, + "reason": s.reason, + } + for s in skipped + ], "up_to_date": up_to_date, } print(json.dumps(result, indent=2)) else: print() print("=" * 60) - print( - f"Summary: {len(updates)} updates, {len(errors)} errors, {len(up_to_date)} up-to-date" + summary_line = ( + f"Summary: {len(updates)} updates, {len(errors)} errors, " + f"{len(up_to_date)} up-to-date" ) + if skipped: + summary_line += f", {len(skipped)} skipped" + print(summary_line) if updates: print() @@ -529,36 +629,24 @@ def main(): f"{u.latest_version} ({u.distribution_type})" ) + if skipped: + print() + print("Skipped (broken upstream URLs, kept at current version):") + for s in skipped: + print( + f" - {s.agent_id}: {s.current_version} -> " + f"{s.latest_version}: {s.reason}" + ) + if errors: print() print("Errors:") for e in errors: print(f" - {e.agent_id}: {e.error}") - # Apply updates if requested - if args.apply and updates: - print() - print("Applying updates...") - applied = 0 - failed = 0 - for update in updates: - if not args.json: - print(f" Updating {update.agent_id}...", end=" ", flush=True) - if apply_update(update): - applied += 1 - if not args.json: - print("OK") - else: - failed += 1 - if not args.json: - print("FAILED") - - print() - print(f"Applied {applied} updates, {failed} failed") - - # Exit with error if any updates failed - if failed > 0: - sys.exit(1) + # Exit with error if any apply attempts had hard I/O failures. 
+ if args.apply and "failed" in locals() and failed > 0: + sys.exit(1) # Exit with special code if updates are available (for CI) if updates and not args.apply: