From 461650464a232c1a285c4b3f2557f10ca439fd85 Mon Sep 17 00:00:00 2001 From: Trey Date: Fri, 14 Nov 2025 12:10:42 -0800 Subject: [PATCH 1/2] Docs to vMCP composite workflows Adds docs for https://github.com/stacklok/toolhive/pull/2592 --- docs/operator/advanced-workflow-patterns.md | 797 ++++++++++++++++++ .../composite-tools-quick-reference.md | 233 +++++ 2 files changed, 1030 insertions(+) create mode 100644 docs/operator/advanced-workflow-patterns.md create mode 100644 docs/operator/composite-tools-quick-reference.md diff --git a/docs/operator/advanced-workflow-patterns.md b/docs/operator/advanced-workflow-patterns.md new file mode 100644 index 000000000..b942d027d --- /dev/null +++ b/docs/operator/advanced-workflow-patterns.md @@ -0,0 +1,797 @@ +# Advanced Workflow Patterns for Virtual MCP Composite Tools + +## Overview + +This guide covers advanced workflow patterns and best practices for Virtual MCP Composite Tools, including parallel execution, dependency management, error handling strategies, and state management. + +## Table of Contents + +- [Parallel Execution with DAG](#parallel-execution-with-dag) +- [Step Dependencies](#step-dependencies) +- [Advanced Error Handling](#advanced-error-handling) +- [Workflow State Management](#workflow-state-management) +- [Performance Optimization](#performance-optimization) +- [Best Practices](#best-practices) +- [Common Patterns](#common-patterns) + +--- + +## Parallel Execution with DAG + +Virtual MCP Composite Tools use a Directed Acyclic Graph (DAG) execution model that automatically executes independent steps in parallel while respecting dependencies. + +### How DAG Execution Works + +1. **Execution Levels**: Steps are organized into levels based on dependencies +2. **Parallel Within Levels**: All steps in the same level execute concurrently +3. **Sequential Across Levels**: Each level waits for the previous level to complete +4. 
**Automatic Optimization**: The system automatically determines optimal parallelization + +### Example: Parallel Data Fetching + +```yaml +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: VirtualMCPCompositeToolDefinition +metadata: + name: incident-investigation +spec: + name: investigate_incident + description: Investigate incident by gathering logs, metrics, and traces in parallel + parameters: + schema: + type: object + properties: + incident_id: + type: string + time_range: + type: string + steps: + # Level 1: These three steps run in parallel (no dependencies) + - id: fetch_logs + type: tool + tool: splunk.fetch_logs + arguments: + incident_id: "{{.params.incident_id}}" + time_range: "{{.params.time_range}}" + + - id: fetch_metrics + type: tool + tool: datadog.fetch_metrics + arguments: + incident_id: "{{.params.incident_id}}" + time_range: "{{.params.time_range}}" + + - id: fetch_traces + type: tool + tool: jaeger.fetch_traces + arguments: + incident_id: "{{.params.incident_id}}" + time_range: "{{.params.time_range}}" + + # Level 2: Waits for all Level 1 steps to complete + - id: correlate + type: tool + tool: analysis.correlate_data + depends_on: [fetch_logs, fetch_metrics, fetch_traces] + arguments: + logs: "{{.steps.fetch_logs.output}}" + metrics: "{{.steps.fetch_metrics.output}}" + traces: "{{.steps.fetch_traces.output}}" + + # Level 3: Waits for Level 2 + - id: create_report + type: tool + tool: jira.create_issue + depends_on: [correlate] + arguments: + title: "Incident {{.params.incident_id}} Analysis" + body: "{{.steps.correlate.output.summary}}" +``` + +**Execution Timeline**: +``` +Time Level 1 (Parallel) Level 2 Level 3 +0ms fetch_logs ─┐ +0ms fetch_metrics ─┼─> correlate ──> create_report +0ms fetch_traces ─┘ +``` + +**Performance**: Fetching 3 data sources takes ~1x time instead of 3x (sequential). + +--- + +## Step Dependencies + +Use the `depends_on` field to define explicit dependencies between steps. + +### Syntax + +```yaml +steps: + - id: step_name + depends_on: [dependency1, dependency2, ...] + # ... rest of step config +``` + +### Dependency Rules + +1. **Multiple Dependencies**: Step waits for ALL dependencies to complete +2. **Transitive Dependencies**: Automatically handled (A→B→C works as expected) +3. **Cycle Detection**: Circular dependencies are detected and rejected at validation time +4. 
**Missing Dependencies**: Referencing non-existent steps fails validation + +### Example: Diamond Pattern + +```yaml +steps: + # Level 1 + - id: fetch_data + type: tool + tool: api.fetch + + # Level 2: Both depend on fetch_data, can run in parallel + - id: process_left + type: tool + tool: transform.left + depends_on: [fetch_data] + + - id: process_right + type: tool + tool: transform.right + depends_on: [fetch_data] + + # Level 3: Waits for both Level 2 steps + - id: merge_results + type: tool + tool: combine.merge + depends_on: [process_left, process_right] +``` + +**Execution Graph**: +``` + fetch_data + / \ +process_left process_right + \ / + merge_results +``` + +### Accessing Dependency Outputs + +Use template syntax to access outputs from dependencies: + +```yaml +- id: analyze + depends_on: [fetch_logs, fetch_metrics] + arguments: + # Access specific fields from dependency outputs + log_count: "{{.steps.fetch_logs.output.count}}" + metric_avg: "{{.steps.fetch_metrics.output.average}}" + + # Pass entire output object + raw_data: "{{.steps.fetch_logs.output}}" +``` + +--- + +## Advanced Error Handling + +Configure sophisticated error handling at both workflow and step levels. + +### Workflow-Level Failure Modes + +Set the workflow's `failureMode` to control global error behavior: + +```yaml +spec: + name: resilient_workflow + failureMode: continue # Options: abort, continue, best_effort + steps: + # ... +``` + +**Failure Modes**: + +| Mode | Behavior | Use Case | +|------|----------|----------| +| `abort` | Stop immediately on first error (default) | Critical workflows where partial completion is dangerous | +| `continue` | Log errors but continue executing remaining steps | Data collection where some failures are acceptable | +| `best_effort` | Try all steps, aggregate errors at end | Monitoring/reporting where you want maximum data | + +### Step-Level Error Handling + +Override workflow-level behavior for specific steps: + +```yaml +steps: + - id: optional_notification + type: tool + tool: slack.notify + on_error: + action: continue_on_error # Don't fail workflow if Slack is down + + - id: critical_payment + type: tool + tool: stripe.charge + # Inherits workflow failureMode (defaults to abort) +``` + +### Retry Logic with Exponential Backoff + +Configure automatic retries for transient failures: + +```yaml +steps: + - id: fetch_external_api + type: tool + tool: external.fetch_data + on_error: + action: retry + retry_count: 3 # Maximum 3 retries (4 total attempts) + retry_delay: 1s # Initial delay: 1 second + # Exponential backoff: 1s, 2s, 4s +``` + +**Retry Behavior**: +- **Exponential Backoff**: Delay doubles each retry (1s → 2s → 4s → 8s...) 
+- **Maximum Retries**: Capped at 10 (configurable per step) +- **Context Aware**: Respects workflow timeout (won't retry if timeout exceeded) +- **Error Propagation**: Final error includes retry count in metadata + +### Example: Combining Error Strategies + +```yaml +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: VirtualMCPCompositeToolDefinition +metadata: + name: robust-deployment +spec: + name: deploy_with_resilience + failureMode: abort # Fail fast by default + steps: + # Retry transient network issues + - id: fetch_artifact + type: tool + tool: s3.download + on_error: + action: retry + retry_count: 3 + retry_delay: 2s + + - id: deploy + type: tool + tool: kubernetes.apply + depends_on: [fetch_artifact] + # Critical: uses workflow failureMode (abort) + + # Optional post-deployment tasks + - id: notify_slack + type: tool + tool: slack.notify + depends_on: [deploy] + on_error: + action: continue_on_error # Don't fail if notification fails + + - id: update_dashboard + type: tool + tool: grafana.update + depends_on: [deploy] + on_error: + action: continue_on_error +``` + +--- + +## Workflow State Management + +Virtual MCP tracks workflow execution state for monitoring, debugging, and cancellation. + +### State Tracking + +The workflow engine automatically maintains state including: + +- **Workflow ID**: Unique identifier (UUID) for each execution +- **Status**: Current state (pending, running, completed, failed, cancelled, timed_out) +- **Completed Steps**: List of successfully completed steps +- **Step Results**: Outputs and timing for each step +- **Pending Elicitations**: User interactions awaiting response +- **Timestamps**: Start time, end time, last update time + +### Workflow Timeout + +Configure maximum execution time to prevent runaway workflows: + +```yaml +spec: + name: time_sensitive_workflow + timeout: 30m # 30 minutes maximum + steps: + - id: long_running_task + type: tool + tool: data.process + timeout: 5m # Individual step timeout +``` + +**Timeout Behavior**: +- Workflow timeout applies to entire execution +- Step timeouts apply to individual steps +- Timeouts trigger graceful cancellation (context.DeadlineExceeded) +- State is saved with `timed_out` status + +**Timeout Precedence**: +``` +Workflow Timeout: 30m + ├─ Step 1 (5m timeout) ✓ Respects both + ├─ Step 2 (10m timeout) ✓ Respects both + └─ Step 3 (40m timeout) ✗ Limited by workflow timeout +``` + +### State Persistence + +**In-Memory State Store** (Default): +- Suitable for single-instance deployments +- Automatic cleanup of completed workflows (configurable) +- Thread-safe for parallel step execution +- Workflow status available via API + +**Future: Distributed State Store** (Redis/Database): +- For multi-instance deployments +- Workflow resumption after restart +- Cross-instance workflow visibility + +### Example: Monitoring Workflow State + +```yaml +# Query workflow status (future CLI support) +$ thv workflow status + +Workflow ID: a1b2c3d4-e5f6-7890-abcd-ef1234567890 +Status: running +Started: 2025-01-15 10:30:00 +Duration: 2m 15s + +Completed Steps: + ✓ fetch_logs (1.2s) + ✓ fetch_metrics (0.8s) + ✓ fetch_traces (1.5s) + +In Progress: + ⏳ correlate + +Pending: + ⋯ create_report +``` + +--- + +## Performance Optimization + +### Concurrency Limits + +The DAG executor limits parallel execution to prevent resource exhaustion: + +```go +// Default: 10 concurrent steps maximum +// Configurable in workflow engine initialization +``` + +**Tuning Recommendations**: +- **I/O-bound workflows**: Higher concurrency 
(10-20 steps) +- **CPU-bound workflows**: Lower concurrency (2-5 steps) +- **Memory-intensive**: Monitor and adjust based on capacity + +### Execution Statistics + +The system tracks execution metrics: + +```go +stats := { + "total_levels": 3, // Number of execution levels + "total_steps": 8, // Total steps in workflow + "max_parallelism": 3, // Max steps in any level + "sequential_steps": 2, // Steps that run alone +} +``` + +### Optimization Strategies + +1. **Minimize Dependencies**: Reduce `depends_on` where possible +2. **Group Related Steps**: Steps with similar execution time work well in same level +3. **Split Large Steps**: Break monolithic steps into parallel sub-steps +4. **Use Conditional Execution**: Skip unnecessary steps with `condition` field + +**Example: Optimized Data Pipeline** + +```yaml +# Before: Sequential (9 seconds total) +steps: + - id: fetch1 # 3s + - id: fetch2 # 3s + - id: fetch3 # 3s + +# After: Parallel (3 seconds total) +steps: + - id: fetch1 # 3s ─┐ + - id: fetch2 # 3s ─┼─ All run in parallel + - id: fetch3 # 3s ─┘ +``` + +--- + +## Best Practices + +### 1. Design for Parallelism + +✅ **DO**: Identify independent operations +```yaml +steps: + - id: notify_slack + - id: notify_email + - id: notify_pagerduty + # All independent, run in parallel +``` + +❌ **DON'T**: Create unnecessary dependencies +```yaml +steps: + - id: notify_slack + - id: notify_email + depends_on: [notify_slack] # Unnecessary! + - id: notify_pagerduty + depends_on: [notify_email] # Creates false sequencing +``` + +### 2. Declare All Dependencies Explicitly + +✅ **DO**: Be explicit about data dependencies +```yaml +- id: aggregate + depends_on: [fetch_logs, fetch_metrics] # Clear intent + arguments: + logs: "{{.steps.fetch_logs.output}}" + metrics: "{{.steps.fetch_metrics.output}}" +``` + +❌ **DON'T**: Rely on implicit ordering +```yaml +# This will fail! process_data tries to access fetch_data output, +# but they run in parallel without depends_on +- id: fetch_data + type: tool + tool: api.fetch + +- id: process_data # ERROR: fetch_data may not have completed! + type: tool + tool: transform.process + arguments: + data: "{{.steps.fetch_data.output}}" +``` + +### 3. Use Appropriate Error Handling + +✅ **DO**: Match error handling to business requirements +```yaml +steps: + # Critical: must succeed + - id: charge_payment + type: tool + tool: stripe.charge + # Uses default abort behavior + + # Optional: nice to have + - id: send_receipt + type: tool + tool: email.send + depends_on: [charge_payment] + on_error: + action: continue_on_error +``` + +### 4. Set Realistic Timeouts + +✅ **DO**: Set timeouts based on SLAs +```yaml +spec: + timeout: 5m # API SLA: 5 minutes + steps: + - id: external_api + timeout: 30s # Individual operation: 30 seconds + on_error: + action: retry + retry_count: 3 +``` + +### 5. Keep Steps Focused + +✅ **DO**: One responsibility per step +```yaml +steps: + - id: fetch_user + tool: db.query_user + - id: validate_permissions + tool: auth.check_permissions + depends_on: [fetch_user] + - id: perform_action + tool: api.execute + depends_on: [validate_permissions] +``` + +❌ **DON'T**: Combine unrelated operations +```yaml +steps: + - id: do_everything + tool: monolith.execute # Hard to parallelize, test, debug +``` + +--- + +## Common Patterns + +### Pattern 1: Fan-Out / Fan-In + +Parallel execution followed by aggregation. 
+ +```yaml +steps: + # Fan-out: Parallel data collection + - id: fetch_source_a + type: tool + tool: api.fetch_a + + - id: fetch_source_b + type: tool + tool: api.fetch_b + + - id: fetch_source_c + type: tool + tool: api.fetch_c + + # Fan-in: Aggregate results + - id: aggregate + type: tool + tool: analysis.combine + depends_on: [fetch_source_a, fetch_source_b, fetch_source_c] +``` + +**Use Cases**: Data aggregation, multi-source reporting, distributed search + +### Pattern 2: Pipeline with Parallel Stages + +Sequential stages with parallel operations within each stage. + +```yaml +steps: + # Stage 1: Fetch raw data + - id: fetch + type: tool + tool: api.fetch + + # Stage 2: Parallel transformations + - id: transform_format_a + type: tool + tool: transform.to_format_a + depends_on: [fetch] + + - id: transform_format_b + type: tool + tool: transform.to_format_b + depends_on: [fetch] + + # Stage 3: Parallel storage + - id: store_warehouse + type: tool + tool: warehouse.store + depends_on: [transform_format_a] + + - id: store_cache + type: tool + tool: cache.store + depends_on: [transform_format_b] +``` + +**Use Cases**: ETL pipelines, data transformation, multi-target deployments + +### Pattern 3: Conditional Parallel Execution + +Use conditions to selectively enable parallel branches. + +```yaml +steps: + - id: fetch_user + type: tool + tool: db.query_user + + # Parallel conditional branches + - id: notify_slack + type: tool + tool: slack.notify + depends_on: [fetch_user] + condition: "{{.steps.fetch_user.output.preferences.slack_enabled}}" + + - id: notify_email + type: tool + tool: email.send + depends_on: [fetch_user] + condition: "{{.steps.fetch_user.output.preferences.email_enabled}}" + + - id: notify_sms + type: tool + tool: sms.send + depends_on: [fetch_user] + condition: "{{.steps.fetch_user.output.preferences.sms_enabled}}" +``` + +**Use Cases**: Multi-channel notifications, feature flags, A/B testing + +### Pattern 4: Retry with Fallback + +Try primary service, retry on failure, fall back to secondary. + +```yaml +steps: + - id: try_primary + type: tool + tool: primary_api.call + on_error: + action: retry + retry_count: 2 + retry_delay: 1s + + - id: use_fallback + type: tool + tool: fallback_api.call + depends_on: [try_primary] + condition: "{{ne .steps.try_primary.status \"completed\"}}" +``` + +**Use Cases**: High availability, disaster recovery, service degradation + +### Pattern 5: Parallel Validation + +Validate multiple aspects concurrently before proceeding. + +```yaml +steps: + # Parallel validations + - id: validate_schema + type: tool + tool: validation.check_schema + + - id: validate_permissions + type: tool + tool: auth.check_permissions + + - id: validate_quota + type: tool + tool: billing.check_quota + + # Proceed only if all validations pass + - id: execute_action + type: tool + tool: api.execute + depends_on: [validate_schema, validate_permissions, validate_quota] +``` + +**Use Cases**: Pre-flight checks, authorization, resource validation + +--- + +## Troubleshooting + +### Debugging Parallel Execution + +**Problem**: Step fails with "output not found" error + +**Solution**: Add dependency to ensure step completes first +```yaml +# Before (broken) +- id: process + arguments: + data: "{{.steps.fetch.output}}" # May run before fetch completes! 
+ +# After (fixed) +- id: process + depends_on: [fetch] # Explicit dependency + arguments: + data: "{{.steps.fetch.output}}" +``` + +### Detecting Circular Dependencies + +**Problem**: Workflow validation fails with "circular dependency detected" + +**Solution**: Review `depends_on` chains for cycles +```yaml +# Circular dependency (invalid) +- id: step_a + depends_on: [step_b] +- id: step_b + depends_on: [step_a] # ❌ Cycle! + +# Fixed (valid) +- id: step_a +- id: step_b + depends_on: [step_a] # ✓ Linear dependency +``` + +### Performance Issues + +**Problem**: Workflow slower than expected despite parallel execution + +**Checklist**: +1. Verify steps actually run in parallel (check execution levels) +2. Check for unnecessary `depends_on` constraints +3. Review concurrency limits (may be throttling) +4. Profile individual step execution times +5. Consider network/external service bottlenecks + +--- + +## Migration from Sequential to Parallel + +If you have existing sequential workflows, here's how to migrate: + +### Step 1: Identify Independent Steps + +Review your workflow and identify steps that: +- Don't use outputs from other steps +- Access different external services +- Perform independent validations or checks + +### Step 2: Remove Unnecessary Dependencies + +```yaml +# Before: Implicit sequential execution +steps: + - id: step1 + - id: step2 + - id: step3 + +# After: Explicit independence (parallel) +steps: + - id: step1 # No depends_on = runs in parallel + - id: step2 # No depends_on = runs in parallel + - id: step3 # No depends_on = runs in parallel +``` + +### Step 3: Add Required Dependencies + +```yaml +# If step3 actually needs step1's output: +steps: + - id: step1 + - id: step2 + - id: step3 + depends_on: [step1] # Explicit data dependency + arguments: + data: "{{.steps.step1.output}}" +``` + +### Step 4: Test Incrementally + +1. Start with one parallel group +2. Validate outputs and timing +3. Gradually parallelize more steps +4. Monitor for race conditions or dependency issues + +--- + +## Additional Resources + +- [VirtualMCPCompositeToolDefinition Guide](virtualmcpcompositetooldefinition-guide.md) - Basic workflow concepts +- [Architecture Documentation](../arch/README.md) - System architecture and design +- [Operator Guide](deploying-mcp-server-with-operator.md) - Kubernetes deployment + +--- + +## Summary + +Key takeaways for advanced workflows: + +1. ✅ **Embrace Parallelism**: Design workflows for concurrent execution +2. ✅ **Explicit Dependencies**: Always declare data dependencies with `depends_on` +3. ✅ **Error Resilience**: Use retry for transient failures, continue for optional steps +4. ✅ **Set Timeouts**: Prevent runaway workflows with appropriate timeouts +5. ✅ **Monitor State**: Track workflow execution for debugging and optimization + +The DAG execution model provides automatic parallelization while maintaining correctness through dependency management. Follow these patterns and practices to build efficient, reliable, and maintainable workflows. diff --git a/docs/operator/composite-tools-quick-reference.md b/docs/operator/composite-tools-quick-reference.md new file mode 100644 index 000000000..e057b4e65 --- /dev/null +++ b/docs/operator/composite-tools-quick-reference.md @@ -0,0 +1,233 @@ +# Composite Tools Quick Reference + +Quick reference for Virtual MCP Composite Tool workflows. 
+ +## Basic Workflow Structure + +```yaml +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: VirtualMCPCompositeToolDefinition +metadata: + name: my-workflow + namespace: default +spec: + name: my_workflow_name # Tool name exposed to clients + description: What it does # Required description + timeout: 30m # Optional: workflow timeout (default: 30m) + failureMode: abort # Optional: abort|continue|best_effort (default: abort) + + parameters: # Optional: input parameters + schema: + type: object + properties: + param_name: + type: string + + steps: # Required: workflow steps + - id: step1 + type: tool # tool|elicitation + tool: workload.tool_name + arguments: + key: "{{.params.param_name}}" +``` + +## Parallel Execution + +```yaml +# Independent steps run in parallel automatically +steps: + - id: fetch_a # Level 1: Runs in parallel ─┐ + - id: fetch_b # Level 1: Runs in parallel ─┼─> aggregate + - id: fetch_c # Level 1: Runs in parallel ─┘ + + - id: aggregate # Level 2: Waits for Level 1 + depends_on: [fetch_a, fetch_b, fetch_c] +``` + +## Step Dependencies + +```yaml +steps: + - id: step1 + + - id: step2 + depends_on: [step1] # Runs after step1 completes + + - id: step3 + depends_on: [step1, step2] # Waits for both step1 AND step2 +``` + +## Template Syntax + +```yaml +# Access input parameters +"{{.params.parameter_name}}" + +# Access step outputs +"{{.steps.step_id.output}}" +"{{.steps.step_id.output.field_name}}" +"{{.steps.step_id.status}}" # completed|failed|skipped|running + +# Conditional logic +condition: "{{eq .steps.step1.status \"completed\"}}" +condition: "{{.params.enabled}}" +condition: "{{gt .steps.step1.output.count 10}}" + +# JSON encoding +arguments: + data: "{{json .steps.step1.output}}" +``` + +## Error Handling + +### Workflow-Level + +```yaml +spec: + failureMode: abort # Stop on first error (default) + failureMode: continue # Log errors, continue workflow + failureMode: best_effort # Try all steps, aggregate errors +``` + +### Step-Level (Overrides Workflow) + +```yaml +steps: + # Abort on error (default) + - id: critical + tool: payment.charge + # Uses workflow failureMode + + # Continue despite errors + - id: optional + tool: notification.send + on_error: + action: continue_on_error + + # Retry with exponential backoff + - id: resilient + tool: external.api + on_error: + action: retry + retry_count: 3 # Max 3 retries (4 total attempts) + retry_delay: 1s # Initial delay: 1s, 2s, 4s, 8s... 
+``` + +## Timeouts + +```yaml +spec: + timeout: 30m # Workflow timeout (default: 30m) + + steps: + - id: step1 + timeout: 5m # Step timeout (default: 5m) +``` + +**Precedence**: Step timeout ≤ Workflow timeout + +## Common Patterns + +### Fan-Out / Fan-In + +```yaml +steps: + # Fan-out: Parallel collection + - id: fetch_1 + - id: fetch_2 + - id: fetch_3 + + # Fan-in: Aggregate + - id: combine + depends_on: [fetch_1, fetch_2, fetch_3] +``` + +### Sequential Pipeline + +```yaml +steps: + - id: fetch + - id: transform + depends_on: [fetch] + - id: store + depends_on: [transform] +``` + +### Diamond Pattern + +```yaml +steps: + - id: fetch + + - id: process_a + depends_on: [fetch] + - id: process_b + depends_on: [fetch] + + - id: merge + depends_on: [process_a, process_b] +``` + +### Retry with Fallback + +```yaml +steps: + - id: try_primary + tool: primary.api + on_error: + action: retry + retry_count: 2 + + - id: use_fallback + tool: secondary.api + depends_on: [try_primary] + condition: "{{ne .steps.try_primary.status \"completed\"}}" +``` + +## Validation Rules + +- ✅ Workflow name: `^[a-z0-9]([a-z0-9_-]*[a-z0-9])?$` (1-64 chars) +- ✅ Step IDs must be unique +- ✅ All `depends_on` step IDs must exist +- ✅ No circular dependencies +- ✅ Tool format: `workload_id.tool_name` +- ✅ Max retry count: 10 +- ✅ Max workflow steps: 100 + +## Debugging + +### Check Workflow Status + +```yaml +# In VirtualMCPCompositeToolDefinition +status: + validationStatus: Valid|Invalid + validationErrors: + - "error message here" + referencedBy: + - namespace: default + name: vmcp-server-1 +``` + +### Common Issues + +| Error | Cause | Fix | +|-------|-------|-----| +| "output not found" | Missing `depends_on` | Add dependency | +| "circular dependency" | Cycle in `depends_on` | Remove cycle | +| "tool not found" | Invalid tool reference | Check `workload.tool` format | +| "template error" | Invalid Go template | Fix template syntax | + +## Performance Tips + +1. ✅ Remove unnecessary `depends_on` constraints +2. ✅ Group related steps in same execution level +3. ✅ Set realistic timeouts based on SLAs +4. ✅ Use retry for transient failures only +5. 
✅ Keep steps focused (one responsibility) + +## Links + +- [Detailed Guide](virtualmcpcompositetooldefinition-guide.md) +- [Advanced Patterns](advanced-workflow-patterns.md) +- [Operator Installation](deploying-toolhive-operator.md) From 4647eba31142a824124ea2bff47e54277b7361e1 Mon Sep 17 00:00:00 2001 From: Trey Date: Mon, 17 Nov 2025 08:29:48 -0800 Subject: [PATCH 2/2] Address PR feedback --- docs/operator/advanced-workflow-patterns.md | 174 ++++++++++-------- .../composite-tools-quick-reference.md | 91 ++++++--- 2 files changed, 159 insertions(+), 106 deletions(-) diff --git a/docs/operator/advanced-workflow-patterns.md b/docs/operator/advanced-workflow-patterns.md index b942d027d..ef9abb3c7 100644 --- a/docs/operator/advanced-workflow-patterns.md +++ b/docs/operator/advanced-workflow-patterns.md @@ -38,13 +38,14 @@ spec: name: investigate_incident description: Investigate incident by gathering logs, metrics, and traces in parallel parameters: - schema: - type: object - properties: - incident_id: - type: string - time_range: - type: string + incident_id: + type: string + description: The incident identifier + required: true + time_range: + type: string + description: Time range for data collection + required: true steps: # Level 1: These three steps run in parallel (no dependencies) - id: fetch_logs @@ -72,7 +73,7 @@ spec: - id: correlate type: tool tool: analysis.correlate_data - depends_on: [fetch_logs, fetch_metrics, fetch_traces] + dependsOn: [fetch_logs, fetch_metrics, fetch_traces] arguments: logs: "{{.steps.fetch_logs.output}}" metrics: "{{.steps.fetch_metrics.output}}" @@ -82,7 +83,7 @@ spec: - id: create_report type: tool tool: jira.create_issue - depends_on: [correlate] + dependsOn: [correlate] arguments: title: "Incident {{.params.incident_id}} Analysis" body: "{{.steps.correlate.output.summary}}" @@ -102,14 +103,14 @@ Time Level 1 (Parallel) Level 2 Level 3 ## Step Dependencies -Use the `depends_on` field to define explicit dependencies between steps. +Use the `dependsOn` field to define explicit dependencies between steps. ### Syntax ```yaml steps: - id: step_name - depends_on: [dependency1, dependency2, ...] + dependsOn: [dependency1, dependency2, ...] # ... 
rest of step config ``` @@ -133,18 +134,18 @@ steps: - id: process_left type: tool tool: transform.left - depends_on: [fetch_data] + dependsOn: [fetch_data] - id: process_right type: tool tool: transform.right - depends_on: [fetch_data] + dependsOn: [fetch_data] # Level 3: Waits for both Level 2 steps - id: merge_results type: tool tool: combine.merge - depends_on: [process_left, process_right] + dependsOn: [process_left, process_right] ``` **Execution Graph**: @@ -162,7 +163,7 @@ Use template syntax to access outputs from dependencies: ```yaml - id: analyze - depends_on: [fetch_logs, fetch_metrics] + dependsOn: [fetch_logs, fetch_metrics] arguments: # Access specific fields from dependency outputs log_count: "{{.steps.fetch_logs.output.count}}" @@ -172,6 +173,33 @@ Use template syntax to access outputs from dependencies: raw_data: "{{.steps.fetch_logs.output}}" ``` +### Template System Overview + +Workflows use Go's [text/template](https://pkg.go.dev/text/template) with these additional context variables and functions: + +**Context Variables**: +- `.params.*` - Input parameters +- `.steps..output` - Step outputs +- `.steps..status` - Step status (completed, failed, skipped, running) +- `.steps..error` - Step error messages (if failed) +- `.vars.*` - Workflow-scoped variables + +**Custom Functions**: +- `json` - JSON encode a value +- `quote` - Quote a string value + +**Built-in Functions**: All Go template built-ins are available (`eq`, `ne`, `lt`, `le`, `gt`, `ge`, `and`, `or`, `not`, `index`, `len`, `range`, `with`, `printf`, etc.) + +**Example with Advanced Features**: +```yaml +- id: conditional_step + dependsOn: [fetch_data] + condition: "{{and (eq .steps.fetch_data.status \"completed\") (gt (len .steps.fetch_data.output.items) 0)}}" + arguments: + message: "{{printf \"Found %d items\" (len .steps.fetch_data.output.items)}}" + data: "{{json .steps.fetch_data.output}}" +``` + --- ## Advanced Error Handling @@ -208,7 +236,7 @@ steps: type: tool tool: slack.notify on_error: - action: continue_on_error # Don't fail workflow if Slack is down + action: continue # Don't fail workflow if Slack is down - id: critical_payment type: tool @@ -227,13 +255,11 @@ steps: tool: external.fetch_data on_error: action: retry - retry_count: 3 # Maximum 3 retries (4 total attempts) - retry_delay: 1s # Initial delay: 1 second - # Exponential backoff: 1s, 2s, 4s + maxRetries: 3 # Maximum 3 retries (4 total attempts) ``` **Retry Behavior**: -- **Exponential Backoff**: Delay doubles each retry (1s → 2s → 4s → 8s...) 
+- **Exponential Backoff**: Delay increases by 1.5x each retry with ±50% randomization (1s → ~1.5s → ~2.25s → ~3.4s...), capped at 60 seconds - **Maximum Retries**: Capped at 10 (configurable per step) - **Context Aware**: Respects workflow timeout (won't retry if timeout exceeded) - **Error Propagation**: Final error includes retry count in metadata @@ -255,29 +281,28 @@ spec: tool: s3.download on_error: action: retry - retry_count: 3 - retry_delay: 2s + maxRetries: 3 - id: deploy type: tool tool: kubernetes.apply - depends_on: [fetch_artifact] + dependsOn: [fetch_artifact] # Critical: uses workflow failureMode (abort) # Optional post-deployment tasks - id: notify_slack type: tool tool: slack.notify - depends_on: [deploy] + dependsOn: [deploy] on_error: - action: continue_on_error # Don't fail if notification fails + action: continue # Don't fail if notification fails - id: update_dashboard type: tool tool: grafana.update - depends_on: [deploy] + dependsOn: [deploy] on_error: - action: continue_on_error + action: continue ``` --- @@ -332,36 +357,34 @@ Workflow Timeout: 30m - Suitable for single-instance deployments - Automatic cleanup of completed workflows (configurable) - Thread-safe for parallel step execution -- Workflow status available via API +- Workflow status available programmatically via the Composer Go API **Future: Distributed State Store** (Redis/Database): - For multi-instance deployments - Workflow resumption after restart - Cross-instance workflow visibility -### Example: Monitoring Workflow State - -```yaml -# Query workflow status (future CLI support) -$ thv workflow status - -Workflow ID: a1b2c3d4-e5f6-7890-abcd-ef1234567890 -Status: running -Started: 2025-01-15 10:30:00 -Duration: 2m 15s +### Monitoring Workflow State -Completed Steps: - ✓ fetch_logs (1.2s) - ✓ fetch_metrics (0.8s) - ✓ fetch_traces (1.5s) +Workflow status is currently available programmatically through the Composer Go API: -In Progress: - ⏳ correlate +```go +// Get workflow status +status, err := composer.GetWorkflowStatus(ctx, workflowID) +if err != nil { + // Handle error +} -Pending: - ⋯ create_report +// Check workflow state +fmt.Printf("Workflow ID: %s\n", status.WorkflowID) +fmt.Printf("Status: %s\n", status.Status) +fmt.Printf("Started: %s\n", status.StartTime) +fmt.Printf("Duration: %s\n", status.Duration) +fmt.Printf("Completed Steps: %v\n", status.CompletedSteps) ``` +**Note**: HTTP REST API endpoints for external workflow monitoring are planned for a future release. + --- ## Performance Optimization @@ -395,7 +418,7 @@ stats := { ### Optimization Strategies -1. **Minimize Dependencies**: Reduce `depends_on` where possible +1. **Minimize Dependencies**: Reduce `dependsOn` where possible 2. **Group Related Steps**: Steps with similar execution time work well in same level 3. **Split Large Steps**: Break monolithic steps into parallel sub-steps 4. **Use Conditional Execution**: Skip unnecessary steps with `condition` field @@ -436,9 +459,9 @@ steps: steps: - id: notify_slack - id: notify_email - depends_on: [notify_slack] # Unnecessary! + dependsOn: [notify_slack] # Unnecessary! - id: notify_pagerduty - depends_on: [notify_email] # Creates false sequencing + dependsOn: [notify_email] # Creates false sequencing ``` ### 2. 
Declare All Dependencies Explicitly @@ -446,7 +469,7 @@ steps: ✅ **DO**: Be explicit about data dependencies ```yaml - id: aggregate - depends_on: [fetch_logs, fetch_metrics] # Clear intent + dependsOn: [fetch_logs, fetch_metrics] # Clear intent arguments: logs: "{{.steps.fetch_logs.output}}" metrics: "{{.steps.fetch_metrics.output}}" @@ -482,9 +505,9 @@ steps: - id: send_receipt type: tool tool: email.send - depends_on: [charge_payment] + dependsOn: [charge_payment] on_error: - action: continue_on_error + action: continue ``` ### 4. Set Realistic Timeouts @@ -498,7 +521,7 @@ spec: timeout: 30s # Individual operation: 30 seconds on_error: action: retry - retry_count: 3 + maxRetries: 3 ``` ### 5. Keep Steps Focused @@ -510,10 +533,10 @@ steps: tool: db.query_user - id: validate_permissions tool: auth.check_permissions - depends_on: [fetch_user] + dependsOn: [fetch_user] - id: perform_action tool: api.execute - depends_on: [validate_permissions] + dependsOn: [validate_permissions] ``` ❌ **DON'T**: Combine unrelated operations @@ -550,7 +573,7 @@ steps: - id: aggregate type: tool tool: analysis.combine - depends_on: [fetch_source_a, fetch_source_b, fetch_source_c] + dependsOn: [fetch_source_a, fetch_source_b, fetch_source_c] ``` **Use Cases**: Data aggregation, multi-source reporting, distributed search @@ -570,23 +593,23 @@ steps: - id: transform_format_a type: tool tool: transform.to_format_a - depends_on: [fetch] + dependsOn: [fetch] - id: transform_format_b type: tool tool: transform.to_format_b - depends_on: [fetch] + dependsOn: [fetch] # Stage 3: Parallel storage - id: store_warehouse type: tool tool: warehouse.store - depends_on: [transform_format_a] + dependsOn: [transform_format_a] - id: store_cache type: tool tool: cache.store - depends_on: [transform_format_b] + dependsOn: [transform_format_b] ``` **Use Cases**: ETL pipelines, data transformation, multi-target deployments @@ -605,19 +628,19 @@ steps: - id: notify_slack type: tool tool: slack.notify - depends_on: [fetch_user] + dependsOn: [fetch_user] condition: "{{.steps.fetch_user.output.preferences.slack_enabled}}" - id: notify_email type: tool tool: email.send - depends_on: [fetch_user] + dependsOn: [fetch_user] condition: "{{.steps.fetch_user.output.preferences.email_enabled}}" - id: notify_sms type: tool tool: sms.send - depends_on: [fetch_user] + dependsOn: [fetch_user] condition: "{{.steps.fetch_user.output.preferences.sms_enabled}}" ``` @@ -634,13 +657,12 @@ steps: tool: primary_api.call on_error: action: retry - retry_count: 2 - retry_delay: 1s + maxRetries: 2 - id: use_fallback type: tool tool: fallback_api.call - depends_on: [try_primary] + dependsOn: [try_primary] condition: "{{ne .steps.try_primary.status \"completed\"}}" ``` @@ -669,7 +691,7 @@ steps: - id: execute_action type: tool tool: api.execute - depends_on: [validate_schema, validate_permissions, validate_quota] + dependsOn: [validate_schema, validate_permissions, validate_quota] ``` **Use Cases**: Pre-flight checks, authorization, resource validation @@ -691,7 +713,7 @@ steps: # After (fixed) - id: process - depends_on: [fetch] # Explicit dependency + dependsOn: [fetch] # Explicit dependency arguments: data: "{{.steps.fetch.output}}" ``` @@ -700,18 +722,18 @@ steps: **Problem**: Workflow validation fails with "circular dependency detected" -**Solution**: Review `depends_on` chains for cycles +**Solution**: Review `dependsOn` chains for cycles ```yaml # Circular dependency (invalid) - id: step_a - depends_on: [step_b] + dependsOn: [step_b] - id: step_b - 
depends_on: [step_a] # ❌ Cycle! + dependsOn: [step_a] # ❌ Cycle! # Fixed (valid) - id: step_a - id: step_b - depends_on: [step_a] # ✓ Linear dependency + dependsOn: [step_a] # ✓ Linear dependency ``` ### Performance Issues @@ -720,7 +742,7 @@ steps: **Checklist**: 1. Verify steps actually run in parallel (check execution levels) -2. Check for unnecessary `depends_on` constraints +2. Check for unnecessary `dependsOn` constraints 3. Review concurrency limits (may be throttling) 4. Profile individual step execution times 5. Consider network/external service bottlenecks @@ -762,7 +784,7 @@ steps: - id: step1 - id: step2 - id: step3 - depends_on: [step1] # Explicit data dependency + dependsOn: [step1] # Explicit data dependency arguments: data: "{{.steps.step1.output}}" ``` @@ -780,7 +802,7 @@ steps: - [VirtualMCPCompositeToolDefinition Guide](virtualmcpcompositetooldefinition-guide.md) - Basic workflow concepts - [Architecture Documentation](../arch/README.md) - System architecture and design -- [Operator Guide](deploying-mcp-server-with-operator.md) - Kubernetes deployment +- [Operator Guide](../kind/deploying-mcp-server-with-operator.md) - Kubernetes deployment --- @@ -789,7 +811,7 @@ steps: Key takeaways for advanced workflows: 1. ✅ **Embrace Parallelism**: Design workflows for concurrent execution -2. ✅ **Explicit Dependencies**: Always declare data dependencies with `depends_on` +2. ✅ **Explicit Dependencies**: Always declare data dependencies with `dependsOn` 3. ✅ **Error Resilience**: Use retry for transient failures, continue for optional steps 4. ✅ **Set Timeouts**: Prevent runaway workflows with appropriate timeouts 5. ✅ **Monitor State**: Track workflow execution for debugging and optimization diff --git a/docs/operator/composite-tools-quick-reference.md b/docs/operator/composite-tools-quick-reference.md index e057b4e65..65aad9643 100644 --- a/docs/operator/composite-tools-quick-reference.md +++ b/docs/operator/composite-tools-quick-reference.md @@ -17,11 +17,10 @@ spec: failureMode: abort # Optional: abort|continue|best_effort (default: abort) parameters: # Optional: input parameters - schema: - type: object - properties: - param_name: - type: string + param_name: + type: string + description: Description of the parameter + required: false steps: # Required: workflow steps - id: step1 @@ -41,7 +40,7 @@ steps: - id: fetch_c # Level 1: Runs in parallel ─┘ - id: aggregate # Level 2: Waits for Level 1 - depends_on: [fetch_a, fetch_b, fetch_c] + dependsOn: [fetch_a, fetch_b, fetch_c] ``` ## Step Dependencies @@ -51,14 +50,18 @@ steps: - id: step1 - id: step2 - depends_on: [step1] # Runs after step1 completes + dependsOn: [step1] # Runs after step1 completes - id: step3 - depends_on: [step1, step2] # Waits for both step1 AND step2 + dependsOn: [step1, step2] # Waits for both step1 AND step2 ``` ## Template Syntax +Workflows use Go's [text/template](https://pkg.go.dev/text/template) syntax with additional context variables and functions. 
+ +### Basic Access + ```yaml # Access input parameters "{{.params.parameter_name}}" @@ -68,16 +71,43 @@ steps: "{{.steps.step_id.output.field_name}}" "{{.steps.step_id.status}}" # completed|failed|skipped|running -# Conditional logic -condition: "{{eq .steps.step1.status \"completed\"}}" -condition: "{{.params.enabled}}" -condition: "{{gt .steps.step1.output.count 10}}" +# Access workflow-scoped variables +"{{.vars.variable_name}}" + +# Access step errors +"{{.steps.step_id.error}}" +``` + +### Functions +```yaml # JSON encoding arguments: data: "{{json .steps.step1.output}}" + +# String quoting +arguments: + quoted: "{{quote .params.value}}" ``` +### Conditional Logic + +```yaml +# Comparison operators (eq, ne, lt, le, gt, ge) +condition: "{{eq .steps.step1.status \"completed\"}}" +condition: "{{ne .steps.step1.status \"failed\"}}" +condition: "{{gt .steps.step1.output.count 10}}" + +# Boolean operators (and, or, not) +condition: "{{and .params.enabled (eq .steps.step1.status \"completed\")}}" +condition: "{{or .params.force (gt .steps.check.output.count 0)}}" +condition: "{{not .params.disabled}}" +``` + +### Advanced Features + +All Go template built-ins are available: `index`, `len`, `range`, `with`, `printf`, etc. See [Go text/template documentation](https://pkg.go.dev/text/template) for complete reference. + ## Error Handling ### Workflow-Level @@ -102,15 +132,14 @@ steps: - id: optional tool: notification.send on_error: - action: continue_on_error + action: continue # Retry with exponential backoff - id: resilient tool: external.api on_error: action: retry - retry_count: 3 # Max 3 retries (4 total attempts) - retry_delay: 1s # Initial delay: 1s, 2s, 4s, 8s... + maxRetries: 3 # Max 3 retries (4 total attempts) ``` ## Timeouts @@ -139,7 +168,7 @@ steps: # Fan-in: Aggregate - id: combine - depends_on: [fetch_1, fetch_2, fetch_3] + dependsOn: [fetch_1, fetch_2, fetch_3] ``` ### Sequential Pipeline @@ -148,9 +177,9 @@ steps: steps: - id: fetch - id: transform - depends_on: [fetch] + dependsOn: [fetch] - id: store - depends_on: [transform] + dependsOn: [transform] ``` ### Diamond Pattern @@ -160,12 +189,12 @@ steps: - id: fetch - id: process_a - depends_on: [fetch] + dependsOn: [fetch] - id: process_b - depends_on: [fetch] + dependsOn: [fetch] - id: merge - depends_on: [process_a, process_b] + dependsOn: [process_a, process_b] ``` ### Retry with Fallback @@ -176,11 +205,11 @@ steps: tool: primary.api on_error: action: retry - retry_count: 2 + maxRetries: 2 - id: use_fallback tool: secondary.api - depends_on: [try_primary] + dependsOn: [try_primary] condition: "{{ne .steps.try_primary.status \"completed\"}}" ``` @@ -188,11 +217,13 @@ steps: - ✅ Workflow name: `^[a-z0-9]([a-z0-9_-]*[a-z0-9])?$` (1-64 chars) - ✅ Step IDs must be unique -- ✅ All `depends_on` step IDs must exist +- ✅ All `dependsOn` step IDs must exist - ✅ No circular dependencies - ✅ Tool format: `workload_id.tool_name` -- ✅ Max retry count: 10 -- ✅ Max workflow steps: 100 +- ✅ Max retry count: 10 (runtime capped - values > 10 are silently reduced with warning) +- ✅ Max workflow steps: 100 (runtime enforced - workflows > 100 steps fail validation) + +**Note**: Max retry and max steps limits are currently enforced at runtime. Future work may add CRD-level validation (`+kubebuilder:validation:MaxItems=100`) and webhook validation to fail at submission time rather than execution time. 
## Debugging @@ -213,14 +244,14 @@ status: | Error | Cause | Fix | |-------|-------|-----| -| "output not found" | Missing `depends_on` | Add dependency | -| "circular dependency" | Cycle in `depends_on` | Remove cycle | +| "output not found" | Missing `dependsOn` | Add dependency | +| "circular dependency" | Cycle in `dependsOn` | Remove cycle | | "tool not found" | Invalid tool reference | Check `workload.tool` format | | "template error" | Invalid Go template | Fix template syntax | ## Performance Tips -1. ✅ Remove unnecessary `depends_on` constraints +1. ✅ Remove unnecessary `dependsOn` constraints 2. ✅ Group related steps in same execution level 3. ✅ Set realistic timeouts based on SLAs 4. ✅ Use retry for transient failures only @@ -230,4 +261,4 @@ status: - [Detailed Guide](virtualmcpcompositetooldefinition-guide.md) - [Advanced Patterns](advanced-workflow-patterns.md) -- [Operator Installation](deploying-toolhive-operator.md) +- [Operator Installation](../kind/deploying-toolhive-operator.md)