diff --git a/.github/agents/token-eval-reviewer.agent.md b/.github/agents/token-eval-reviewer.agent.md
new file mode 100644
index 0000000..eecede0
--- /dev/null
+++ b/.github/agents/token-eval-reviewer.agent.md
@@ -0,0 +1,21 @@
+---
+name: token-eval-reviewer
+description: Reviews Waza eval results and AI customization changes for quality, safety, and token efficiency.
+tools:
+ - codeSearch
+ - fileRead
+ - runCommand
+---
+
+# Token Eval Reviewer
+
+You review prompt, instruction, skill, and agent customizations.
+
+Focus on:
+
+- Whether the eval task measures a real customer workflow.
+- Whether fixtures are small, safe, and relevant.
+- Whether graders connect to correctness, groundedness, safety, developer experience, or token efficiency.
+- Whether workflow results are actionable for maintainers.
+
+Do not recommend adding proprietary third-party eval platforms for Chapter 8.
diff --git a/.github/instructions/token-optimization.instructions.md b/.github/instructions/token-optimization.instructions.md
new file mode 100644
index 0000000..e11b298
--- /dev/null
+++ b/.github/instructions/token-optimization.instructions.md
@@ -0,0 +1,7 @@
+# Token Optimization Instructions
+
+- Keep prompts, instructions, and agent definitions concise and testable.
+- Prefer repository facts and small fixtures over broad file inclusion.
+- Ask for clarification when the requested analysis lacks enough context.
+- Do not include secrets, production data, or private customer information in eval fixtures.
+- Tie every recommendation to quality, safety, cost, or developer experience.
diff --git a/.github/prompts/context-triage.prompt.md b/.github/prompts/context-triage.prompt.md
new file mode 100644
index 0000000..3e08015
--- /dev/null
+++ b/.github/prompts/context-triage.prompt.md
@@ -0,0 +1,17 @@
+---
+mode: ask
+description: Review a coding task for context quality and token efficiency.
+---
+
+# Context Triage
+
+Review the task, files, and instructions provided by the user.
+
+Identify:
+
+- Context that is necessary.
+- Context that is missing.
+- Context that can be removed.
+- Risks that require clarification before implementation.
+
+Return a concise recommendation before suggesting code changes.
diff --git a/.github/skills/token-optimization.skill.md b/.github/skills/token-optimization.skill.md
new file mode 100644
index 0000000..4950450
--- /dev/null
+++ b/.github/skills/token-optimization.skill.md
@@ -0,0 +1,10 @@
+---
+name: token-optimization
+description: Evaluate prompts, instructions, agents, and context selections for quality, safety, and token efficiency.
+---
+
+# Token Optimization Skill
+
+Use this skill when reviewing AI coding workflows for unnecessary context, unclear prompts, unsafe tool use, or inefficient model selection.
+
+Return findings that connect each recommendation to correctness, safety, developer experience, or token usage.
diff --git a/.github/workflows/waza-eval-suite.yml b/.github/workflows/waza-eval-suite.yml
new file mode 100644
index 0000000..6081304
--- /dev/null
+++ b/.github/workflows/waza-eval-suite.yml
@@ -0,0 +1,66 @@
+name: Waza Evals
+
+on:
+ pull_request:
+ paths:
+ - ".github/agents/**"
+ - ".github/instructions/**"
+ - ".github/prompts/**"
+ - ".github/skills/**"
+ - "evals/**"
+ - "skills/**"
+ - "labs/08-ai-evals-and-observability.md"
+ push:
+ branches: ["main"]
+ paths:
+ - ".github/agents/**"
+ - ".github/instructions/**"
+ - ".github/prompts/**"
+ - ".github/skills/**"
+ - "evals/**"
+ - "skills/**"
+ - "labs/08-ai-evals-and-observability.md"
+ workflow_dispatch:
+
+permissions:
+ contents: read
+
+jobs:
+ evaluate:
+ name: Run Waza eval suite
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Install Waza
+ env:
+ # Pinned to a Microsoft Waza repository snapshot selected from upstream docs on 2026-05-12.
+ # If this commit becomes unavailable, update to a reviewed upstream commit and SHA-256 together.
+ # SHA-256 verified for this exact install.sh content on 2026-05-12.
+ WAZA_INSTALL_COMMIT: bf77c759d6bee3ba578c1980fa8a176fe8d014e4
+ WAZA_INSTALL_SHA256: f9f949c8ef7ed4e3309cee3f1285d2f7a7dea6db1c965ceab09443d3c7910d7a
+ run: |
+ curl -fsSL \
+ "https://raw.githubusercontent.com/microsoft/waza/${WAZA_INSTALL_COMMIT}/install.sh" \
+ -o install-waza.sh
+          echo "${WAZA_INSTALL_SHA256}  install-waza.sh" | sha256sum -c -
+ bash install-waza.sh
+
+ - name: Verify Waza
+ run: waza --version
+
+ - name: Run Waza
+ run: |
+ mkdir -p waza-results
+ waza run evals/token-optimization/eval.yaml \
+ --verbose \
+ --output waza-results/results.json \
+ --reporter junit:waza-results/results.xml
+
+ - name: Upload Waza results
+ if: always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: waza-results
+ path: waza-results/
diff --git a/evals/token-optimization/eval.yaml b/evals/token-optimization/eval.yaml
new file mode 100644
index 0000000..f1722b5
--- /dev/null
+++ b/evals/token-optimization/eval.yaml
@@ -0,0 +1,28 @@
+name: token-optimization-eval
+description: Evaluation suite for token optimization workshop customizations.
+skill: token-optimization
+version: "1.0"
+
+config:
+ # Use mock for pull request validation. For real model runs, change this to
+ # a supported Waza executor, configure the required credentials such as
+ # GITHUB_TOKEN in Actions, and follow https://microsoft.github.io/waza/.
+ executor: mock
+ model: mock-model
+ trials_per_task: 1
+ timeout_seconds: 300
+ parallel: false
+
+graders:
+ - type: text
+ name: mentions_context_quality
+ config:
+ regex_match:
+ - "(?i)(context|token|fixture|prompt|instruction)"
+ - type: behavior
+ name: bounded_tool_use
+ config:
+ max_tool_calls: 5
+
+tasks:
+ - "tasks/*.yaml"
diff --git a/evals/token-optimization/tasks/context-triage.yaml b/evals/token-optimization/tasks/context-triage.yaml
new file mode 100644
index 0000000..be14737
--- /dev/null
+++ b/evals/token-optimization/tasks/context-triage.yaml
@@ -0,0 +1,17 @@
+id: context-triage
+name: Context triage recommendation
+description: The assistant should identify unnecessary context and recommend a measurable prompt or instruction improvement.
+tags:
+ - context
+ - token-efficiency
+inputs:
+ prompt: |
+ Review a coding assistant prompt that includes the whole repository for a one-line documentation fix.
+ Recommend how to reduce context while preserving answer quality.
+expected:
+ output_contains:
+ - "context"
+ - "prompt"
+ - "quality"
+ behavior:
+ max_tool_calls: 5
diff --git a/labs/08-ai-evals-and-observability.md b/labs/08-ai-evals-and-observability.md
index 451d33c..c03927e 100644
--- a/labs/08-ai-evals-and-observability.md
+++ b/labs/08-ai-evals-and-observability.md
@@ -2,11 +2,28 @@
## Core idea
-Token optimization should be measured. AI evals help teams compare prompts, instructions, models, memories, and tool configurations using repeatable examples instead of anecdotes.
+Token optimization should be measured with repeatable, customer-owned evals. Use Microsoft Waza as the workshop evaluation framework so teams can compare prompts, instructions, agents, models, and tool configurations without relying on proprietary third-party products.
-## Candidate eval platform: W&B Weave
+## Required eval framework: Microsoft Waza
-Consider W&B Weave for tracing, prompt and model comparison, qualitative review, and lightweight observability. If a customer already uses another eval platform, keep the workshop tool-agnostic and focus on repeatable datasets, rubrics, and decision criteria.
+Use Waza for this chapter's lab and examples:
+
+- Documentation: <https://microsoft.github.io/waza/>
+- Repository: <https://github.com/microsoft/waza>
+- Typical commands: `waza run`, `waza check`, `waza compare`, `waza coverage`, and `waza tokens`.
+- CI integration: run Waza in GitHub Actions and publish the result files as workflow artifacts.
+
+Before delivery, confirm attendees can access the Waza documentation and repository from their network.
+
+This repository includes the minimum workspace artifacts Waza and VS Code customizations need:
+
+- `skills/token-optimization/SKILL.md` for the Waza skill under evaluation.
+- `.github/skills/token-optimization.skill.md` as a workspace skill customization.
+- `.github/prompts/context-triage.prompt.md` as a reusable prompt.
+- `.github/instructions/token-optimization.instructions.md` as shared instructions.
+- `.github/agents/token-eval-reviewer.agent.md` as a VS Code agent customization.
+- `evals/token-optimization/eval.yaml` and task files for Waza.
+- `.github/workflows/waza-eval-suite.yml` to run the eval suite in Actions.
## What to evaluate
@@ -17,15 +34,17 @@ Consider W&B Weave for tracing, prompt and model comparison, qualitative review,
- Safety: did it avoid secrets, unsafe commands, or policy violations?
- Developer experience: was the answer actionable?
-## Recommended tools to consider
+## Run Waza in GitHub Actions
+
+The Actions workflow should:
-- W&B Weave: tracing, prompt/version comparison, human review workflows.
-- promptfoo: lightweight prompt and model regression testing.
-- LangSmith: tracing, datasets, and eval workflows for LangChain-based systems.
-- OpenAI Evals or provider-native eval tools: model and prompt comparison.
-- Azure AI Evaluation: useful for Azure-hosted AI workflows.
-- Ragas or DeepEval: evaluation patterns for retrieval-augmented generation.
-- Custom GitHub Actions or CI checks: simple regression suites for prompts and agent instructions.
+1. Check out the repository.
+2. Install Waza from the official Microsoft project.
+3. Run `waza run evals/token-optimization/eval.yaml --verbose`.
+4. Save JSON and JUnit results.
+5. Upload results as artifacts for review.
+
+Use the mock executor for quick pull request validation. Switch to a real executor only when the customer is ready to provide the required credentials and accept the cost and data handling implications.
## Minimal eval dataset
@@ -39,11 +58,24 @@ Start with 10-20 examples:
## Hands-on lab
-1. Select three representative prompts.
-2. Run each with two instruction sets or two models.
-3. Score outputs from 1-5 on correctness, usefulness, and cost.
-4. Decide which change should become the new default.
+1. Open the repository's Waza workflow in `.github/workflows/waza-eval-suite.yml`.
+2. Review the skill, prompt, instruction, and agent customization files.
+3. Run the Waza workflow from the Actions tab or by opening a pull request.
+4. Download the Waza result artifacts and identify which task failed or passed.
+5. Modify one prompt or instruction and rerun the workflow to compare results.
+6. Decide whether the customization improved correctness, safety, and token efficiency.
+
+## Run customer analyses in VS Code
+
+Use the Chat Customizations Evaluations extensions for VS Code to help customers run the same style of analysis locally:
+
+1. Install the Chat Customizations Evaluations extensions for VS Code (see the Waza documentation at <https://microsoft.github.io/waza/> for installation links).
+2. Open the customer's repository in VS Code.
+3. Add or review the workspace customizations: `*.skill.md`, `*.prompt.md`, `*.instructions.md`, and `*.agent.md`.
+4. Run local evaluations against representative prompts and fixtures.
+5. Compare local results with the GitHub Actions Waza results.
+6. Promote only the customizations that improve measured outcomes.
## Practical recommendation
-Use evals to justify changes to model routing, instruction files, MCP configuration, and memory strategy.
+Use Waza evals to justify changes to model routing, instruction files, prompt templates, agent definitions, Model Context Protocol (MCP) configuration, and memory strategy. Keep the eval data small, explicit, customer-owned, and safe to run in CI.
diff --git a/labs/README.md b/labs/README.md
index 78ea9ea..45150a5 100644
--- a/labs/README.md
+++ b/labs/README.md
@@ -22,7 +22,7 @@ Attendees will learn how to:
- A GitHub account and access to an AI coding/chat tool.
- A small sample repository with issues, tests, documentation, and a few realistic defects.
- Optional: access to organization billing, Copilot usage, cloud AI usage, or model provider dashboards.
-- Optional: W&B Weave, LangSmith, promptfoo, OpenAI Evals, Azure AI Evaluation, or another eval/observability tool.
+- Microsoft Waza and the Chat Customizations Evaluations extensions for VS Code for the Chapter 8 eval lab.
## Delivery formats
diff --git a/skills/token-optimization/SKILL.md b/skills/token-optimization/SKILL.md
new file mode 100644
index 0000000..421a5fd
--- /dev/null
+++ b/skills/token-optimization/SKILL.md
@@ -0,0 +1,29 @@
+---
+name: token-optimization
+type: analysis
+description: |
+ USE FOR: Evaluating prompts, instructions, agents, and context choices for token-efficient software engineering workflows.
+ DO NOT USE FOR: Replacing security review, exposing secrets, or analyzing customer data without permission.
+license: MIT
+metadata:
+ version: "1.0"
+---
+
+# Token Optimization
+
+## Overview
+
+Help teams evaluate whether their AI coding customizations improve quality, safety, and token efficiency.
+
+## Triggers
+
+- "Evaluate this prompt for token efficiency."
+- "Compare these instructions for quality and cost."
+- "Review this agent customization before we make it the default."
+
+## Expectations
+
+- Prefer customer-owned eval data.
+- Keep fixtures small and relevant.
+- Score correctness, groundedness, safety, developer experience, and token efficiency.
+- Recommend measurable changes rather than subjective preferences.