From 96f0c4bcd3fdb74059d9d1f5cc00d7abefafba68 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 May 2026 21:02:57 +0000 Subject: [PATCH 1/8] Replace chapter 8 with Waza eval lab --- .github/agents/token-eval-reviewer.agent.md | 21 +++++++ .../token-optimization.instructions.md | 7 +++ .github/prompts/context-triage.prompt.md | 17 +++++ .github/skills/token-optimization.skill.md | 10 +++ .github/workflows/waza-evals.yml | 55 ++++++++++++++++ evals/token-optimization/eval.yaml | 25 ++++++++ .../tasks/context-triage.yaml | 17 +++++ labs/08-ai-evals-and-observability.md | 62 ++++++++++++++----- labs/README.md | 2 +- skills/token-optimization/SKILL.md | 29 +++++++++ 10 files changed, 228 insertions(+), 17 deletions(-) create mode 100644 .github/agents/token-eval-reviewer.agent.md create mode 100644 .github/instructions/token-optimization.instructions.md create mode 100644 .github/prompts/context-triage.prompt.md create mode 100644 .github/skills/token-optimization.skill.md create mode 100644 .github/workflows/waza-evals.yml create mode 100644 evals/token-optimization/eval.yaml create mode 100644 evals/token-optimization/tasks/context-triage.yaml create mode 100644 skills/token-optimization/SKILL.md diff --git a/.github/agents/token-eval-reviewer.agent.md b/.github/agents/token-eval-reviewer.agent.md new file mode 100644 index 0000000..eecede0 --- /dev/null +++ b/.github/agents/token-eval-reviewer.agent.md @@ -0,0 +1,21 @@ +--- +name: token-eval-reviewer +description: Reviews Waza eval results and AI customization changes for quality, safety, and token efficiency. +tools: + - codeSearch + - fileRead + - runCommand +--- + +# Token Eval Reviewer + +You review prompt, instruction, skill, and agent customizations. + +Focus on: + +- Whether the eval task measures a real customer workflow. +- Whether fixtures are small, safe, and relevant. 
+- Whether graders connect to correctness, groundedness, safety, developer experience, or token efficiency. +- Whether workflow results are actionable for maintainers. + +Do not recommend adding proprietary third-party eval platforms for Chapter 8. diff --git a/.github/instructions/token-optimization.instructions.md b/.github/instructions/token-optimization.instructions.md new file mode 100644 index 0000000..e11b298 --- /dev/null +++ b/.github/instructions/token-optimization.instructions.md @@ -0,0 +1,7 @@ +# Token Optimization Instructions + +- Keep prompts, instructions, and agent definitions concise and testable. +- Prefer repository facts and small fixtures over broad file inclusion. +- Ask for clarification when the requested analysis lacks enough context. +- Do not include secrets, production data, or private customer information in eval fixtures. +- Tie every recommendation to quality, safety, cost, or developer experience. diff --git a/.github/prompts/context-triage.prompt.md b/.github/prompts/context-triage.prompt.md new file mode 100644 index 0000000..3e08015 --- /dev/null +++ b/.github/prompts/context-triage.prompt.md @@ -0,0 +1,17 @@ +--- +mode: ask +description: Review a coding task for context quality and token efficiency. +--- + +# Context Triage + +Review the task, files, and instructions provided by the user. + +Identify: + +- Context that is necessary. +- Context that is missing. +- Context that can be removed. +- Risks that require clarification before implementation. + +Return a concise recommendation before suggesting code changes. diff --git a/.github/skills/token-optimization.skill.md b/.github/skills/token-optimization.skill.md new file mode 100644 index 0000000..4950450 --- /dev/null +++ b/.github/skills/token-optimization.skill.md @@ -0,0 +1,10 @@ +--- +name: token-optimization +description: Evaluate prompts, instructions, agents, and context selections for quality, safety, and token efficiency. 
+--- + +# Token Optimization Skill + +Use this skill when reviewing AI coding workflows for unnecessary context, unclear prompts, unsafe tool use, or inefficient model selection. + +Return findings that connect each recommendation to correctness, safety, developer experience, or token usage. diff --git a/.github/workflows/waza-evals.yml b/.github/workflows/waza-evals.yml new file mode 100644 index 0000000..92d7069 --- /dev/null +++ b/.github/workflows/waza-evals.yml @@ -0,0 +1,55 @@ +name: Waza Evals + +on: + pull_request: + paths: + - ".github/agents/**" + - ".github/instructions/**" + - ".github/prompts/**" + - ".github/skills/**" + - "evals/**" + - "skills/**" + - "labs/08-ai-evals-and-observability.md" + push: + branches: ["main"] + paths: + - ".github/agents/**" + - ".github/instructions/**" + - ".github/prompts/**" + - ".github/skills/**" + - "evals/**" + - "skills/**" + - "labs/08-ai-evals-and-observability.md" + workflow_dispatch: + +permissions: + contents: read + +jobs: + evaluate: + name: Run Waza eval suite + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Waza + run: curl -fsSL https://raw.githubusercontent.com/microsoft/waza/main/install.sh | bash + + - name: Verify Waza + run: waza --version + + - name: Run Waza + run: | + mkdir -p waza-results + waza run evals/token-optimization/eval.yaml \ + --verbose \ + --output waza-results/results.json \ + --reporter junit:waza-results/results.xml + + - name: Upload Waza results + if: always() + uses: actions/upload-artifact@v4 + with: + name: waza-results + path: waza-results/ diff --git a/evals/token-optimization/eval.yaml b/evals/token-optimization/eval.yaml new file mode 100644 index 0000000..579a097 --- /dev/null +++ b/evals/token-optimization/eval.yaml @@ -0,0 +1,25 @@ +name: token-optimization-eval +description: Evaluation suite for token optimization workshop customizations. 
+skill: token-optimization +version: "1.0" + +config: + executor: mock + model: mock-model + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + +graders: + - type: text + name: mentions_context_quality + config: + regex_match: + - "(?i)(context|token|fixture|prompt|instruction)" + - type: behavior + name: bounded_tool_use + config: + max_tool_calls: 5 + +tasks: + - "tasks/*.yaml" diff --git a/evals/token-optimization/tasks/context-triage.yaml b/evals/token-optimization/tasks/context-triage.yaml new file mode 100644 index 0000000..be14737 --- /dev/null +++ b/evals/token-optimization/tasks/context-triage.yaml @@ -0,0 +1,17 @@ +id: context-triage +name: Context triage recommendation +description: The assistant should identify unnecessary context and recommend a measurable prompt or instruction improvement. +tags: + - context + - token-efficiency +inputs: + prompt: | + Review a coding assistant prompt that includes the whole repository for a one-line documentation fix. + Recommend how to reduce context while preserving answer quality. +expected: + output_contains: + - "context" + - "prompt" + - "quality" + behavior: + max_tool_calls: 5 diff --git a/labs/08-ai-evals-and-observability.md b/labs/08-ai-evals-and-observability.md index 451d33c..8475a68 100644 --- a/labs/08-ai-evals-and-observability.md +++ b/labs/08-ai-evals-and-observability.md @@ -2,11 +2,26 @@ ## Core idea -Token optimization should be measured. AI evals help teams compare prompts, instructions, models, memories, and tool configurations using repeatable examples instead of anecdotes. +Token optimization should be measured with repeatable, customer-owned evals. Use Microsoft Waza as the workshop evaluation framework so teams can compare prompts, instructions, agents, models, and tool configurations without relying on proprietary third-party products. 
-## Candidate eval platform: W&B Weave +## Required eval framework: Microsoft Waza -Consider W&B Weave for tracing, prompt and model comparison, qualitative review, and lightweight observability. If a customer already uses another eval platform, keep the workshop tool-agnostic and focus on repeatable datasets, rubrics, and decision criteria. +Use Waza for this chapter's lab and examples: + +- Documentation: https://microsoft.github.io/waza/ +- Repository: https://github.com/microsoft/waza +- Typical commands: `waza run`, `waza check`, `waza compare`, `waza coverage`, and `waza tokens`. +- CI integration: run Waza in GitHub Actions and publish the result files as workflow artifacts. + +This repository includes the minimum workspace artifacts Waza and VS Code customizations need: + +- `skills/token-optimization/SKILL.md` for the Waza skill under evaluation. +- `.github/skills/token-optimization.skill.md` as a workspace skill customization. +- `.github/prompts/context-triage.prompt.md` as a reusable prompt. +- `.github/instructions/token-optimization.instructions.md` as shared instructions. +- `.github/agents/token-eval-reviewer.agent.md` as a VS Code custom agent. +- `evals/token-optimization/eval.yaml` and task files for Waza. +- `.github/workflows/waza-evals.yml` to run the eval suite in Actions. ## What to evaluate @@ -17,15 +32,17 @@ Consider W&B Weave for tracing, prompt and model comparison, qualitative review, - Safety: did it avoid secrets, unsafe commands, or policy violations? - Developer experience: was the answer actionable? -## Recommended tools to consider +## Run Waza in GitHub Actions + +The Actions workflow should: -- W&B Weave: tracing, prompt/version comparison, human review workflows. -- promptfoo: lightweight prompt and model regression testing. -- LangSmith: tracing, datasets, and eval workflows for LangChain-based systems. -- OpenAI Evals or provider-native eval tools: model and prompt comparison. -- Azure AI Evaluation: useful for Azure-hosted AI workflows.
-- Ragas or DeepEval: evaluation patterns for retrieval-augmented generation. -- Custom GitHub Actions or CI checks: simple regression suites for prompts and agent instructions. +1. Check out the repository. +2. Install Waza from the official Microsoft project. +3. Run `waza run evals/token-optimization/eval.yaml --verbose`. +4. Save JSON and JUnit results. +5. Upload results as artifacts for review. + +Use the mock executor for quick pull request validation. Switch to a real executor only when the customer is ready to provide the required credentials and accept the cost and data handling implications. ## Minimal eval dataset @@ -39,11 +56,24 @@ Start with 10-20 examples: ## Hands-on lab -1. Select three representative prompts. -2. Run each with two instruction sets or two models. -3. Score outputs from 1-5 on correctness, usefulness, and cost. -4. Decide which change should become the new default. +1. Open the repository's Waza workflow in `.github/workflows/waza-evals.yml`. +2. Review the skill, prompt, instruction, and agent customization files. +3. Run the Waza workflow from the Actions tab or by opening a pull request. +4. Download the Waza result artifacts and identify which task failed or passed. +5. Modify one prompt or instruction and rerun the workflow to compare results. +6. Decide whether the customization improved correctness, safety, and token efficiency. + +## Run customer analyses in VS Code + +Use the Chat Customizations Evaluations extensions for VS Code to help customers run the same style of analysis locally: + +1. Install the VS Code extensions documented at . +2. Open the customer's repository in VS Code. +3. Add or review the workspace customizations: `*.skill.md`, `*.prompt.md`, `*.instructions.md`, and `*.agent.md`. +4. Run local evaluations against representative prompts and fixtures. +5. Compare local results with the GitHub Actions Waza results. +6. Promote only the customizations that improve measured outcomes. 
## Practical recommendation -Use evals to justify changes to model routing, instruction files, MCP configuration, and memory strategy. +Use Waza evals to justify changes to model routing, instruction files, prompt templates, agent definitions, MCP configuration, and memory strategy. Keep the eval data small, explicit, customer-owned, and safe to run in CI. diff --git a/labs/README.md b/labs/README.md index 78ea9ea..45150a5 100644 --- a/labs/README.md +++ b/labs/README.md @@ -22,7 +22,7 @@ Attendees will learn how to: - A GitHub account and access to an AI coding/chat tool. - A small sample repository with issues, tests, documentation, and a few realistic defects. - Optional: access to organization billing, Copilot usage, cloud AI usage, or model provider dashboards. -- Optional: W&B Weave, LangSmith, promptfoo, OpenAI Evals, Azure AI Evaluation, or another eval/observability tool. +- Microsoft Waza and the Chat Customizations Evaluations extensions for VS Code for the Chapter 8 eval lab. ## Delivery formats diff --git a/skills/token-optimization/SKILL.md b/skills/token-optimization/SKILL.md new file mode 100644 index 0000000..421a5fd --- /dev/null +++ b/skills/token-optimization/SKILL.md @@ -0,0 +1,29 @@ +--- +name: token-optimization +type: analysis +description: | + USE FOR: Evaluating prompts, instructions, agents, and context choices for token-efficient software engineering workflows. + DO NOT USE FOR: Replacing security review, exposing secrets, or analyzing customer data without permission. +license: MIT +metadata: + version: "1.0" +--- + +# Token Optimization + +## Overview + +Help teams evaluate whether their AI coding customizations improve quality, safety, and token efficiency. + +## Triggers + +- "Evaluate this prompt for token efficiency." +- "Compare these instructions for quality and cost." +- "Review this agent customization before we make it the default." + +## Expectations + +- Prefer customer-owned eval data. +- Keep fixtures small and relevant. 
+- Score correctness, groundedness, safety, developer experience, and token efficiency. +- Recommend measurable changes rather than subjective preferences. From 4612845e257c1169e73a56a4e15fc2bb4e2c203d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 May 2026 21:04:16 +0000 Subject: [PATCH 2/8] Harden Waza workflow install step --- .github/workflows/waza-evals.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/waza-evals.yml b/.github/workflows/waza-evals.yml index 92d7069..0359775 100644 --- a/.github/workflows/waza-evals.yml +++ b/.github/workflows/waza-evals.yml @@ -34,7 +34,15 @@ jobs: uses: actions/checkout@v4 - name: Install Waza - run: curl -fsSL https://raw.githubusercontent.com/microsoft/waza/main/install.sh | bash + env: + WAZA_INSTALL_COMMIT: bf77c759d6bee3ba578c1980fa8a176fe8d014e4 + WAZA_INSTALL_SHA256: f9f949c8ef7ed4e3309cee3f1285d2f7a7dea6db1c965ceab09443d3c7910d7a + run: | + curl -fsSL \ + "https://raw.githubusercontent.com/microsoft/waza/${WAZA_INSTALL_COMMIT}/install.sh" \ + -o install-waza.sh + echo "${WAZA_INSTALL_SHA256} install-waza.sh" | sha256sum -c - + bash install-waza.sh - name: Verify Waza run: waza --version From 852d8b155d221839b7e83ff05abd9a1a7bdddb3f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 May 2026 21:05:10 +0000 Subject: [PATCH 3/8] Address Waza validation review feedback --- .github/workflows/waza-evals.yml | 1 + evals/token-optimization/eval.yaml | 2 ++ labs/08-ai-evals-and-observability.md | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/waza-evals.yml b/.github/workflows/waza-evals.yml index 0359775..8a5486b 100644 --- a/.github/workflows/waza-evals.yml +++ b/.github/workflows/waza-evals.yml @@ -35,6 +35,7 @@ jobs: - name: Install Waza env: + # Pinned to a reviewed Microsoft Waza install script commit; 
update the commit and SHA-256 together. WAZA_INSTALL_COMMIT: bf77c759d6bee3ba578c1980fa8a176fe8d014e4 WAZA_INSTALL_SHA256: f9f949c8ef7ed4e3309cee3f1285d2f7a7dea6db1c965ceab09443d3c7910d7a run: | diff --git a/evals/token-optimization/eval.yaml b/evals/token-optimization/eval.yaml index 579a097..2e8fcc2 100644 --- a/evals/token-optimization/eval.yaml +++ b/evals/token-optimization/eval.yaml @@ -4,6 +4,8 @@ skill: token-optimization version: "1.0" config: + # Use mock for pull request validation. For real model runs, change this to + # the executor and credential setup documented at https://microsoft.github.io/waza/. executor: mock model: mock-model trials_per_task: 1 diff --git a/labs/08-ai-evals-and-observability.md b/labs/08-ai-evals-and-observability.md index 8475a68..8bb5e84 100644 --- a/labs/08-ai-evals-and-observability.md +++ b/labs/08-ai-evals-and-observability.md @@ -76,4 +76,4 @@ Use the Chat Customizations Evaluations extensions for VS Code to help customers ## Practical recommendation -Use Waza evals to justify changes to model routing, instruction files, prompt templates, agent definitions, MCP configuration, and memory strategy. Keep the eval data small, explicit, customer-owned, and safe to run in CI. +Use Waza evals to justify changes to model routing, instruction files, prompt templates, agent definitions, Model Context Protocol (MCP) configuration, and memory strategy. Keep the eval data small, explicit, customer-owned, and safe to run in CI. 
From 0ff9b9bf62476b293003d9bb1506a4e04e7564db Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 May 2026 21:06:06 +0000 Subject: [PATCH 4/8] Note Waza documentation availability check --- labs/08-ai-evals-and-observability.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/labs/08-ai-evals-and-observability.md b/labs/08-ai-evals-and-observability.md index 8bb5e84..66bc4bf 100644 --- a/labs/08-ai-evals-and-observability.md +++ b/labs/08-ai-evals-and-observability.md @@ -13,6 +13,8 @@ Use Waza for this chapter's lab and examples: - Typical commands: `waza run`, `waza check`, `waza compare`, `waza coverage`, and `waza tokens`. - CI integration: run Waza in GitHub Actions and publish the result files as workflow artifacts. +Before delivery, confirm attendees can access the Waza documentation and repository from their network. + This repository includes the minimum workspace artifacts Waza and VS Code customizations need: - `skills/token-optimization/SKILL.md` for the Waza skill under evaluation. 
From 06a1fd919609642d758e7c060b0e8b1e7f38ef68 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 May 2026 21:06:55 +0000 Subject: [PATCH 5/8] Align Waza workflow file naming --- .github/workflows/{waza-evals.yml => waza-eval-suite.yml} | 0 labs/08-ai-evals-and-observability.md | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename .github/workflows/{waza-evals.yml => waza-eval-suite.yml} (100%) diff --git a/.github/workflows/waza-evals.yml b/.github/workflows/waza-eval-suite.yml similarity index 100% rename from .github/workflows/waza-evals.yml rename to .github/workflows/waza-eval-suite.yml diff --git a/labs/08-ai-evals-and-observability.md b/labs/08-ai-evals-and-observability.md index 66bc4bf..bd2aaf4 100644 --- a/labs/08-ai-evals-and-observability.md +++ b/labs/08-ai-evals-and-observability.md @@ -23,7 +23,7 @@ This repository includes the minimum workspace artifacts Waza and VS Code custom - `.github/instructions/token-optimization.instructions.md` as shared instructions. - `.github/agents/token-eval-reviewer.agent.md` as a VS Code custom agent. - `evals/token-optimization/eval.yaml` and task files for Waza. -- `.github/workflows/waza-evals.yml` to run the eval suite in Actions. +- `.github/workflows/waza-eval-suite.yml` to run the eval suite in Actions. ## What to evaluate @@ -58,7 +58,7 @@ Start with 10-20 examples: ## Hands-on lab -1. Open the repository's Waza workflow in `.github/workflows/waza-evals.yml`. +1. Open the repository's Waza workflow in `.github/workflows/waza-eval-suite.yml`. 2. Review the skill, prompt, instruction, and agent customization files. 3. Run the Waza workflow from the Actions tab or by opening a pull request. 4. Download the Waza result artifacts and identify which task failed or passed. 
From 5d612aecee3bacf734dd093a36f002a39f452325 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 May 2026 21:07:47 +0000 Subject: [PATCH 6/8] Clarify VS Code agent customization wording --- labs/08-ai-evals-and-observability.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/labs/08-ai-evals-and-observability.md b/labs/08-ai-evals-and-observability.md index bd2aaf4..c03927e 100644 --- a/labs/08-ai-evals-and-observability.md +++ b/labs/08-ai-evals-and-observability.md @@ -21,7 +21,7 @@ This repository includes the minimum workspace artifacts Waza and VS Code custom - `.github/skills/token-optimization.skill.md` as a workspace skill customization. - `.github/prompts/context-triage.prompt.md` as a reusable prompt. - `.github/instructions/token-optimization.instructions.md` as shared instructions. -- `.github/agents/token-eval-reviewer.agent.md` as a VS Code custom agent. +- `.github/agents/token-eval-reviewer.agent.md` as a VS Code agent customization. - `evals/token-optimization/eval.yaml` and task files for Waza. - `.github/workflows/waza-eval-suite.yml` to run the eval suite in Actions. From e569580b0db7d45cc66a53d9cfb3d64d64a79bd8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 May 2026 21:08:32 +0000 Subject: [PATCH 7/8] Expand Waza workflow validation comments --- .github/workflows/waza-eval-suite.yml | 1 + evals/token-optimization/eval.yaml | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/waza-eval-suite.yml b/.github/workflows/waza-eval-suite.yml index 8a5486b..42d3251 100644 --- a/.github/workflows/waza-eval-suite.yml +++ b/.github/workflows/waza-eval-suite.yml @@ -36,6 +36,7 @@ jobs: - name: Install Waza env: # Pinned to a reviewed Microsoft Waza install script commit; update the commit and SHA-256 together. 
+ # SHA-256 verified for this exact install.sh content on 2026-05-12. WAZA_INSTALL_COMMIT: bf77c759d6bee3ba578c1980fa8a176fe8d014e4 WAZA_INSTALL_SHA256: f9f949c8ef7ed4e3309cee3f1285d2f7a7dea6db1c965ceab09443d3c7910d7a run: | diff --git a/evals/token-optimization/eval.yaml b/evals/token-optimization/eval.yaml index 2e8fcc2..f1722b5 100644 --- a/evals/token-optimization/eval.yaml +++ b/evals/token-optimization/eval.yaml @@ -5,7 +5,8 @@ version: "1.0" config: # Use mock for pull request validation. For real model runs, change this to - # the executor and credential setup documented at https://microsoft.github.io/waza/. + # a supported Waza executor, configure the required credentials such as + # GITHUB_TOKEN in Actions, and follow https://microsoft.github.io/waza/. executor: mock model: mock-model trials_per_task: 1 From 62a0ff78b8bd9a1dc95031589a5bc3f2f0734f5b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 May 2026 21:09:16 +0000 Subject: [PATCH 8/8] Document pinned Waza install commit selection --- .github/workflows/waza-eval-suite.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/waza-eval-suite.yml b/.github/workflows/waza-eval-suite.yml index 42d3251..6081304 100644 --- a/.github/workflows/waza-eval-suite.yml +++ b/.github/workflows/waza-eval-suite.yml @@ -35,7 +35,8 @@ jobs: - name: Install Waza env: - # Pinned to a reviewed Microsoft Waza install script commit; update the commit and SHA-256 together. + # Pinned to a Microsoft Waza repository snapshot selected from upstream docs on 2026-05-12. + # If this commit becomes unavailable, update to a reviewed upstream commit and SHA-256 together. # SHA-256 verified for this exact install.sh content on 2026-05-12. WAZA_INSTALL_COMMIT: bf77c759d6bee3ba578c1980fa8a176fe8d014e4 WAZA_INSTALL_SHA256: f9f949c8ef7ed4e3309cee3f1285d2f7a7dea6db1c965ceab09443d3c7910d7a