diff --git a/.github/agents/token-eval-reviewer.agent.md b/.github/agents/token-eval-reviewer.agent.md new file mode 100644 index 0000000..eecede0 --- /dev/null +++ b/.github/agents/token-eval-reviewer.agent.md @@ -0,0 +1,21 @@ +--- +name: token-eval-reviewer +description: Reviews Waza eval results and AI customization changes for quality, safety, and token efficiency. +tools: + - codeSearch + - fileRead + - runCommand +--- + +# Token Eval Reviewer + +You review prompt, instruction, skill, and agent customizations. + +Focus on: + +- Whether the eval task measures a real customer workflow. +- Whether fixtures are small, safe, and relevant. +- Whether graders connect to correctness, groundedness, safety, developer experience, or token efficiency. +- Whether workflow results are actionable for maintainers. + +Do not recommend adding proprietary third-party eval platforms for Chapter 8. diff --git a/.github/instructions/token-optimization.instructions.md b/.github/instructions/token-optimization.instructions.md new file mode 100644 index 0000000..e11b298 --- /dev/null +++ b/.github/instructions/token-optimization.instructions.md @@ -0,0 +1,7 @@ +# Token Optimization Instructions + +- Keep prompts, instructions, and agent definitions concise and testable. +- Prefer repository facts and small fixtures over broad file inclusion. +- Ask for clarification when the requested analysis lacks enough context. +- Do not include secrets, production data, or private customer information in eval fixtures. +- Tie every recommendation to quality, safety, cost, or developer experience. diff --git a/.github/prompts/context-triage.prompt.md b/.github/prompts/context-triage.prompt.md new file mode 100644 index 0000000..3e08015 --- /dev/null +++ b/.github/prompts/context-triage.prompt.md @@ -0,0 +1,17 @@ +--- +mode: ask +description: Review a coding task for context quality and token efficiency. +--- + +# Context Triage + +Review the task, files, and instructions provided by the user. 
+ +Identify: + +- Context that is necessary. +- Context that is missing. +- Context that can be removed. +- Risks that require clarification before implementation. + +Return a concise recommendation before suggesting code changes. diff --git a/.github/skills/token-optimization.skill.md b/.github/skills/token-optimization.skill.md new file mode 100644 index 0000000..4950450 --- /dev/null +++ b/.github/skills/token-optimization.skill.md @@ -0,0 +1,10 @@ +--- +name: token-optimization +description: Evaluate prompts, instructions, agents, and context selections for quality, safety, and token efficiency. +--- + +# Token Optimization Skill + +Use this skill when reviewing AI coding workflows for unnecessary context, unclear prompts, unsafe tool use, or inefficient model selection. + +Return findings that connect each recommendation to correctness, safety, developer experience, or token usage. diff --git a/.github/workflows/waza-eval-suite.yml b/.github/workflows/waza-eval-suite.yml new file mode 100644 index 0000000..6081304 --- /dev/null +++ b/.github/workflows/waza-eval-suite.yml @@ -0,0 +1,66 @@ +name: Waza Evals + +on: + pull_request: + paths: + - ".github/agents/**" + - ".github/instructions/**" + - ".github/prompts/**" + - ".github/skills/**" + - "evals/**" + - "skills/**" + - "labs/08-ai-evals-and-observability.md" + push: + branches: ["main"] + paths: + - ".github/agents/**" + - ".github/instructions/**" + - ".github/prompts/**" + - ".github/skills/**" + - "evals/**" + - "skills/**" + - "labs/08-ai-evals-and-observability.md" + workflow_dispatch: + +permissions: + contents: read + +jobs: + evaluate: + name: Run Waza eval suite + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Waza + env: + # Pinned to a Microsoft Waza repository snapshot selected from upstream docs on 2026-05-12. + # If this commit becomes unavailable, update to a reviewed upstream commit and SHA-256 together. 
+ # SHA-256 verified for this exact install.sh content on 2026-05-12. + WAZA_INSTALL_COMMIT: bf77c759d6bee3ba578c1980fa8a176fe8d014e4 + WAZA_INSTALL_SHA256: f9f949c8ef7ed4e3309cee3f1285d2f7a7dea6db1c965ceab09443d3c7910d7a + run: | + curl -fsSL \ + "https://raw.githubusercontent.com/microsoft/waza/${WAZA_INSTALL_COMMIT}/install.sh" \ + -o install-waza.sh + echo "${WAZA_INSTALL_SHA256} install-waza.sh" | sha256sum -c - + bash install-waza.sh + + - name: Verify Waza + run: waza --version + + - name: Run Waza + run: | + mkdir -p waza-results + waza run evals/token-optimization/eval.yaml \ + --verbose \ + --output waza-results/results.json \ + --reporter junit:waza-results/results.xml + + - name: Upload Waza results + if: always() + uses: actions/upload-artifact@v4 + with: + name: waza-results + path: waza-results/ diff --git a/evals/token-optimization/eval.yaml b/evals/token-optimization/eval.yaml new file mode 100644 index 0000000..f1722b5 --- /dev/null +++ b/evals/token-optimization/eval.yaml @@ -0,0 +1,28 @@ +name: token-optimization-eval +description: Evaluation suite for token optimization workshop customizations. +skill: token-optimization +version: "1.0" + +config: + # Use mock for pull request validation. For real model runs, change this to + # a supported Waza executor, configure the required credentials such as + # GITHUB_TOKEN in Actions, and follow https://microsoft.github.io/waza/. 
+ executor: mock + model: mock-model + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + +graders: + - type: text + name: mentions_context_quality + config: + regex_match: + - "(?i)(context|token|fixture|prompt|instruction)" + - type: behavior + name: bounded_tool_use + config: + max_tool_calls: 5 + +tasks: + - "tasks/*.yaml" diff --git a/evals/token-optimization/tasks/context-triage.yaml b/evals/token-optimization/tasks/context-triage.yaml new file mode 100644 index 0000000..be14737 --- /dev/null +++ b/evals/token-optimization/tasks/context-triage.yaml @@ -0,0 +1,17 @@ +id: context-triage +name: Context triage recommendation +description: The assistant should identify unnecessary context and recommend a measurable prompt or instruction improvement. +tags: + - context + - token-efficiency +inputs: + prompt: | + Review a coding assistant prompt that includes the whole repository for a one-line documentation fix. + Recommend how to reduce context while preserving answer quality. +expected: + output_contains: + - "context" + - "prompt" + - "quality" + behavior: + max_tool_calls: 5 diff --git a/labs/08-ai-evals-and-observability.md b/labs/08-ai-evals-and-observability.md index 451d33c..c03927e 100644 --- a/labs/08-ai-evals-and-observability.md +++ b/labs/08-ai-evals-and-observability.md @@ -2,11 +2,28 @@ ## Core idea -Token optimization should be measured. AI evals help teams compare prompts, instructions, models, memories, and tool configurations using repeatable examples instead of anecdotes. +Token optimization should be measured with repeatable, customer-owned evals. Use Microsoft Waza as the workshop evaluation framework so teams can compare prompts, instructions, agents, models, and tool configurations without relying on proprietary third-party products. -## Candidate eval platform: W&B Weave +## Required eval framework: Microsoft Waza -Consider W&B Weave for tracing, prompt and model comparison, qualitative review, and lightweight observability. 
If a customer already uses another eval platform, keep the workshop tool-agnostic and focus on repeatable datasets, rubrics, and decision criteria. +Use Waza for this chapter's lab and examples: + +- Documentation: https://microsoft.github.io/waza/ +- Repository: https://github.com/microsoft/waza +- Typical commands: `waza run`, `waza check`, `waza compare`, `waza coverage`, and `waza tokens`. +- CI integration: run Waza in GitHub Actions and publish the result files as workflow artifacts. + +Before delivery, confirm attendees can access the Waza documentation and repository from their network. + +This repository includes the minimum workspace artifacts Waza and VS Code customizations need: + +- `skills/token-optimization/SKILL.md` for the Waza skill under evaluation. +- `.github/skills/token-optimization.skill.md` as a workspace skill customization. +- `.github/prompts/context-triage.prompt.md` as a reusable prompt. +- `.github/instructions/token-optimization.instructions.md` as shared instructions. +- `.github/agents/token-eval-reviewer.agent.md` as a VS Code agent customization. +- `evals/token-optimization/eval.yaml` and task files for Waza. +- `.github/workflows/waza-eval-suite.yml` to run the eval suite in Actions. ## What to evaluate @@ -17,15 +34,17 @@ Consider W&B Weave for tracing, prompt and model comparison, qualitative review, - Safety: did it avoid secrets, unsafe commands, or policy violations? - Developer experience: was the answer actionable? -## Recommended tools to consider +## Run Waza in GitHub Actions + +The Actions workflow should: -- W&B Weave: tracing, prompt/version comparison, human review workflows. -- promptfoo: lightweight prompt and model regression testing. -- LangSmith: tracing, datasets, and eval workflows for LangChain-based systems. -- OpenAI Evals or provider-native eval tools: model and prompt comparison. -- Azure AI Evaluation: useful for Azure-hosted AI workflows. -- Ragas or DeepEval: evaluation patterns for retrieval-augmented generation. 
-- Custom GitHub Actions or CI checks: simple regression suites for prompts and agent instructions. +1. Check out the repository. +2. Install Waza from the official Microsoft project. +3. Run `waza run evals/token-optimization/eval.yaml --verbose`. +4. Save JSON and JUnit results. +5. Upload results as artifacts for review. + +Use the mock executor for quick pull request validation. Switch to a real executor only when the customer is ready to provide the required credentials and accept the cost and data handling implications. ## Minimal eval dataset @@ -39,11 +58,24 @@ Start with 10-20 examples: ## Hands-on lab -1. Select three representative prompts. -2. Run each with two instruction sets or two models. -3. Score outputs from 1-5 on correctness, usefulness, and cost. -4. Decide which change should become the new default. +1. Open the repository's Waza workflow in `.github/workflows/waza-eval-suite.yml`. +2. Review the skill, prompt, instruction, and agent customization files. +3. Run the Waza workflow from the Actions tab or by opening a pull request. +4. Download the Waza result artifacts and identify which task failed or passed. +5. Modify one prompt or instruction and rerun the workflow to compare results. +6. Decide whether the customization improved correctness, safety, and token efficiency. + +## Run customer analyses in VS Code + +Use the Chat Customizations Evaluations extensions for VS Code to help customers run the same style of analysis locally: + +1. Install the Chat Customizations Evaluations extensions for VS Code as described in their documentation. +2. Open the customer's repository in VS Code. +3. Add or review the workspace customizations: `*.skill.md`, `*.prompt.md`, `*.instructions.md`, and `*.agent.md`. +4. Run local evaluations against representative prompts and fixtures. +5. Compare local results with the GitHub Actions Waza results. +6. Promote only the customizations that improve measured outcomes. 
## Practical recommendation -Use evals to justify changes to model routing, instruction files, MCP configuration, and memory strategy. +Use Waza evals to justify changes to model routing, instruction files, prompt templates, agent definitions, Model Context Protocol (MCP) configuration, and memory strategy. Keep the eval data small, explicit, customer-owned, and safe to run in CI. diff --git a/labs/README.md b/labs/README.md index 78ea9ea..45150a5 100644 --- a/labs/README.md +++ b/labs/README.md @@ -22,7 +22,7 @@ Attendees will learn how to: - A GitHub account and access to an AI coding/chat tool. - A small sample repository with issues, tests, documentation, and a few realistic defects. - Optional: access to organization billing, Copilot usage, cloud AI usage, or model provider dashboards. -- Optional: W&B Weave, LangSmith, promptfoo, OpenAI Evals, Azure AI Evaluation, or another eval/observability tool. +- Microsoft Waza and the Chat Customizations Evaluations extensions for VS Code for the Chapter 8 eval lab. ## Delivery formats diff --git a/skills/token-optimization/SKILL.md b/skills/token-optimization/SKILL.md new file mode 100644 index 0000000..421a5fd --- /dev/null +++ b/skills/token-optimization/SKILL.md @@ -0,0 +1,29 @@ +--- +name: token-optimization +type: analysis +description: | + USE FOR: Evaluating prompts, instructions, agents, and context choices for token-efficient software engineering workflows. + DO NOT USE FOR: Replacing security review, exposing secrets, or analyzing customer data without permission. +license: MIT +metadata: + version: "1.0" +--- + +# Token Optimization + +## Overview + +Help teams evaluate whether their AI coding customizations improve quality, safety, and token efficiency. + +## Triggers + +- "Evaluate this prompt for token efficiency." +- "Compare these instructions for quality and cost." +- "Review this agent customization before we make it the default." + +## Expectations + +- Prefer customer-owned eval data. 
+- Keep fixtures small and relevant. +- Score correctness, groundedness, safety, developer experience, and token efficiency. +- Recommend measurable changes rather than subjective preferences.