From c6abae68333f2e278f687172bd7342805c56dedc Mon Sep 17 00:00:00 2001
From: lamek <kevin.lamenzo@gmail.com>
Date: Sun, 7 Jun 2026 01:47:53 +0000
Subject: [PATCH 1/2] Add experimental AI Evaluations topic page

---
 sites/docs/src/content/ai/evals.md | 30 ++++++++++++++++++++++++++++++
 sites/docs/src/data/sidenav/ai.yml |  2 ++
 2 files changed, 32 insertions(+)
 create mode 100644 sites/docs/src/content/ai/evals.md

diff --git a/sites/docs/src/content/ai/evals.md b/sites/docs/src/content/ai/evals.md
new file mode 100644
index 0000000000..b4b7cd260e
--- /dev/null
+++ b/sites/docs/src/content/ai/evals.md
@@ -0,0 +1,30 @@
+---
+title: AI Evaluations
+sidenav: ai
+description: >
+  Learn about Dart and Flutter's evaluation frameworks for measuring AI tooling
+  reliability.
+---
+
+:::experimental
+Evaluation tooling and benchmarks are experimental and likely to change.
+:::
+
+To explore the evaluation strategy,
+view the open-source dataset, scoring rubrics,
+or get involved with community benchmark datasets,
+visit the [Flutter Evals repository](https://github.com/flutter/evals).
+
+Evaluating the capabilities and reliability of AI agents requires testing
+approaches that model actual developer tasks.
+Because LLMs are non-deterministic,
+standard unit testing is insufficient for verifying agentic behaviors like
+codebase navigation, plan execution, and code synthesis.
+
+To build developer confidence in AI tooling,
+Dart and Flutter use an evaluation system ("evals")
+to test critical user journeys (CUJs).
+Evals measure both deterministic code correctness
+(compilation, lints, and automated tests) and qualitative performance
+(reasoning, safety, and conciseness) using automated model judges
+and expert human grading.
diff --git a/sites/docs/src/data/sidenav/ai.yml b/sites/docs/src/data/sidenav/ai.yml
index d7643f3bc1..f077ff7bf8 100644
--- a/sites/docs/src/data/sidenav/ai.yml
+++ b/sites/docs/src/data/sidenav/ai.yml
@@ -19,6 +19,8 @@
           permalink: /ai/gemini-cli-extension
     - title: Developer experience
       permalink: /ai/best-practices/developer-experience
+    - title: "AI Evaluations (experimental)"
+      permalink: /ai/evals
 
 - title: Build AI-powered apps
   expanded: true

From 04f0aaeb569463df703947abefdddce6ab790285 Mon Sep 17 00:00:00 2001
From: lamek <kevin.lamenzo@gmail.com>
Date: Sun, 7 Jun 2026 13:35:30 +0000
Subject: [PATCH 2/2] Fix grammar check comment in evals.md

---
 sites/docs/src/content/ai/evals.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sites/docs/src/content/ai/evals.md b/sites/docs/src/content/ai/evals.md
index b4b7cd260e..35f66c6a19 100644
--- a/sites/docs/src/content/ai/evals.md
+++ b/sites/docs/src/content/ai/evals.md
@@ -11,7 +11,7 @@ Evaluation tooling and benchmarks are experimental and likely to change.
 :::
 
 To explore the evaluation strategy,
-view the open-source dataset, scoring rubrics,
+view the open-source dataset and scoring rubrics,
 or get involved with community benchmark datasets,
 visit the [Flutter Evals repository](https://github.com/flutter/evals).