From b7330b0fed7130b72a8fd3dd844af15f432c1867 Mon Sep 17 00:00:00 2001 From: Grivn Date: Fri, 15 May 2026 03:15:51 +0000 Subject: [PATCH] feat: add eval loop harness module Introduce eval-loop as a feedback-facing harness module with scenarios, suites, rubrics, protocol skills, lifecycle hooks, and evaluator guidance. Add Codex projection support for eval-loop skills and runtime state, plus a Codex app-server smoke target that verifies the projected eval skills are discoverable. Document the eval-loop design in English and Chinese and link it from the harness docs. --- Makefile | 5 +- docs/harness/README.md | 2 + docs/harness/eval-loop/DESIGN.md | 90 ++++++++++++++++ docs/zh/harness/README.md | 2 + docs/zh/harness/eval-loop/DESIGN.md | 88 +++++++++++++++ harness/eval/README.md | 16 +++ harness/hosts/codex/projector.sh | 90 +++++++++++++++- harness/modules/README.md | 3 +- harness/modules/eval-loop/GUIDE.md | 50 +++++++++ harness/modules/eval-loop/README.md | 101 ++++++++++++++++++ harness/modules/eval-loop/env.sh | 14 +++ harness/modules/eval-loop/hooks/compact.md | 13 +++ harness/modules/eval-loop/hooks/nudge.md | 11 ++ harness/modules/eval-loop/hooks/prime.md | 11 ++ harness/modules/eval-loop/hooks/remind.md | 12 +++ harness/modules/eval-loop/module.json | 69 ++++++++++++ .../eval-loop/rubrics/eval-asset-quality.md | 22 ++++ .../rubrics/interface-loop-behavior.md | 22 ++++ .../scenarios/docs/bilingual-doc-sync.md | 29 +++++ .../memory/project-preference-recall.md | 28 +++++ .../scenarios/setup/host-projection-smoke.md | 27 +++++ .../scenarios/skill/skill-creation-reuse.md | 28 +++++ .../modules/eval-loop/skills/eval_analyze.md | 39 +++++++ .../modules/eval-loop/skills/eval_improve.md | 33 ++++++ harness/modules/eval-loop/skills/eval_plan.md | 40 +++++++ harness/modules/eval-loop/skills/eval_run.md | 31 ++++++ .../modules/eval-loop/subagents/evaluator.md | 20 ++++ .../modules/eval-loop/suites/regression.json | 16 +++ harness/modules/eval-loop/suites/smoke.json | 
15 +++ harness/setup/README.md | 1 + scripts/codex_app_server_eval.py | 14 ++- 31 files changed, 937 insertions(+), 5 deletions(-) create mode 100644 docs/harness/eval-loop/DESIGN.md create mode 100644 docs/zh/harness/eval-loop/DESIGN.md create mode 100644 harness/modules/eval-loop/GUIDE.md create mode 100644 harness/modules/eval-loop/README.md create mode 100644 harness/modules/eval-loop/env.sh create mode 100644 harness/modules/eval-loop/hooks/compact.md create mode 100644 harness/modules/eval-loop/hooks/nudge.md create mode 100644 harness/modules/eval-loop/hooks/prime.md create mode 100644 harness/modules/eval-loop/hooks/remind.md create mode 100644 harness/modules/eval-loop/module.json create mode 100644 harness/modules/eval-loop/rubrics/eval-asset-quality.md create mode 100644 harness/modules/eval-loop/rubrics/interface-loop-behavior.md create mode 100644 harness/modules/eval-loop/scenarios/docs/bilingual-doc-sync.md create mode 100644 harness/modules/eval-loop/scenarios/memory/project-preference-recall.md create mode 100644 harness/modules/eval-loop/scenarios/setup/host-projection-smoke.md create mode 100644 harness/modules/eval-loop/scenarios/skill/skill-creation-reuse.md create mode 100644 harness/modules/eval-loop/skills/eval_analyze.md create mode 100644 harness/modules/eval-loop/skills/eval_improve.md create mode 100644 harness/modules/eval-loop/skills/eval_plan.md create mode 100644 harness/modules/eval-loop/skills/eval_run.md create mode 100644 harness/modules/eval-loop/subagents/evaluator.md create mode 100644 harness/modules/eval-loop/suites/regression.json create mode 100644 harness/modules/eval-loop/suites/smoke.json diff --git a/Makefile b/Makefile index 764c1b4..b352950 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ ifeq ($(GOBIN),) GOBIN := $(shell go env GOPATH)/bin endif -.PHONY: deps build install uninstall test unit vet harness-validate codex-app-eval codex-app-eval-suite codex-memory-deep-eval codex-skill-deep-eval docker-build 
docker-run compose-up compose-down compose-dev release-snapshot clean help +.PHONY: deps build install uninstall test unit vet harness-validate codex-app-eval codex-app-eval-suite codex-memory-deep-eval codex-skill-deep-eval codex-eval-loop-smoke docker-build docker-run compose-up compose-down compose-dev release-snapshot clean help .DEFAULT_GOAL := help @@ -60,6 +60,9 @@ codex-memory-deep-eval: ## Run deep real Codex app-server memory regression suit codex-skill-deep-eval: ## Run deep real Codex app-server skill regression suite python3 scripts/codex_app_server_eval.py --suite --suite-name skill-deep +codex-eval-loop-smoke: ## Run real Codex app-server eval-loop projection smoke check + python3 scripts/codex_app_server_eval.py --module eval-loop + # ── Containers / Deployment ────────────────────────────────────────── docker-build: ## Build runtime Docker image diff --git a/docs/harness/README.md b/docs/harness/README.md index cb546e4..57e5244 100644 --- a/docs/harness/README.md +++ b/docs/harness/README.md @@ -30,6 +30,7 @@ projection into host surfaces, and optional daemon scheduling. | Harness Roadmap | [EN](ROADMAP.md) / [中文](../zh/harness/ROADMAP.md) | | Memory Loop | [EN](memory-loop/DESIGN.md) / [中文](../zh/harness/memory-loop/DESIGN.md) / [site](../site/memory-loop/site.html) | | Skill Loop | [EN](skill-loop/DESIGN.md) / [中文](../zh/harness/skill-loop/DESIGN.md) / [site](../site/skill-loop/site.html) | +| Eval Loop | [EN](eval-loop/DESIGN.md) / [中文](../zh/harness/eval-loop/DESIGN.md) | ## Installable Assets @@ -37,6 +38,7 @@ projection into host surfaces, and optional daemon scheduling. 
| --- | --- | | Memory Loop | [harness/modules/memory-loop](../../harness/modules/memory-loop/README.md) | | Skill Loop | [harness/modules/skill-loop](../../harness/modules/skill-loop/README.md) | +| Eval Loop | [harness/modules/eval-loop](../../harness/modules/eval-loop/README.md) | ## Repository Layout diff --git a/docs/harness/eval-loop/DESIGN.md b/docs/harness/eval-loop/DESIGN.md new file mode 100644 index 0000000..2943835 --- /dev/null +++ b/docs/harness/eval-loop/DESIGN.md @@ -0,0 +1,90 @@ +# Eval Loop MVP Design + +Chinese version: [DESIGN.md](../../zh/harness/eval-loop/DESIGN.md) + +Installable MVP assets: [harness/modules/eval-loop](../../../harness/modules/eval-loop/README.md) + +The eval loop is Mnemon's feedback-facing harness module. It defines how a +HostAgent is tested through realistic scenarios, how evidence is collected, and +how stable failures become curated improvement candidates. + +## Positioning + +The eval loop is a peer of memory-loop and skill-loop. It is not their parent +module. Memory-loop and skill-loop directly affect the HostAgent interface by +changing remembered context and reusable working methods. Eval-loop observes +those effects through scenario execution and feeds findings back into the +project. + +```text +harness/modules/ +├── memory-loop +├── skill-loop +└── eval-loop +``` + +## Core Model + +```text +scenario + | + v +isolated workspace + .mnemon + host projection + | + v +Codex app server HostAgent + | + v +artifacts: transcript, diff, memory state, skill evidence, logs + | + v +rubric judgement + | + v +report and improvement candidate +``` + +Codex app server is the current primary HostAgent. Generic HostAgent +requirements should be extracted from repeated Codex-first scenarios rather +than designed upfront. + +## Assets + +| Asset | Purpose | +| --- | --- | +| Scenario | A reproducible task pressure case with target, setup, prompt, evidence, and expected observations. 
| +| Suite | A named set of scenarios and loop configuration. | +| Rubric | Criteria for judging behavior and eval asset quality. | +| Skill | Protocol methods for planning, running, analyzing, and improving evals. | +| Evaluator | Background curation worker for deduping candidates and summarizing trends. | + +## Lifecycle + +Eval assets have a stricter lifecycle than skills because they define how the +project judges improvement. + +```text +ephemeral -> candidate -> promoted -> canonical -> retired +``` + +- `ephemeral`: temporary exploration, no review required. +- `candidate`: proposed asset with initial evidence. +- `promoted`: curated asset for local regression. +- `canonical`: stable asset for long-term comparison or gates. +- `retired`: obsolete, flaky, or superseded asset. + +This reduces review pressure: the agent can explore freely, but only stable and +useful assets are reviewed for promotion. + +## First Scope + +The first scenarios focus on Mnemon's current self-evolution work: + +- memory preference recall +- skill creation and reuse +- bilingual documentation synchronization +- host projection smoke checks + +These scenarios evaluate memory-loop and skill-loop today, but the eval-loop +framework is intentionally broader. It can also evaluate setup, host adapters, +docs workflow, commit discipline, and eval-loop itself. 
diff --git a/docs/zh/harness/README.md b/docs/zh/harness/README.md index a31696c..05e6b44 100644 --- a/docs/zh/harness/README.md +++ b/docs/zh/harness/README.md @@ -25,6 +25,7 @@ host surface projection,以及可选的 daemon scheduling。 | Harness Roadmap | [中文](ROADMAP.md) / [EN](../../harness/ROADMAP.md) | | Memory Loop | [中文](memory-loop/DESIGN.md) / [EN](../../harness/memory-loop/DESIGN.md) / [site](../../site/memory-loop/site.html) | | Skill Loop | [中文](skill-loop/DESIGN.md) / [EN](../../harness/skill-loop/DESIGN.md) / [site](../../site/skill-loop/site.html) | +| Eval Loop | [中文](eval-loop/DESIGN.md) / [EN](../../harness/eval-loop/DESIGN.md) | ## 可安装资产 @@ -32,6 +33,7 @@ host surface projection,以及可选的 daemon scheduling。 | --- | --- | | Memory Loop | [harness/modules/memory-loop](../../../harness/modules/memory-loop/README.md) | | Skill Loop | [harness/modules/skill-loop](../../../harness/modules/skill-loop/README.md) | +| Eval Loop | [harness/modules/eval-loop](../../../harness/modules/eval-loop/README.md) | ## 仓库布局 diff --git a/docs/zh/harness/eval-loop/DESIGN.md b/docs/zh/harness/eval-loop/DESIGN.md new file mode 100644 index 0000000..de338ea --- /dev/null +++ b/docs/zh/harness/eval-loop/DESIGN.md @@ -0,0 +1,88 @@ +# Eval Loop MVP Design + +英文版本:[DESIGN.md](../../../harness/eval-loop/DESIGN.md) + +可安装 MVP 资产:[harness/modules/eval-loop](../../../../harness/modules/eval-loop/README.md) + +Eval loop 是 Mnemon 的 feedback-facing harness module。它定义如何通过真实 +scenario 测试 HostAgent,如何收集证据,以及如何把稳定失败转化为经过治理的 +改进候选。 + +## 定位 + +Eval loop 与 memory-loop、skill-loop 是平级模块,不是它们的父模块。 +memory-loop 和 skill-loop 直接影响 HostAgent interface:前者影响记忆上下文, +后者影响可复用工作方法。eval-loop 通过 scenario 执行观察这些影响,并把发现 +反馈回项目。 + +```text +harness/modules/ +├── memory-loop +├── skill-loop +└── eval-loop +``` + +## 核心模型 + +```text +scenario + | + v +isolated workspace + .mnemon + host projection + | + v +Codex app server HostAgent + | + v +artifacts: transcript, diff, memory state, skill evidence, logs + | + v +rubric 
judgement + | + v +report and improvement candidate +``` + +Codex app server 是当前 primary HostAgent。通用 HostAgent requirement 应该从 +Codex-first 场景中持续归纳,而不是一开始就前置设计。 + +## 资产 + +| Asset | 作用 | +| --- | --- | +| Scenario | 可复现的任务压力场景,包含 target、setup、prompt、evidence 和预期观察。 | +| Suite | 一组 scenarios 和 loop configuration。 | +| Rubric | 行为判断和 eval asset 质量判断标准。 | +| Skill | eval plan、run、analyze、improve 的 protocol 方法。 | +| Evaluator | 后台 curation worker,用于去重 candidates、总结趋势。 | + +## 生命周期 + +Eval assets 的生命周期应比 skills 更严格,因为它们定义项目如何判断自己是否 +变好。 + +```text +ephemeral -> candidate -> promoted -> canonical -> retired +``` + +- `ephemeral`:临时探索,不需要审阅。 +- `candidate`:有初步证据的候选资产。 +- `promoted`:经过整理,可用于本地回归。 +- `canonical`:稳定,可用于长期对比或 gate。 +- `retired`:过时、不稳定或被替代的资产。 + +这样可以降低 review 压力:agent 可以自由探索,但只有稳定且有价值的资产才进入 +promotion 审阅。 + +## 第一阶段范围 + +第一批场景聚焦 Mnemon 当前的自迭代工作: + +- memory preference recall +- skill creation and reuse +- bilingual documentation synchronization +- host projection smoke checks + +这些场景当前主要评估 memory-loop 和 skill-loop,但 eval-loop 框架本身更通用。 +它也可以评估 setup、host adapter、docs workflow、commit discipline,以及 +eval-loop 自身。 diff --git a/harness/eval/README.md b/harness/eval/README.md index 28c0a4f..664c58b 100644 --- a/harness/eval/README.md +++ b/harness/eval/README.md @@ -2,6 +2,16 @@ This directory documents eval modes for host-wrapped loop testing. +The canonical eval loop module lives under: + +```text +harness/modules/eval-loop/ +``` + +Use `harness/eval/` for project-local runner notes and app-server operation +details. Use `harness/modules/eval-loop/` for reusable eval-loop policy, +scenarios, suites, rubrics, protocol skills, and lifecycle guidance. 
+ ## Codex App-Server Eval The Codex app-server eval uses the real Codex app-server protocol instead of a @@ -38,6 +48,12 @@ Run the longer skill-loop regression suite with: make codex-skill-deep-eval ``` +Run the eval-loop projection smoke check with: + +```bash +make codex-eval-loop-smoke +``` + To run an actual Codex turn, use: ```bash diff --git a/harness/hosts/codex/projector.sh b/harness/hosts/codex/projector.sh index 20eba51..86b539b 100755 --- a/harness/hosts/codex/projector.sh +++ b/harness/hosts/codex/projector.sh @@ -20,6 +20,9 @@ Memory loop install options: Skill loop install options: --host-skills-dir DIR +Eval loop install options: + --host-skills-dir DIR + Uninstall options: --purge-memory --purge-library @@ -95,7 +98,7 @@ if [[ -z "${MODULE}" ]]; then usage >&2 exit 2 fi -if [[ "${MODULE}" != "memory-loop" && "${MODULE}" != "skill-loop" ]]; then +if [[ "${MODULE}" != "memory-loop" && "${MODULE}" != "skill-loop" && "${MODULE}" != "eval-loop" ]]; then echo "unsupported module for Codex: ${MODULE}" >&2 exit 1 fi @@ -321,6 +324,61 @@ EOF echo "Host skills: ${HOST_SKILLS_DIR}" } +install_eval_loop() { + ensure_python + [[ -n "${HOST_SKILLS_DIR}" ]] || HOST_SKILLS_DIR="${CONFIG_DIR}/skills" + copy_common_canonical_assets + mkdir -p \ + "${CANONICAL_MODULE_DIR}/scratch" \ + "${CANONICAL_MODULE_DIR}/candidates" \ + "${CANONICAL_MODULE_DIR}/reports" \ + "${CANONICAL_MODULE_DIR}/artifacts" \ + "${CANONICAL_MODULE_DIR}/retired" \ + "${CANONICAL_MODULE_DIR}/scenarios" \ + "${CANONICAL_MODULE_DIR}/suites" \ + "${CANONICAL_MODULE_DIR}/rubrics" \ + "${HOST_SKILLS_DIR}/eval_plan" \ + "${HOST_SKILLS_DIR}/eval_run" \ + "${HOST_SKILLS_DIR}/eval_analyze" \ + "${HOST_SKILLS_DIR}/eval_improve" \ + "${CONFIG_DIR}/mnemon-eval-loop" + + cp -R "${MODULE_DIR}/scenarios/." "${CANONICAL_MODULE_DIR}/scenarios/" + cp -R "${MODULE_DIR}/suites/." "${CANONICAL_MODULE_DIR}/suites/" + cp -R "${MODULE_DIR}/rubrics/." 
"${CANONICAL_MODULE_DIR}/rubrics/" + + write_runtime_env "${CONFIG_DIR}/mnemon-eval-loop" "MNEMON_EVAL_LOOP_ENV" "MNEMON_EVAL_LOOP_DIR" + install_file "${MODULE_DIR}/GUIDE.md" "${CONFIG_DIR}/mnemon-eval-loop/GUIDE.md" 0644 + cat >> "${CONFIG_DIR}/mnemon-eval-loop/env.sh" </dev/null || true + rmdir "${CANONICAL_MODULE_DIR}/artifacts" 2>/dev/null || true + rmdir "${CANONICAL_MODULE_DIR}/reports" 2>/dev/null || true + rmdir "${CANONICAL_MODULE_DIR}/candidates" 2>/dev/null || true + rmdir "${CANONICAL_MODULE_DIR}/scratch" 2>/dev/null || true + rmdir "${CANONICAL_MODULE_DIR}" 2>/dev/null || true + remove_host_manifest_module + echo "Removed Mnemon eval loop from ${CONFIG_DIR}." +} + case "${ACTION}:${MODULE}" in install:memory-loop) install_memory_loop ;; install:skill-loop) install_skill_loop ;; - status:memory-loop|status:skill-loop) status_module ;; + install:eval-loop) install_eval_loop ;; + status:memory-loop|status:skill-loop|status:eval-loop) status_module ;; uninstall:memory-loop) uninstall_memory_loop ;; uninstall:skill-loop) uninstall_skill_loop ;; + uninstall:eval-loop) uninstall_eval_loop ;; *) echo "unsupported action/module: ${ACTION}/${MODULE}" >&2 exit 1 diff --git a/harness/modules/README.md b/harness/modules/README.md index 08edeb3..bc50623 100644 --- a/harness/modules/README.md +++ b/harness/modules/README.md @@ -5,7 +5,8 @@ This directory contains canonical, host-agnostic loop modules. ```text harness/modules/ ├── memory-loop/ -└── skill-loop/ +├── skill-loop/ +└── eval-loop/ ``` Each module follows the Loop Module Standard and declares its assets in diff --git a/harness/modules/eval-loop/GUIDE.md b/harness/modules/eval-loop/GUIDE.md new file mode 100644 index 0000000..e5ca6cc --- /dev/null +++ b/harness/modules/eval-loop/GUIDE.md @@ -0,0 +1,50 @@ +# Mnemon Eval Loop Guide + +Use the eval loop when a task needs to test whether Mnemon harness behavior +actually improves real HostAgent work. 
+ +## Policy + +- Prefer scenario-driven evals over ad hoc success claims. +- Keep canonical eval assets stable, reproducible, and reviewable. +- Treat LLM-generated evals as ephemeral or candidate assets until they show + stable value. +- Record enough evidence for another maintainer to understand the judgement: + task, host, loop configuration, transcript reference, diff summary, state + changes, rubric result, and proposed next action. +- Do not loosen a rubric to make a run pass. +- Do not promote an eval asset that is flaky, duplicative, too expensive for + its value, or likely to reward harmful behavior. + +## When to Plan an Eval + +Plan an eval when: + +- A memory, skill, setup, host adapter, or docs workflow change claims behavior + improvement. +- A regression is suspected from real project work. +- A repeated failure suggests a missing scenario or rubric. +- An existing scenario no longer distinguishes good behavior from weak behavior. + +## Asset Lifecycle + +Use this lifecycle for scenarios, suites, and rubrics: + +```text +ephemeral -> candidate -> promoted -> canonical -> retired +``` + +- Start with `ephemeral` for exploration. +- Move to `candidate` only after the asset has a clear target, rubric, and + observed value. +- Move to `promoted` after deduplication and at least one stable run. +- Move to `canonical` only when the asset is important enough for long-term + comparison. +- Move to `retired` when it is obsolete, flaky, or superseded. + +## HostAgent Boundary + +Codex app server is the primary HostAgent today. Do not overfit eval assets to +Codex unless the scenario is explicitly testing Codex projection or driver +behavior. Record Codex-specific requirements as observed HostAgent capabilities +before turning them into generic requirements. 
diff --git a/harness/modules/eval-loop/README.md b/harness/modules/eval-loop/README.md new file mode 100644 index 0000000..dbf8091 --- /dev/null +++ b/harness/modules/eval-loop/README.md @@ -0,0 +1,101 @@ +# Mnemon Eval Loop Harness + +This directory is the canonical eval loop module. It is a feedback-facing loop: +it designs and runs realistic harness scenarios, collects evidence, and turns +stable failures into curated improvement candidates. + +The eval loop is not a parent of memory-loop or skill-loop. It is a peer module +that can evaluate interface-facing loops, host projection, setup, documentation +workflow, commit discipline, and its own eval assets. + +## File Tree + +```text +harness/modules/eval-loop/ +├── README.md +├── module.json +├── env.sh +├── GUIDE.md +├── hooks/ +├── skills/ +├── subagents/ +├── scenarios/ +├── suites/ +└── rubrics/ +``` + +## Core Parts + +| Part | Role | +| --- | --- | +| Scenario | A reproducible task pressure case with target, setup, prompt, evidence, and expected observations. | +| Suite | A named group of scenarios with host and loop configuration. | +| Rubric | Review criteria used to judge behavior, stability, and improvement value. | +| Runner | Host-specific machinery that starts isolated workspaces and drives a HostAgent. Codex app server is the current primary runner. | +| Report | Durable output containing transcript references, diffs, loop state, judgement, and next actions. | + +## Eval Asset Lifecycle + +Eval assets are stricter than skill assets because they define how the project +judges improvement. New assets should not become canonical immediately. + +```text +ephemeral -> candidate -> promoted -> canonical -> retired +``` + +- `ephemeral`: one-off exploration in `scratch`; no review required. +- `candidate`: generated or proposed asset with initial evidence. +- `promoted`: curated asset suitable for local regression. +- `canonical`: stable asset suitable for long-term comparison or gates. 
+- `retired`: obsolete, flaky, or superseded asset kept for audit. + +## Runtime Directory Protocol + +Installed runtime state resolves through one environment config: + +```text +$MNEMON_EVAL_LOOP_DIR/ +├── env.sh +├── GUIDE.md +├── scratch/ +├── candidates/ +├── reports/ +├── artifacts/ +└── retired/ +``` + +`env.sh` defines: + +```bash +MNEMON_EVAL_LOOP_ENV=<mnemon-dir>/harness/eval-loop/env.sh +MNEMON_EVAL_LOOP_DIR=<mnemon-dir>/harness/eval-loop +MNEMON_EVAL_LOOP_SCRATCH_DIR=$MNEMON_EVAL_LOOP_DIR/scratch +MNEMON_EVAL_LOOP_CANDIDATES_DIR=$MNEMON_EVAL_LOOP_DIR/candidates +MNEMON_EVAL_LOOP_REPORTS_DIR=$MNEMON_EVAL_LOOP_DIR/reports +MNEMON_EVAL_LOOP_ARTIFACTS_DIR=$MNEMON_EVAL_LOOP_DIR/artifacts +MNEMON_EVAL_LOOP_RETIRED_DIR=$MNEMON_EVAL_LOOP_DIR/retired +``` + +## Codex Install + +Install into the current project: + +```bash +bash harness/setup/install.sh --host codex --module eval-loop +``` + +Check status: + +```bash +bash harness/setup/status.sh --host codex --module eval-loop +``` + +Remove the installed Codex integration while preserving reports and candidates: + +```bash +bash harness/setup/uninstall.sh --host codex --module eval-loop +``` + +Existing project-local Codex app-server eval commands remain available through +`make codex-app-eval-suite`, `make codex-memory-deep-eval`, and +`make codex-skill-deep-eval`. diff --git a/harness/modules/eval-loop/env.sh b/harness/modules/eval-loop/env.sh new file mode 100644 index 0000000..c41e2e4 --- /dev/null +++ b/harness/modules/eval-loop/env.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# Runtime defaults for the Mnemon eval loop. Host projectors rewrite these +# paths when installing the loop into an isolated workspace or global config. 
+ +export MNEMON_EVAL_LOOP_ENV="${MNEMON_EVAL_LOOP_ENV:-${BASH_SOURCE[0]}}" +export MNEMON_EVAL_LOOP_DIR="${MNEMON_EVAL_LOOP_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}" +export MNEMON_EVAL_LOOP_SCRATCH_DIR="${MNEMON_EVAL_LOOP_SCRATCH_DIR:-${MNEMON_EVAL_LOOP_DIR}/scratch}" +export MNEMON_EVAL_LOOP_CANDIDATES_DIR="${MNEMON_EVAL_LOOP_CANDIDATES_DIR:-${MNEMON_EVAL_LOOP_DIR}/candidates}" +export MNEMON_EVAL_LOOP_REPORTS_DIR="${MNEMON_EVAL_LOOP_REPORTS_DIR:-${MNEMON_EVAL_LOOP_DIR}/reports}" +export MNEMON_EVAL_LOOP_ARTIFACTS_DIR="${MNEMON_EVAL_LOOP_ARTIFACTS_DIR:-${MNEMON_EVAL_LOOP_DIR}/artifacts}" +export MNEMON_EVAL_LOOP_RETIRED_DIR="${MNEMON_EVAL_LOOP_RETIRED_DIR:-${MNEMON_EVAL_LOOP_DIR}/retired}" +export MNEMON_EVAL_LOOP_DEFAULT_HOST="${MNEMON_EVAL_LOOP_DEFAULT_HOST:-codex}" +export MNEMON_EVAL_LOOP_DEFAULT_SUITE="${MNEMON_EVAL_LOOP_DEFAULT_SUITE:-smoke}" diff --git a/harness/modules/eval-loop/hooks/compact.md b/harness/modules/eval-loop/hooks/compact.md new file mode 100644 index 0000000..4f97789 --- /dev/null +++ b/harness/modules/eval-loop/hooks/compact.md @@ -0,0 +1,13 @@ +# Eval Loop Compact + +Before context compaction, preserve: + +- Active eval goal and hypothesis. +- Scenario and suite names. +- HostAgent configuration and loop combination. +- Report and artifact paths. +- Rubric outcome and open questions. +- Any candidate eval assets that still need curation. + +Do not carry large transcripts forward in prompt context. Reference artifact +paths instead. diff --git a/harness/modules/eval-loop/hooks/nudge.md b/harness/modules/eval-loop/hooks/nudge.md new file mode 100644 index 0000000..8683db6 --- /dev/null +++ b/harness/modules/eval-loop/hooks/nudge.md @@ -0,0 +1,11 @@ +# Eval Loop Nudge + +At turn completion, if eval work happened: + +- Write or update a report under `$MNEMON_EVAL_LOOP_REPORTS_DIR` when a run + produced evidence. +- Keep raw artifacts under `$MNEMON_EVAL_LOOP_ARTIFACTS_DIR`. 
+- Place newly proposed scenarios, suites, or rubrics under + `$MNEMON_EVAL_LOOP_CANDIDATES_DIR` unless they were explicitly reviewed. +- Summarize whether the result suggests a code change, loop policy change, + host adapter change, docs update, or eval asset change. diff --git a/harness/modules/eval-loop/hooks/prime.md b/harness/modules/eval-loop/hooks/prime.md new file mode 100644 index 0000000..445c05f --- /dev/null +++ b/harness/modules/eval-loop/hooks/prime.md @@ -0,0 +1,11 @@ +# Eval Loop Prime + +At the start of work, check whether the current task claims harness behavior +improvement or changes eval assets. + +If yes: + +- Load `$MNEMON_EVAL_LOOP_DIR/GUIDE.md` when available. +- Prefer an existing canonical or promoted suite before creating a new scenario. +- Keep new LLM-authored scenarios ephemeral or candidate by default. +- Record the host, loop configuration, and intended evidence before running. diff --git a/harness/modules/eval-loop/hooks/remind.md b/harness/modules/eval-loop/hooks/remind.md new file mode 100644 index 0000000..201579f --- /dev/null +++ b/harness/modules/eval-loop/hooks/remind.md @@ -0,0 +1,12 @@ +# Eval Loop Remind + +Before acting on an eval-related prompt, identify: + +- Target: what behavior or subsystem is being evaluated. +- Scenario: which task pressure case will be run. +- Suite: whether this belongs to smoke, regression, or exploratory coverage. +- Rubric: how behavior will be judged. +- Evidence: which artifacts must be captured. + +If any item is missing, make it explicit in the plan or mark the run +exploratory. 
diff --git a/harness/modules/eval-loop/module.json b/harness/modules/eval-loop/module.json new file mode 100644 index 0000000..a82e5bd --- /dev/null +++ b/harness/modules/eval-loop/module.json @@ -0,0 +1,69 @@ +{ + "schema_version": 1, + "name": "eval-loop", + "version": "0.1.0", + "description": "Runs scenario-driven harness evaluations, collects evidence, and curates improvements without making eval assets canonical by default.", + "loop_type": "feedback", + "direct_interface_effect": false, + "primary_host": "codex", + "lifecycle_events": [ + "prime", + "remind", + "nudge", + "compact" + ], + "assets": { + "guide": "GUIDE.md", + "env": "env.sh", + "runtime_files": [ + "suites/smoke.json", + "suites/regression.json", + "rubrics/eval-asset-quality.md", + "rubrics/interface-loop-behavior.md", + "scenarios/memory/project-preference-recall.md", + "scenarios/skill/skill-creation-reuse.md", + "scenarios/docs/bilingual-doc-sync.md", + "scenarios/setup/host-projection-smoke.md" + ], + "hooks": { + "prime": "hooks/prime.md", + "remind": "hooks/remind.md", + "nudge": "hooks/nudge.md", + "compact": "hooks/compact.md" + }, + "skills": [ + "skills/eval_plan.md", + "skills/eval_run.md", + "skills/eval_analyze.md", + "skills/eval_improve.md" + ], + "subagents": [ + "subagents/evaluator.md" + ] + }, + "state": { + "canonical": [ + ".mnemon/data", + ".mnemon/reports", + ".mnemon/proposals", + ".mnemon/audit" + ], + "loop_runtime": [ + "scratch", + "candidates", + "reports", + "artifacts", + "retired" + ] + }, + "eval_asset_lifecycle": [ + "ephemeral", + "candidate", + "promoted", + "canonical", + "retired" + ], + "host_adapters": { + "codex": "../../hosts/codex" + } +} diff --git a/harness/modules/eval-loop/rubrics/eval-asset-quality.md b/harness/modules/eval-loop/rubrics/eval-asset-quality.md new file mode 100644 index 0000000..5c84fd7 --- /dev/null +++ b/harness/modules/eval-loop/rubrics/eval-asset-quality.md @@ -0,0 +1,22 @@ +# Eval Asset Quality Rubric + +Use this rubric when 
reviewing scenarios, suites, and rubrics for promotion. + +## Pass + +- The target and hypothesis are explicit. +- The setup is reproducible. +- Required evidence is named. +- The pass/weak/fail criteria distinguish behavior quality. +- Runtime cost is appropriate for the intended suite. +- The asset is not a duplicate of existing coverage. + +## Weak + +- The asset is useful but missing one review detail, such as artifact paths, + timeout expectations, or a clear suite placement. + +## Fail + +- The asset is vague, duplicative, flaky by design, too expensive for its value, + or likely to reward weak behavior. diff --git a/harness/modules/eval-loop/rubrics/interface-loop-behavior.md b/harness/modules/eval-loop/rubrics/interface-loop-behavior.md new file mode 100644 index 0000000..72cf251 --- /dev/null +++ b/harness/modules/eval-loop/rubrics/interface-loop-behavior.md @@ -0,0 +1,22 @@ +# Interface Loop Behavior Rubric + +Use this rubric when evaluating whether interface-facing loops improved real +HostAgent behavior. + +## Pass + +- The HostAgent behavior shows evidence that the loop affected the task. +- The effect is relevant to the scenario, not generic compliance. +- The result improves the task outcome without polluting memory, skills, docs, + or workspace state. +- The report includes enough artifacts to review the judgement. + +## Weak + +- The loop was visible but only partially affected the task, or evidence is + incomplete. + +## Fail + +- The loop had no observable effect, caused incorrect behavior, polluted state, + or made the task harder to review. 
diff --git a/harness/modules/eval-loop/scenarios/docs/bilingual-doc-sync.md b/harness/modules/eval-loop/scenarios/docs/bilingual-doc-sync.md new file mode 100644 index 0000000..79d5738 --- /dev/null +++ b/harness/modules/eval-loop/scenarios/docs/bilingual-doc-sync.md @@ -0,0 +1,29 @@ +# Bilingual Documentation Sync + +Target: +- docs workflow +- memory-loop or skill-loop support + +Purpose: +Verify that harness changes update relevant English and Chinese documentation +when the project requires bilingual docs. + +Setup: +- Start an isolated Codex app-server workspace. +- Install the loop combination under test. +- Seed project preference or active skill evidence when the run is testing those + loops. + +Task: +Ask the HostAgent to change a documented harness behavior. + +Expected Evidence: +- Code or harness asset change is present. +- English docs are updated when relevant. +- Chinese docs are updated when relevant. +- The final report mentions verification. + +Rubric: +- pass: code and both language docs are synchronized. +- weak: only one language is updated or docs are incomplete. +- fail: behavior changes without relevant docs. diff --git a/harness/modules/eval-loop/scenarios/memory/project-preference-recall.md b/harness/modules/eval-loop/scenarios/memory/project-preference-recall.md new file mode 100644 index 0000000..d9f1906 --- /dev/null +++ b/harness/modules/eval-loop/scenarios/memory/project-preference-recall.md @@ -0,0 +1,28 @@ +# Project Preference Recall + +Target: +- memory-loop +- HostAgent project behavior + +Purpose: +Verify that a HostAgent can use durable project preferences when a task would +otherwise omit them. + +Setup: +- Start an isolated Codex app-server workspace. +- Install `memory-loop`. +- Seed `.mnemon` with a concrete project preference. + +Task: +Ask the HostAgent to make a small project maintenance change where the seeded +preference matters. + +Expected Evidence: +- The final behavior reflects the seeded preference. 
+- The report references memory evidence or the projected memory loop state. +- No unrelated preference is written to memory. + +Rubric: +- pass: preference is applied and state remains clean. +- weak: preference is mentioned but incompletely applied. +- fail: preference is ignored or memory is polluted. diff --git a/harness/modules/eval-loop/scenarios/setup/host-projection-smoke.md b/harness/modules/eval-loop/scenarios/setup/host-projection-smoke.md new file mode 100644 index 0000000..807dc95 --- /dev/null +++ b/harness/modules/eval-loop/scenarios/setup/host-projection-smoke.md @@ -0,0 +1,27 @@ +# Host Projection Smoke + +Target: +- setup +- host projection + +Purpose: +Verify that a loop module can be installed into a host surface and reported in +the host manifest. + +Setup: +- Use an isolated workspace. +- Run `harness/setup/install.sh` for the target host and module. + +Task: +Install the module, inspect projected files, and run setup status. + +Expected Evidence: +- Runtime state exists under `.mnemon/harness/`. +- Host projection files exist. +- Manifest contains the installed loop. +- Status reports the module as installed. + +Rubric: +- pass: projection, manifest, and status agree. +- weak: projection exists but manifest or status is incomplete. +- fail: install fails or projected state is missing. diff --git a/harness/modules/eval-loop/scenarios/skill/skill-creation-reuse.md b/harness/modules/eval-loop/scenarios/skill/skill-creation-reuse.md new file mode 100644 index 0000000..1939caf --- /dev/null +++ b/harness/modules/eval-loop/scenarios/skill/skill-creation-reuse.md @@ -0,0 +1,28 @@ +# Skill Creation And Reuse + +Target: +- skill-loop +- reusable workflow behavior + +Purpose: +Verify that repeated workflow friction becomes skill evidence and can lead to a +reviewable skill candidate without immediate uncontrolled activation. + +Setup: +- Start an isolated Codex app-server workspace. +- Install `skill-loop`. 
+- Provide a task that repeats a maintenance pattern with known missed steps. + +Task: +Ask the HostAgent to complete the maintenance task and reflect on repeated +workflow friction. + +Expected Evidence: +- Usage evidence is appended for reusable workflow friction. +- Any new skill is drafted as a proposal or candidate. +- The host skill surface is not mutated unexpectedly. + +Rubric: +- pass: evidence is captured and activation remains gated. +- weak: evidence is captured but proposal quality is incomplete. +- fail: no evidence is captured or an unreviewed skill is activated. diff --git a/harness/modules/eval-loop/skills/eval_analyze.md b/harness/modules/eval-loop/skills/eval_analyze.md new file mode 100644 index 0000000..d558bfd --- /dev/null +++ b/harness/modules/eval-loop/skills/eval_analyze.md @@ -0,0 +1,39 @@ +--- +name: eval_analyze +description: Analyze Mnemon harness eval reports, classify outcomes, and extract improvement evidence. +--- + +# Eval Analyze + +Use this skill after an eval run to judge behavior and extract improvement +evidence. + +## Procedure + +1. Read the report, relevant artifact summaries, and the selected rubric. +2. Compare observed behavior to the hypothesis. +3. Classify the outcome: + - `pass`: behavior meets the rubric. + - `weak`: partially useful but missing expected evidence or consistency. + - `fail`: behavior contradicts the target expectation. + - `invalid`: setup or scenario issue prevents judgement. +4. Identify the likely improvement target: + - memory-loop + - skill-loop + - eval-loop + - host adapter + - setup + - docs + - scenario or rubric +5. If a new eval asset is warranted, create a candidate summary instead of + editing canonical assets immediately. 
+ +## Output + +Write a concise analysis with: + +- outcome +- evidence +- likely cause +- recommended next action +- candidate eval asset path, if any diff --git a/harness/modules/eval-loop/skills/eval_improve.md b/harness/modules/eval-loop/skills/eval_improve.md new file mode 100644 index 0000000..3cfc90a --- /dev/null +++ b/harness/modules/eval-loop/skills/eval_improve.md @@ -0,0 +1,33 @@ +--- +name: eval_improve +description: Turn stable Mnemon harness eval findings into scoped project, loop, adapter, docs, or eval asset improvements. +--- + +# Eval Improve + +Use this skill to turn stable eval findings into project changes. + +## Procedure + +1. Confirm the finding is backed by a report or repeated observation. +2. Pick one improvement target. Avoid mixing loop policy changes, runner changes, + docs changes, and scenario promotion in one patch unless they are tightly + coupled. +3. For eval asset changes: + - keep exploratory ideas in scratch + - add candidate assets under runtime candidates + - promote canonical repo assets only after curation +4. For code or harness changes, run the narrowest relevant eval or validation. +5. Summarize what changed, which evidence motivated it, and what remains + unproven. + +## Promotion Checklist + +Before making an eval asset canonical, verify: + +- It has a clear target and hypothesis. +- It has an explicit rubric. +- It produces reviewable artifacts. +- It is not duplicative. +- It is stable enough for its intended suite. +- It does not reward weak or unsafe behavior. diff --git a/harness/modules/eval-loop/skills/eval_plan.md b/harness/modules/eval-loop/skills/eval_plan.md new file mode 100644 index 0000000..9a8417e --- /dev/null +++ b/harness/modules/eval-loop/skills/eval_plan.md @@ -0,0 +1,40 @@ +--- +name: eval_plan +description: Design a scenario-driven Mnemon harness eval with target, hypothesis, HostAgent, loop configuration, evidence, and rubric. 
+--- + +# Eval Plan + +Use this skill to design a scenario-driven eval before running a HostAgent. + +## Procedure + +1. Identify the target: loop, setup behavior, host projection, docs workflow, or + eval-loop itself. +2. Choose an existing scenario and suite when one fits. +3. If no scenario fits, draft an ephemeral plan first. Do not promote it during + the same step. +4. State the hypothesis in observable terms. +5. Select the HostAgent and loop combination. Codex app server is the default + HostAgent for current Mnemon evals. +6. Define the evidence to collect: + - transcript or response reference + - git diff + - `.mnemon` state changes + - projected host surface + - report path + - logs or timeout reason +7. Attach a rubric or mark the run exploratory. + +## Output + +Return a short eval plan with: + +- target +- scenario +- suite +- host +- loops +- hypothesis +- evidence +- expected report path diff --git a/harness/modules/eval-loop/skills/eval_run.md b/harness/modules/eval-loop/skills/eval_run.md new file mode 100644 index 0000000..9f417d4 --- /dev/null +++ b/harness/modules/eval-loop/skills/eval_run.md @@ -0,0 +1,31 @@ +--- +name: eval_run +description: Execute or supervise a planned Mnemon harness eval run in an isolated HostAgent workspace. +--- + +# Eval Run + +Use this skill to execute or supervise a planned eval run. + +## Procedure + +1. Confirm the plan names a host, suite or scenario, and evidence targets. +2. Create or use an isolated workspace. Do not run scenario state in the + developer's active workspace unless the eval explicitly requires it. +3. Install the requested loop modules with `harness/setup`. +4. For Codex app-server evals, use the project runner when available: + + ```bash + python3 scripts/codex_app_server_eval.py --suite + ``` + + Use a specific suite option when the scenario requires it. +5. Collect artifacts and logs before cleanup. +6. 
Record timeouts, setup failures, and HostAgent readiness failures as eval + evidence, not as silent skips. + +## Boundaries + +- Do not change canonical scenarios, suites, or rubrics while running an eval. +- Do not delete artifacts needed for report review. +- Do not treat an exploratory run as a regression result. diff --git a/harness/modules/eval-loop/subagents/evaluator.md b/harness/modules/eval-loop/subagents/evaluator.md new file mode 100644 index 0000000..5509e39 --- /dev/null +++ b/harness/modules/eval-loop/subagents/evaluator.md @@ -0,0 +1,20 @@ +# Evaluator Subagent + +Use this subagent for background eval curation and report synthesis. + +## Responsibilities + +- Cluster repeated eval observations into fewer candidate scenarios. +- Identify duplicate, flaky, or low-value candidates. +- Recommend whether candidates should remain exploratory, become promoted local + regression assets, or be considered for canonical regression. +- Summarize report trends across runs. +- Extract observed HostAgent capability requirements from Codex-first evals. + +## Non-Goals + +- Do not automatically make candidate eval assets canonical. +- Do not loosen rubrics to reduce failures. +- Do not hide setup or HostAgent failures. +- Do not modify memory-loop or skill-loop policy without a separate explicit + improvement task. 
diff --git a/harness/modules/eval-loop/suites/regression.json b/harness/modules/eval-loop/suites/regression.json new file mode 100644 index 0000000..62001e5 --- /dev/null +++ b/harness/modules/eval-loop/suites/regression.json @@ -0,0 +1,16 @@ +{ + "name": "regression", + "description": "Broader local regression suite for harness self-evolution behavior.", + "host": "codex", + "lifecycle": "promoted", + "scenarios": [ + "setup/host-projection-smoke", + "memory/project-preference-recall", + "skill/skill-creation-reuse", + "docs/bilingual-doc-sync" + ], + "rubrics": [ + "eval-asset-quality", + "interface-loop-behavior" + ] +} diff --git a/harness/modules/eval-loop/suites/smoke.json b/harness/modules/eval-loop/suites/smoke.json new file mode 100644 index 0000000..72dfdc3 --- /dev/null +++ b/harness/modules/eval-loop/suites/smoke.json @@ -0,0 +1,15 @@ +{ + "name": "smoke", + "description": "Fast checks for eval-loop setup and core interface-loop behavior.", + "host": "codex", + "lifecycle": "promoted", + "scenarios": [ + "setup/host-projection-smoke", + "memory/project-preference-recall", + "skill/skill-creation-reuse" + ], + "rubrics": [ + "eval-asset-quality", + "interface-loop-behavior" + ] +} diff --git a/harness/setup/README.md b/harness/setup/README.md index 59cbefb..9a7e7c5 100644 --- a/harness/setup/README.md +++ b/harness/setup/README.md @@ -19,6 +19,7 @@ bash harness/setup/install.sh --host claude-code --module memory-loop bash harness/setup/status.sh --host claude-code bash harness/setup/uninstall.sh --host claude-code --module memory-loop bash harness/setup/install.sh --host codex --module memory-loop +bash harness/setup/install.sh --host codex --module eval-loop ``` Host-specific projection logic lives under `harness/hosts/<host>/`.
Loop assets diff --git a/scripts/codex_app_server_eval.py b/scripts/codex_app_server_eval.py index beaad33..adb8a33 100755 --- a/scripts/codex_app_server_eval.py +++ b/scripts/codex_app_server_eval.py @@ -208,6 +208,15 @@ def setup_workspace(args: argparse.Namespace, root: Path) -> tuple[Path, Path, P env["MNEMON_SKILL_LOOP_ARCHIVED_DIR"] = str(skill_dir / "skills" / "archived") env["MNEMON_SKILL_LOOP_USAGE_FILE"] = str(skill_dir / "skills" / ".usage.jsonl") env["MNEMON_SKILL_LOOP_PROPOSALS_DIR"] = str(skill_dir / "proposals") + if "eval-loop" in args.modules: + eval_dir = mnemon_dir / "harness" / "eval-loop" + env["MNEMON_EVAL_LOOP_ENV"] = str(eval_dir / "env.sh") + env["MNEMON_EVAL_LOOP_DIR"] = str(eval_dir) + env["MNEMON_EVAL_LOOP_SCRATCH_DIR"] = str(eval_dir / "scratch") + env["MNEMON_EVAL_LOOP_CANDIDATES_DIR"] = str(eval_dir / "candidates") + env["MNEMON_EVAL_LOOP_REPORTS_DIR"] = str(eval_dir / "reports") + env["MNEMON_EVAL_LOOP_ARTIFACTS_DIR"] = str(eval_dir / "artifacts") + env["MNEMON_EVAL_LOOP_RETIRED_DIR"] = str(eval_dir / "retired") if args.isolated_codex_home: codex_home = run_root / "codex-home" codex_home.mkdir(parents=True, exist_ok=True) @@ -302,6 +311,7 @@ def __init__( SKILL_LOOP_EXPECTED_SKILLS = ["skill_observe", "skill_curate", "skill_author", "skill_manage"] +EVAL_LOOP_EXPECTED_SKILLS = ["eval_plan", "eval_run", "eval_analyze", "eval_improve"] def setup_none(workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> None: @@ -1128,7 +1138,7 @@ def parse_args(argv: list[str]) -> argparse.Namespace: "--module", dest="modules", action="append", - choices=["memory-loop", "skill-loop"], + choices=["memory-loop", "skill-loop", "eval-loop"], default=[], help="Harness module to install. May be repeated. 
Defaults to memory-loop.", ) @@ -1163,6 +1173,8 @@ def parse_args(argv: list[str]) -> argparse.Namespace: expected.extend(["memory_get", "memory_set"]) if "skill-loop" in args.modules: expected.extend(SKILL_LOOP_EXPECTED_SKILLS) + if "eval-loop" in args.modules: + expected.extend(EVAL_LOOP_EXPECTED_SKILLS) args.expected_skills = expected return args