From b7330b0fed7130b72a8fd3dd844af15f432c1867 Mon Sep 17 00:00:00 2001
From: Grivn <grivn.wang@gmail.com>
Date: Fri, 15 May 2026 03:15:51 +0000
Subject: [PATCH] feat: add eval loop harness module

Introduce eval-loop as a feedback-facing harness module with scenarios, suites, rubrics, protocol skills, lifecycle hooks, and evaluator guidance.

Add Codex projection support for eval-loop skills and runtime state, plus a Codex app-server smoke target that verifies the projected eval skills are discoverable.

Document the eval-loop design in English and Chinese and link it from the harness docs.
---
 Makefile                                      |   5 +-
 docs/harness/README.md                        |   2 +
 docs/harness/eval-loop/DESIGN.md              |  90 ++++++++++++++++
 docs/zh/harness/README.md                     |   2 +
 docs/zh/harness/eval-loop/DESIGN.md           |  88 +++++++++++++++
 harness/eval/README.md                        |  16 +++
 harness/hosts/codex/projector.sh              |  90 +++++++++++++++-
 harness/modules/README.md                     |   3 +-
 harness/modules/eval-loop/GUIDE.md            |  50 +++++++++
 harness/modules/eval-loop/README.md           | 101 ++++++++++++++++++
 harness/modules/eval-loop/env.sh              |  14 +++
 harness/modules/eval-loop/hooks/compact.md    |  13 +++
 harness/modules/eval-loop/hooks/nudge.md      |  11 ++
 harness/modules/eval-loop/hooks/prime.md      |  11 ++
 harness/modules/eval-loop/hooks/remind.md     |  12 +++
 harness/modules/eval-loop/module.json         |  69 ++++++++++++
 .../eval-loop/rubrics/eval-asset-quality.md   |  22 ++++
 .../rubrics/interface-loop-behavior.md        |  22 ++++
 .../scenarios/docs/bilingual-doc-sync.md      |  29 +++++
 .../memory/project-preference-recall.md       |  28 +++++
 .../scenarios/setup/host-projection-smoke.md  |  27 +++++
 .../scenarios/skill/skill-creation-reuse.md   |  28 +++++
 .../modules/eval-loop/skills/eval_analyze.md  |  39 +++++++
 .../modules/eval-loop/skills/eval_improve.md  |  33 ++++++
 harness/modules/eval-loop/skills/eval_plan.md |  40 +++++++
 harness/modules/eval-loop/skills/eval_run.md  |  31 ++++++
 .../modules/eval-loop/subagents/evaluator.md  |  20 ++++
 .../modules/eval-loop/suites/regression.json  |  16 +++
 harness/modules/eval-loop/suites/smoke.json   |  15 +++
 harness/setup/README.md                       |   1 +
 scripts/codex_app_server_eval.py              |  14 ++-
 31 files changed, 937 insertions(+), 5 deletions(-)
 create mode 100644 docs/harness/eval-loop/DESIGN.md
 create mode 100644 docs/zh/harness/eval-loop/DESIGN.md
 create mode 100644 harness/modules/eval-loop/GUIDE.md
 create mode 100644 harness/modules/eval-loop/README.md
 create mode 100644 harness/modules/eval-loop/env.sh
 create mode 100644 harness/modules/eval-loop/hooks/compact.md
 create mode 100644 harness/modules/eval-loop/hooks/nudge.md
 create mode 100644 harness/modules/eval-loop/hooks/prime.md
 create mode 100644 harness/modules/eval-loop/hooks/remind.md
 create mode 100644 harness/modules/eval-loop/module.json
 create mode 100644 harness/modules/eval-loop/rubrics/eval-asset-quality.md
 create mode 100644 harness/modules/eval-loop/rubrics/interface-loop-behavior.md
 create mode 100644 harness/modules/eval-loop/scenarios/docs/bilingual-doc-sync.md
 create mode 100644 harness/modules/eval-loop/scenarios/memory/project-preference-recall.md
 create mode 100644 harness/modules/eval-loop/scenarios/setup/host-projection-smoke.md
 create mode 100644 harness/modules/eval-loop/scenarios/skill/skill-creation-reuse.md
 create mode 100644 harness/modules/eval-loop/skills/eval_analyze.md
 create mode 100644 harness/modules/eval-loop/skills/eval_improve.md
 create mode 100644 harness/modules/eval-loop/skills/eval_plan.md
 create mode 100644 harness/modules/eval-loop/skills/eval_run.md
 create mode 100644 harness/modules/eval-loop/subagents/evaluator.md
 create mode 100644 harness/modules/eval-loop/suites/regression.json
 create mode 100644 harness/modules/eval-loop/suites/smoke.json

diff --git a/Makefile b/Makefile
index 764c1b4..b352950 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ ifeq ($(GOBIN),)
   GOBIN     := $(shell go env GOPATH)/bin
 endif
 
-.PHONY: deps build install uninstall test unit vet harness-validate codex-app-eval codex-app-eval-suite codex-memory-deep-eval codex-skill-deep-eval docker-build docker-run compose-up compose-down compose-dev release-snapshot clean help
+.PHONY: deps build install uninstall test unit vet harness-validate codex-app-eval codex-app-eval-suite codex-memory-deep-eval codex-skill-deep-eval codex-eval-loop-smoke docker-build docker-run compose-up compose-down compose-dev release-snapshot clean help
 
 .DEFAULT_GOAL := help
 
@@ -60,6 +60,9 @@ codex-memory-deep-eval: ## Run deep real Codex app-server memory regression suit
 codex-skill-deep-eval: ## Run deep real Codex app-server skill regression suite
 	python3 scripts/codex_app_server_eval.py --suite --suite-name skill-deep
 
+codex-eval-loop-smoke: ## Run real Codex app-server eval-loop projection smoke check
+	python3 scripts/codex_app_server_eval.py --module eval-loop
+
 # ── Containers / Deployment ──────────────────────────────────────────
 
 docker-build: ## Build runtime Docker image
diff --git a/docs/harness/README.md b/docs/harness/README.md
index cb546e4..57e5244 100644
--- a/docs/harness/README.md
+++ b/docs/harness/README.md
@@ -30,6 +30,7 @@ projection into host surfaces, and optional daemon scheduling.
 | Harness Roadmap | [EN](ROADMAP.md) / [中文](../zh/harness/ROADMAP.md) |
 | Memory Loop | [EN](memory-loop/DESIGN.md) / [中文](../zh/harness/memory-loop/DESIGN.md) / [site](../site/memory-loop/site.html) |
 | Skill Loop | [EN](skill-loop/DESIGN.md) / [中文](../zh/harness/skill-loop/DESIGN.md) / [site](../site/skill-loop/site.html) |
+| Eval Loop | [EN](eval-loop/DESIGN.md) / [中文](../zh/harness/eval-loop/DESIGN.md) |
 
 ## Installable Assets
 
@@ -37,6 +38,7 @@ projection into host surfaces, and optional daemon scheduling.
 | --- | --- |
 | Memory Loop | [harness/modules/memory-loop](../../harness/modules/memory-loop/README.md) |
 | Skill Loop | [harness/modules/skill-loop](../../harness/modules/skill-loop/README.md) |
+| Eval Loop | [harness/modules/eval-loop](../../harness/modules/eval-loop/README.md) |
 
 ## Repository Layout
 
diff --git a/docs/harness/eval-loop/DESIGN.md b/docs/harness/eval-loop/DESIGN.md
new file mode 100644
index 0000000..2943835
--- /dev/null
+++ b/docs/harness/eval-loop/DESIGN.md
@@ -0,0 +1,90 @@
+# Eval Loop MVP Design
+
+Chinese version: [DESIGN.md](../../zh/harness/eval-loop/DESIGN.md)
+
+Installable MVP assets: [harness/modules/eval-loop](../../../harness/modules/eval-loop/README.md)
+
+The eval loop is Mnemon's feedback-facing harness module. It defines how a
+HostAgent is tested through realistic scenarios, how evidence is collected, and
+how stable failures become curated improvement candidates.
+
+## Positioning
+
+The eval loop is a peer of memory-loop and skill-loop. It is not their parent
+module. Memory-loop and skill-loop directly affect the HostAgent interface by
+changing remembered context and reusable working methods. Eval-loop observes
+those effects through scenario execution and feeds findings back into the
+project.
+
+```text
+harness/modules/
+├── memory-loop
+├── skill-loop
+└── eval-loop
+```
+
+## Core Model
+
+```text
+scenario
+   |
+   v
+isolated workspace + .mnemon + host projection
+   |
+   v
+Codex app server HostAgent
+   |
+   v
+artifacts: transcript, diff, memory state, skill evidence, logs
+   |
+   v
+rubric judgement
+   |
+   v
+report and improvement candidate
+```
+
+Codex app server is the current primary HostAgent. Generic HostAgent
+requirements should be extracted from repeated Codex-first scenarios rather
+than designed upfront.
+
+## Assets
+
+| Asset | Purpose |
+| --- | --- |
+| Scenario | A reproducible task pressure case with target, setup, prompt, evidence, and expected observations. |
+| Suite | A named set of scenarios and loop configuration. |
+| Rubric | Criteria for judging behavior and eval asset quality. |
+| Skill | Protocol methods for planning, running, analyzing, and improving evals. |
+| Evaluator | Background curation worker for deduping candidates and summarizing trends. |
+
+## Lifecycle
+
+Eval assets have a stricter lifecycle than skills because they define how the
+project judges improvement.
+
+```text
+ephemeral -> candidate -> promoted -> canonical -> retired
+```
+
+- `ephemeral`: temporary exploration, no review required.
+- `candidate`: proposed asset with initial evidence.
+- `promoted`: curated asset for local regression.
+- `canonical`: stable asset for long-term comparison or gates.
+- `retired`: obsolete, flaky, or superseded asset.
+
+This reduces review pressure: the agent can explore freely, but only stable and
+useful assets are reviewed for promotion.
+
+## First Scope
+
+The first scenarios focus on Mnemon's current self-evolution work:
+
+- memory preference recall
+- skill creation and reuse
+- bilingual documentation synchronization
+- host projection smoke checks
+
+These scenarios evaluate memory-loop and skill-loop today, but the eval-loop
+framework is intentionally broader. It can also evaluate setup, host adapters,
+docs workflow, commit discipline, and eval-loop itself.
diff --git a/docs/zh/harness/README.md b/docs/zh/harness/README.md
index a31696c..05e6b44 100644
--- a/docs/zh/harness/README.md
+++ b/docs/zh/harness/README.md
@@ -25,6 +25,7 @@ host surface projection，以及可选的 daemon scheduling。
 | Harness Roadmap | [中文](ROADMAP.md) / [EN](../../harness/ROADMAP.md) |
 | Memory Loop | [中文](memory-loop/DESIGN.md) / [EN](../../harness/memory-loop/DESIGN.md) / [site](../../site/memory-loop/site.html) |
 | Skill Loop | [中文](skill-loop/DESIGN.md) / [EN](../../harness/skill-loop/DESIGN.md) / [site](../../site/skill-loop/site.html) |
+| Eval Loop | [中文](eval-loop/DESIGN.md) / [EN](../../harness/eval-loop/DESIGN.md) |
 
 ## 可安装资产
 
@@ -32,6 +33,7 @@ host surface projection，以及可选的 daemon scheduling。
 | --- | --- |
 | Memory Loop | [harness/modules/memory-loop](../../../harness/modules/memory-loop/README.md) |
 | Skill Loop | [harness/modules/skill-loop](../../../harness/modules/skill-loop/README.md) |
+| Eval Loop | [harness/modules/eval-loop](../../../harness/modules/eval-loop/README.md) |
 
 ## 仓库布局
 
diff --git a/docs/zh/harness/eval-loop/DESIGN.md b/docs/zh/harness/eval-loop/DESIGN.md
new file mode 100644
index 0000000..de338ea
--- /dev/null
+++ b/docs/zh/harness/eval-loop/DESIGN.md
@@ -0,0 +1,88 @@
+# Eval Loop MVP Design
+
+英文版本：[DESIGN.md](../../../harness/eval-loop/DESIGN.md)
+
+可安装 MVP 资产：[harness/modules/eval-loop](../../../../harness/modules/eval-loop/README.md)
+
+Eval loop 是 Mnemon 的 feedback-facing harness module。它定义如何通过真实
+scenario 测试 HostAgent，如何收集证据，以及如何把稳定失败转化为经过治理的
+改进候选。
+
+## 定位
+
+Eval loop 与 memory-loop、skill-loop 是平级模块，不是它们的父模块。
+memory-loop 和 skill-loop 直接影响 HostAgent interface：前者影响记忆上下文，
+后者影响可复用工作方法。eval-loop 通过 scenario 执行观察这些影响，并把发现
+反馈回项目。
+
+```text
+harness/modules/
+├── memory-loop
+├── skill-loop
+└── eval-loop
+```
+
+## 核心模型
+
+```text
+scenario
+   |
+   v
+isolated workspace + .mnemon + host projection
+   |
+   v
+Codex app server HostAgent
+   |
+   v
+artifacts: transcript, diff, memory state, skill evidence, logs
+   |
+   v
+rubric judgement
+   |
+   v
+report and improvement candidate
+```
+
+Codex app server 是当前 primary HostAgent。通用 HostAgent requirement 应该从
+Codex-first 场景中持续归纳，而不是一开始就前置设计。
+
+## 资产
+
+| Asset | 作用 |
+| --- | --- |
+| Scenario | 可复现的任务压力场景，包含 target、setup、prompt、evidence 和预期观察。 |
+| Suite | 一组 scenarios 和 loop configuration。 |
+| Rubric | 行为判断和 eval asset 质量判断标准。 |
+| Skill | eval plan、run、analyze、improve 的 protocol 方法。 |
+| Evaluator | 后台 curation worker，用于去重 candidates、总结趋势。 |
+
+## 生命周期
+
+Eval assets 的生命周期应比 skills 更严格，因为它们定义项目如何判断自己是否
+变好。
+
+```text
+ephemeral -> candidate -> promoted -> canonical -> retired
+```
+
+- `ephemeral`：临时探索，不需要审阅。
+- `candidate`：有初步证据的候选资产。
+- `promoted`：经过整理，可用于本地回归。
+- `canonical`：稳定，可用于长期对比或 gate。
+- `retired`：过时、不稳定或被替代的资产。
+
+这样可以降低 review 压力：agent 可以自由探索，但只有稳定且有价值的资产才进入
+promotion 审阅。
+
+## 第一阶段范围
+
+第一批场景聚焦 Mnemon 当前的自迭代工作：
+
+- memory preference recall
+- skill creation and reuse
+- bilingual documentation synchronization
+- host projection smoke checks
+
+这些场景当前主要评估 memory-loop 和 skill-loop，但 eval-loop 框架本身更通用。
+它也可以评估 setup、host adapter、docs workflow、commit discipline，以及
+eval-loop 自身。
diff --git a/harness/eval/README.md b/harness/eval/README.md
index 28c0a4f..664c58b 100644
--- a/harness/eval/README.md
+++ b/harness/eval/README.md
@@ -2,6 +2,16 @@
 
 This directory documents eval modes for host-wrapped loop testing.
 
+The canonical eval loop module lives under:
+
+```text
+harness/modules/eval-loop/
+```
+
+Use `harness/eval/` for project-local runner notes and app-server operation
+details. Use `harness/modules/eval-loop/` for reusable eval-loop policy,
+scenarios, suites, rubrics, protocol skills, and lifecycle guidance.
+
 ## Codex App-Server Eval
 
 The Codex app-server eval uses the real Codex app-server protocol instead of a
@@ -38,6 +48,12 @@ Run the longer skill-loop regression suite with:
 make codex-skill-deep-eval
 ```
 
+Run the eval-loop projection smoke check with:
+
+```bash
+make codex-eval-loop-smoke
+```
+
 To run an actual Codex turn, use:
 
 ```bash
diff --git a/harness/hosts/codex/projector.sh b/harness/hosts/codex/projector.sh
index 20eba51..86b539b 100755
--- a/harness/hosts/codex/projector.sh
+++ b/harness/hosts/codex/projector.sh
@@ -20,6 +20,9 @@ Memory loop install options:
 Skill loop install options:
   --host-skills-dir DIR
 
+Eval loop install options:
+  --host-skills-dir DIR
+
 Uninstall options:
   --purge-memory
   --purge-library
@@ -95,7 +98,7 @@ if [[ -z "${MODULE}" ]]; then
   usage >&2
   exit 2
 fi
-if [[ "${MODULE}" != "memory-loop" && "${MODULE}" != "skill-loop" ]]; then
+if [[ "${MODULE}" != "memory-loop" && "${MODULE}" != "skill-loop" && "${MODULE}" != "eval-loop" ]]; then
   echo "unsupported module for Codex: ${MODULE}" >&2
   exit 1
 fi
@@ -321,6 +324,61 @@ EOF
   echo "Host skills:  ${HOST_SKILLS_DIR}"
 }
 
+# Project eval-loop into Codex: canonical assets, runtime env, and host skills.
+install_eval_loop() {
+  ensure_python
+  [[ -n "${HOST_SKILLS_DIR}" ]] || HOST_SKILLS_DIR="${CONFIG_DIR}/skills"
+  copy_common_canonical_assets
+  mkdir -p \
+    "${CANONICAL_MODULE_DIR}/scratch" \
+    "${CANONICAL_MODULE_DIR}/candidates" \
+    "${CANONICAL_MODULE_DIR}/reports" \
+    "${CANONICAL_MODULE_DIR}/artifacts" \
+    "${CANONICAL_MODULE_DIR}/retired" \
+    "${CANONICAL_MODULE_DIR}/scenarios" \
+    "${CANONICAL_MODULE_DIR}/suites" \
+    "${CANONICAL_MODULE_DIR}/rubrics" \
+    "${HOST_SKILLS_DIR}/eval_plan" \
+    "${HOST_SKILLS_DIR}/eval_run" \
+    "${HOST_SKILLS_DIR}/eval_analyze" \
+    "${HOST_SKILLS_DIR}/eval_improve" \
+    "${CONFIG_DIR}/mnemon-eval-loop"
+
+  cp -R "${MODULE_DIR}/scenarios/." "${CANONICAL_MODULE_DIR}/scenarios/"
+  cp -R "${MODULE_DIR}/suites/." "${CANONICAL_MODULE_DIR}/suites/"
+  cp -R "${MODULE_DIR}/rubrics/." "${CANONICAL_MODULE_DIR}/rubrics/"
+
+  write_runtime_env "${CONFIG_DIR}/mnemon-eval-loop" "MNEMON_EVAL_LOOP_ENV" "MNEMON_EVAL_LOOP_DIR"
+  install_file "${MODULE_DIR}/GUIDE.md" "${CONFIG_DIR}/mnemon-eval-loop/GUIDE.md" 0644
+  # Unquoted heredoc: paths are baked in now. The escaped \${...} keeps the two
+  # DEFAULT_* values overridable when env.sh is sourced (install-time fallback).
+  cat >> "${CONFIG_DIR}/mnemon-eval-loop/env.sh" <<EOF
+export MNEMON_EVAL_LOOP_SCRATCH_DIR="${CANONICAL_MODULE_DIR}/scratch"
+export MNEMON_EVAL_LOOP_CANDIDATES_DIR="${CANONICAL_MODULE_DIR}/candidates"
+export MNEMON_EVAL_LOOP_REPORTS_DIR="${CANONICAL_MODULE_DIR}/reports"
+export MNEMON_EVAL_LOOP_ARTIFACTS_DIR="${CANONICAL_MODULE_DIR}/artifacts"
+export MNEMON_EVAL_LOOP_RETIRED_DIR="${CANONICAL_MODULE_DIR}/retired"
+export MNEMON_EVAL_LOOP_SCENARIOS_DIR="${CANONICAL_MODULE_DIR}/scenarios"
+export MNEMON_EVAL_LOOP_SUITES_DIR="${CANONICAL_MODULE_DIR}/suites"
+export MNEMON_EVAL_LOOP_RUBRICS_DIR="${CANONICAL_MODULE_DIR}/rubrics"
+export MNEMON_EVAL_LOOP_HOST_SKILLS_DIR="${HOST_SKILLS_DIR}"
+export MNEMON_EVAL_LOOP_DEFAULT_HOST="\${MNEMON_EVAL_LOOP_DEFAULT_HOST:-${MNEMON_EVAL_LOOP_DEFAULT_HOST:-codex}}"
+export MNEMON_EVAL_LOOP_DEFAULT_SUITE="\${MNEMON_EVAL_LOOP_DEFAULT_SUITE:-${MNEMON_EVAL_LOOP_DEFAULT_SUITE:-smoke}}"
+EOF
+
+  local eval_skill
+  for eval_skill in eval_plan eval_run eval_analyze eval_improve; do
+    install_file "${MODULE_DIR}/skills/${eval_skill}.md" "${HOST_SKILLS_DIR}/${eval_skill}/SKILL.md" 0644
+    append_codex_runtime_note "${HOST_SKILLS_DIR}/${eval_skill}/SKILL.md" "MNEMON_EVAL_LOOP_DIR" "${CONFIG_DIR}/mnemon-eval-loop/env.sh"
+  done
+
+  write_host_manifest "${CONFIG_DIR}"
+  echo "Installed Mnemon eval loop for Codex."
+  echo "Config:       ${CONFIG_DIR}"
+  echo "State:        ${CANONICAL_MODULE_DIR}"
+  echo "Host skills:  ${HOST_SKILLS_DIR}"
+}
+
 status_module() {
   echo "Codex ${MODULE}:"
   echo "  config:   ${CONFIG_DIR}"
@@ -375,12 +433,40 @@ uninstall_skill_loop() {
   echo "Removed Mnemon skill loop from ${CONFIG_DIR}."
 }
 
+# Remove the Codex eval-loop projection; non-empty loop-state dirs are kept.
+uninstall_eval_loop() {
+  local env_file="${CONFIG_DIR}/mnemon-eval-loop/env.sh" eval_item
+  if [[ -f "${env_file}" ]]; then
+    # shellcheck source=/dev/null
+    source "${env_file}"
+  fi
+  # Prefer the skills dir recorded in env.sh, then HOST_SKILLS_DIR, then the default.
+  local host_skills_dir="${MNEMON_EVAL_LOOP_HOST_SKILLS_DIR:-${HOST_SKILLS_DIR:-${CONFIG_DIR}/skills}}"
+  for eval_item in eval_plan eval_run eval_analyze eval_improve; do
+    rm -rf "${host_skills_dir}/${eval_item}"
+  done
+  rm -rf "${CONFIG_DIR}/mnemon-eval-loop"
+  for eval_item in scenarios suites rubrics; do
+    rm -rf "${CANONICAL_MODULE_DIR}/${eval_item}"
+  done
+  rm -f "${CANONICAL_MODULE_DIR}/GUIDE.md" "${CANONICAL_MODULE_DIR}/env.sh" "${CANONICAL_MODULE_DIR}/module.json"
+  # rmdir only succeeds on empty directories, so existing contents stay in place.
+  for eval_item in retired artifacts reports candidates scratch; do
+    rmdir "${CANONICAL_MODULE_DIR}/${eval_item}" 2>/dev/null || true
+  done
+  rmdir "${CANONICAL_MODULE_DIR}" 2>/dev/null || true
+  remove_host_manifest_module
+  echo "Removed Mnemon eval loop from ${CONFIG_DIR}."
+}
+
 case "${ACTION}:${MODULE}" in
   install:memory-loop) install_memory_loop ;;
   install:skill-loop) install_skill_loop ;;
-  status:memory-loop|status:skill-loop) status_module ;;
+  install:eval-loop) install_eval_loop ;;
+  status:memory-loop|status:skill-loop|status:eval-loop) status_module ;;
   uninstall:memory-loop) uninstall_memory_loop ;;
   uninstall:skill-loop) uninstall_skill_loop ;;
+  uninstall:eval-loop) uninstall_eval_loop ;;
   *)
     echo "unsupported action/module: ${ACTION}/${MODULE}" >&2
     exit 1
diff --git a/harness/modules/README.md b/harness/modules/README.md
index 08edeb3..bc50623 100644
--- a/harness/modules/README.md
+++ b/harness/modules/README.md
@@ -5,7 +5,8 @@ This directory contains canonical, host-agnostic loop modules.
 ```text
 harness/modules/
 ├── memory-loop/
-└── skill-loop/
+├── skill-loop/
+└── eval-loop/
 ```
 
 Each module follows the Loop Module Standard and declares its assets in
diff --git a/harness/modules/eval-loop/GUIDE.md b/harness/modules/eval-loop/GUIDE.md
new file mode 100644
index 0000000..e5ca6cc
--- /dev/null
+++ b/harness/modules/eval-loop/GUIDE.md
@@ -0,0 +1,50 @@
+# Mnemon Eval Loop Guide
+
+Use the eval loop when a task needs to test whether Mnemon harness behavior
+actually improves real HostAgent work.
+
+## Policy
+
+- Prefer scenario-driven evals over ad hoc success claims.
+- Keep canonical eval assets stable, reproducible, and reviewable.
+- Treat LLM-generated evals as ephemeral or candidate assets until they show
+  stable value.
+- Record enough evidence for another maintainer to understand the judgement:
+  task, host, loop configuration, transcript reference, diff summary, state
+  changes, rubric result, and proposed next action.
+- Do not loosen a rubric to make a run pass.
+- Do not promote an eval asset that is flaky, duplicative, too expensive for
+  its value, or likely to reward harmful behavior.
+
+## When to Plan an Eval
+
+Plan an eval when:
+
+- A memory, skill, setup, host adapter, or docs workflow change claims behavior
+  improvement.
+- A regression is suspected from real project work.
+- A repeated failure suggests a missing scenario or rubric.
+- An existing scenario no longer distinguishes good behavior from weak behavior.
+
+## Asset Lifecycle
+
+Use this lifecycle for scenarios, suites, and rubrics:
+
+```text
+ephemeral -> candidate -> promoted -> canonical -> retired
+```
+
+- Start with `ephemeral` for exploration.
+- Move to `candidate` only after the asset has a clear target, rubric, and
+  observed value.
+- Move to `promoted` after deduplication and at least one stable run.
+- Move to `canonical` only when the asset is important enough for long-term
+  comparison.
+- Move to `retired` when it is obsolete, flaky, or superseded.
+
+## HostAgent Boundary
+
+Codex app server is the primary HostAgent today. Do not overfit eval assets to
+Codex unless the scenario is explicitly testing Codex projection or driver
+behavior. Record Codex-specific requirements as observed HostAgent capabilities
+before turning them into generic requirements.
diff --git a/harness/modules/eval-loop/README.md b/harness/modules/eval-loop/README.md
new file mode 100644
index 0000000..dbf8091
--- /dev/null
+++ b/harness/modules/eval-loop/README.md
@@ -0,0 +1,101 @@
+# Mnemon Eval Loop Harness
+
+This directory is the canonical eval loop module. It is a feedback-facing loop:
+it designs and runs realistic harness scenarios, collects evidence, and turns
+stable failures into curated improvement candidates.
+
+The eval loop is not a parent of memory-loop or skill-loop. It is a peer module
+that can evaluate interface-facing loops, host projection, setup, documentation
+workflow, commit discipline, and its own eval assets.
+
+## File Tree
+
+```text
+harness/modules/eval-loop/
+├── README.md
+├── module.json
+├── env.sh
+├── GUIDE.md
+├── hooks/
+├── skills/
+├── subagents/
+├── scenarios/
+├── suites/
+└── rubrics/
+```
+
+## Core Parts
+
+| Part | Role |
+| --- | --- |
+| Scenario | A reproducible task pressure case with target, setup, prompt, evidence, and expected observations. |
+| Suite | A named group of scenarios with host and loop configuration. |
+| Rubric | Review criteria used to judge behavior, stability, and improvement value. |
+| Runner | Host-specific machinery that starts isolated workspaces and drives a HostAgent. Codex app server is the current primary runner. |
+| Report | Durable output containing transcript references, diffs, loop state, judgement, and next actions. |
+
+## Eval Asset Lifecycle
+
+Eval assets are stricter than skill assets because they define how the project
+judges improvement. New assets should not become canonical immediately.
+
+```text
+ephemeral -> candidate -> promoted -> canonical -> retired
+```
+
+- `ephemeral`: one-off exploration in `scratch`; no review required.
+- `candidate`: generated or proposed asset with initial evidence.
+- `promoted`: curated asset suitable for local regression.
+- `canonical`: stable asset suitable for long-term comparison or gates.
+- `retired`: obsolete, flaky, or superseded asset kept for audit.
+
+## Runtime Directory Protocol
+
+Installed runtime state resolves through one environment config:
+
+```text
+$MNEMON_EVAL_LOOP_DIR/
+├── env.sh
+├── GUIDE.md
+├── scratch/
+├── candidates/
+├── reports/
+├── artifacts/
+└── retired/
+```
+
+`env.sh` defines the following paths (plus `MNEMON_EVAL_LOOP_DEFAULT_HOST` and `MNEMON_EVAL_LOOP_DEFAULT_SUITE`):
+
+```bash
+MNEMON_EVAL_LOOP_ENV=<canonical-state>/harness/eval-loop/env.sh
+MNEMON_EVAL_LOOP_DIR=<canonical-state>/harness/eval-loop
+MNEMON_EVAL_LOOP_SCRATCH_DIR=$MNEMON_EVAL_LOOP_DIR/scratch
+MNEMON_EVAL_LOOP_CANDIDATES_DIR=$MNEMON_EVAL_LOOP_DIR/candidates
+MNEMON_EVAL_LOOP_REPORTS_DIR=$MNEMON_EVAL_LOOP_DIR/reports
+MNEMON_EVAL_LOOP_ARTIFACTS_DIR=$MNEMON_EVAL_LOOP_DIR/artifacts
+MNEMON_EVAL_LOOP_RETIRED_DIR=$MNEMON_EVAL_LOOP_DIR/retired
+```
+
+## Codex Install
+
+Install into the current project:
+
+```bash
+bash harness/setup/install.sh --host codex --module eval-loop
+```
+
+Check status:
+
+```bash
+bash harness/setup/status.sh --host codex --module eval-loop
+```
+
+Remove the installed Codex integration while preserving any existing reports, candidates, artifacts, scratch files, and retired assets:
+
+```bash
+bash harness/setup/uninstall.sh --host codex --module eval-loop
+```
+
+Existing project-local Codex app-server eval commands remain available through
+`make codex-app-eval-suite`, `make codex-memory-deep-eval`, and
+`make codex-skill-deep-eval`.
diff --git a/harness/modules/eval-loop/env.sh b/harness/modules/eval-loop/env.sh
new file mode 100644
index 0000000..c41e2e4
--- /dev/null
+++ b/harness/modules/eval-loop/env.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+# Runtime defaults for the Mnemon eval loop; values already set in the
+# environment win. Host projectors rewrite these paths when installing the
+# loop into an isolated workspace or global config.
+export MNEMON_EVAL_LOOP_ENV="${MNEMON_EVAL_LOOP_ENV:-${BASH_SOURCE[0]}}"
+export MNEMON_EVAL_LOOP_DIR="${MNEMON_EVAL_LOOP_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}"
+export MNEMON_EVAL_LOOP_SCRATCH_DIR="${MNEMON_EVAL_LOOP_SCRATCH_DIR:-${MNEMON_EVAL_LOOP_DIR}/scratch}"
+export MNEMON_EVAL_LOOP_CANDIDATES_DIR="${MNEMON_EVAL_LOOP_CANDIDATES_DIR:-${MNEMON_EVAL_LOOP_DIR}/candidates}"
+export MNEMON_EVAL_LOOP_REPORTS_DIR="${MNEMON_EVAL_LOOP_REPORTS_DIR:-${MNEMON_EVAL_LOOP_DIR}/reports}"
+export MNEMON_EVAL_LOOP_ARTIFACTS_DIR="${MNEMON_EVAL_LOOP_ARTIFACTS_DIR:-${MNEMON_EVAL_LOOP_DIR}/artifacts}"
+export MNEMON_EVAL_LOOP_RETIRED_DIR="${MNEMON_EVAL_LOOP_RETIRED_DIR:-${MNEMON_EVAL_LOOP_DIR}/retired}"
+export MNEMON_EVAL_LOOP_DEFAULT_HOST="${MNEMON_EVAL_LOOP_DEFAULT_HOST:-codex}"
+export MNEMON_EVAL_LOOP_DEFAULT_SUITE="${MNEMON_EVAL_LOOP_DEFAULT_SUITE:-smoke}"
diff --git a/harness/modules/eval-loop/hooks/compact.md b/harness/modules/eval-loop/hooks/compact.md
new file mode 100644
index 0000000..4f97789
--- /dev/null
+++ b/harness/modules/eval-loop/hooks/compact.md
@@ -0,0 +1,13 @@
+# Eval Loop Compact
+
+Before context compaction, preserve:
+
+- Active eval goal and hypothesis.
+- Scenario and suite names.
+- HostAgent configuration and loop combination.
+- Report and artifact paths.
+- Rubric outcome and open questions.
+- Any candidate eval assets that still need curation.
+
+Do not carry large transcripts forward in prompt context. Reference artifact
+paths instead.
diff --git a/harness/modules/eval-loop/hooks/nudge.md b/harness/modules/eval-loop/hooks/nudge.md
new file mode 100644
index 0000000..8683db6
--- /dev/null
+++ b/harness/modules/eval-loop/hooks/nudge.md
@@ -0,0 +1,11 @@
+# Eval Loop Nudge
+
+At turn completion, if eval work happened:
+
+- Write or update a report under `$MNEMON_EVAL_LOOP_REPORTS_DIR` when a run
+  produced evidence.
+- Keep raw artifacts under `$MNEMON_EVAL_LOOP_ARTIFACTS_DIR`.
+- Place newly proposed scenarios, suites, or rubrics under
+  `$MNEMON_EVAL_LOOP_CANDIDATES_DIR` unless they were explicitly reviewed.
+- Summarize whether the result suggests a code change, loop policy change,
+  host adapter change, docs update, or eval asset change.
diff --git a/harness/modules/eval-loop/hooks/prime.md b/harness/modules/eval-loop/hooks/prime.md
new file mode 100644
index 0000000..445c05f
--- /dev/null
+++ b/harness/modules/eval-loop/hooks/prime.md
@@ -0,0 +1,11 @@
+# Eval Loop Prime
+
+At the start of work, check whether the current task claims harness behavior
+improvement or changes eval assets.
+
+If yes:
+
+- Load `$MNEMON_EVAL_LOOP_DIR/GUIDE.md` when available.
+- Prefer an existing canonical or promoted suite before creating a new scenario.
+- Keep new LLM-authored scenarios ephemeral or candidate by default.
+- Record the host, loop configuration, and intended evidence before running.
diff --git a/harness/modules/eval-loop/hooks/remind.md b/harness/modules/eval-loop/hooks/remind.md
new file mode 100644
index 0000000..201579f
--- /dev/null
+++ b/harness/modules/eval-loop/hooks/remind.md
@@ -0,0 +1,12 @@
+# Eval Loop Remind
+
+Before acting on an eval-related prompt, identify:
+
+- Target: what behavior or subsystem is being evaluated.
+- Scenario: which task pressure case will be run.
+- Suite: whether this belongs to smoke, regression, or exploratory coverage.
+- Rubric: how behavior will be judged.
+- Evidence: which artifacts must be captured.
+
+If any item is missing, make it explicit in the plan or mark the run
+exploratory.
diff --git a/harness/modules/eval-loop/module.json b/harness/modules/eval-loop/module.json
new file mode 100644
index 0000000..a82e5bd
--- /dev/null
+++ b/harness/modules/eval-loop/module.json
@@ -0,0 +1,69 @@
+{
+  "schema_version": 1,
+  "name": "eval-loop",
+  "version": "0.1.0",
+  "description": "Runs scenario-driven harness evaluations, collects evidence, and curates improvements without making eval assets canonical by default.",
+  "loop_type": "feedback",
+  "direct_interface_effect": false,
+  "primary_host": "codex",
+  "lifecycle_events": [
+    "prime",
+    "remind",
+    "nudge",
+    "compact"
+  ],
+  "assets": {
+    "guide": "GUIDE.md",
+    "env": "env.sh",
+    "runtime_files": [
+      "suites/smoke.json",
+      "suites/regression.json",
+      "rubrics/eval-asset-quality.md",
+      "rubrics/interface-loop-behavior.md",
+      "scenarios/memory/project-preference-recall.md",
+      "scenarios/skill/skill-creation-reuse.md",
+      "scenarios/docs/bilingual-doc-sync.md",
+      "scenarios/setup/host-projection-smoke.md"
+    ],
+    "hooks": {
+      "prime": "hooks/prime.md",
+      "remind": "hooks/remind.md",
+      "nudge": "hooks/nudge.md",
+      "compact": "hooks/compact.md"
+    },
+    "skills": [
+      "skills/eval_plan.md",
+      "skills/eval_run.md",
+      "skills/eval_analyze.md",
+      "skills/eval_improve.md"
+    ],
+    "subagents": [
+      "subagents/evaluator.md"
+    ]
+  },
+  "state": {
+    "canonical": [
+      ".mnemon/data",
+      ".mnemon/reports",
+      ".mnemon/proposals",
+      ".mnemon/audit"
+    ],
+    "loop_runtime": [
+      "scratch",
+      "candidates",
+      "reports",
+      "artifacts",
+      "retired"
+    ]
+  },
+  "eval_asset_lifecycle": [
+    "ephemeral",
+    "candidate",
+    "promoted",
+    "canonical",
+    "retired"
+  ],
+  "host_adapters": {
+    "codex": "../../hosts/codex"
+  }
+}
diff --git a/harness/modules/eval-loop/rubrics/eval-asset-quality.md b/harness/modules/eval-loop/rubrics/eval-asset-quality.md
new file mode 100644
index 0000000..5c84fd7
--- /dev/null
+++ b/harness/modules/eval-loop/rubrics/eval-asset-quality.md
@@ -0,0 +1,22 @@
+# Eval Asset Quality Rubric
+
+Use this rubric when reviewing scenarios, suites, and rubrics for promotion.
+
+## Pass
+
+- The target and hypothesis are explicit.
+- The setup is reproducible.
+- Required evidence is named.
+- The pass/weak/fail criteria distinguish behavior quality.
+- Runtime cost is appropriate for the intended suite.
+- The asset is not a duplicate of existing coverage.
+
+## Weak
+
+- The asset is useful but missing one review detail, such as artifact paths,
+  timeout expectations, or a clear suite placement.
+
+## Fail
+
+- The asset is vague, duplicative, flaky by design, too expensive for its value,
+  or likely to reward weak behavior.
diff --git a/harness/modules/eval-loop/rubrics/interface-loop-behavior.md b/harness/modules/eval-loop/rubrics/interface-loop-behavior.md
new file mode 100644
index 0000000..72cf251
--- /dev/null
+++ b/harness/modules/eval-loop/rubrics/interface-loop-behavior.md
@@ -0,0 +1,22 @@
+# Interface Loop Behavior Rubric
+
+Use this rubric when evaluating whether interface-facing loops improved real
+HostAgent behavior.
+
+## Pass
+
+- The HostAgent behavior shows evidence that the loop affected the task.
+- The effect is relevant to the scenario, not generic compliance.
+- The result improves the task outcome without polluting memory, skills, docs,
+  or workspace state.
+- The report includes enough artifacts to review the judgement.
+
+## Weak
+
+- The loop was visible but only partially affected the task, or evidence is
+  incomplete.
+
+## Fail
+
+- The loop had no observable effect, caused incorrect behavior, polluted state,
+  or made the task harder to review.
diff --git a/harness/modules/eval-loop/scenarios/docs/bilingual-doc-sync.md b/harness/modules/eval-loop/scenarios/docs/bilingual-doc-sync.md
new file mode 100644
index 0000000..79d5738
--- /dev/null
+++ b/harness/modules/eval-loop/scenarios/docs/bilingual-doc-sync.md
@@ -0,0 +1,29 @@
+# Bilingual Documentation Sync
+
+Target:
+- docs workflow
+- memory-loop or skill-loop support
+
+Purpose:
+Verify that harness changes update relevant English and Chinese documentation
+when the project requires bilingual docs.
+
+Setup:
+- Start an isolated Codex app-server workspace.
+- Install the loop combination under test.
+- Seed a project preference or active skill evidence when the run tests the
+  corresponding loops.
+
+Task:
+Ask the HostAgent to change a documented harness behavior.
+
+Expected Evidence:
+- Code or harness asset change is present.
+- English docs are updated when relevant.
+- Chinese docs are updated when relevant.
+- The final report mentions verification.
+
+Rubric:
+- pass: code and both language docs are synchronized.
+- weak: only one language is updated or docs are incomplete.
+- fail: behavior changes without relevant docs.
diff --git a/harness/modules/eval-loop/scenarios/memory/project-preference-recall.md b/harness/modules/eval-loop/scenarios/memory/project-preference-recall.md
new file mode 100644
index 0000000..d9f1906
--- /dev/null
+++ b/harness/modules/eval-loop/scenarios/memory/project-preference-recall.md
@@ -0,0 +1,28 @@
+# Project Preference Recall
+
+Target:
+- memory-loop
+- HostAgent project behavior
+
+Purpose:
+Verify that a HostAgent can use durable project preferences when a task would
+otherwise omit them.
+
+Setup:
+- Start an isolated Codex app-server workspace.
+- Install `memory-loop`.
+- Seed `.mnemon` with a concrete project preference.
+
+Task:
+Ask the HostAgent to make a small project maintenance change where the seeded
+preference matters.
+
+Expected Evidence:
+- The final behavior reflects the seeded preference.
+- The report references memory evidence or the projected memory loop state.
+- No unrelated preference is written to memory.
+
+Rubric:
+- pass: preference is applied and state remains clean.
+- weak: preference is mentioned but incompletely applied.
+- fail: preference is ignored or memory is polluted.
diff --git a/harness/modules/eval-loop/scenarios/setup/host-projection-smoke.md b/harness/modules/eval-loop/scenarios/setup/host-projection-smoke.md
new file mode 100644
index 0000000..807dc95
--- /dev/null
+++ b/harness/modules/eval-loop/scenarios/setup/host-projection-smoke.md
@@ -0,0 +1,27 @@
+# Host Projection Smoke
+
+Target:
+- setup
+- host projection
+
+Purpose:
+Verify that a loop module can be installed into a host surface and reported in
+the host manifest.
+
+Setup:
+- Use an isolated workspace.
+- Run `harness/setup/install.sh` for the target host and module.
+
+Task:
+Install the module, inspect projected files, and run setup status.
+
+Expected Evidence:
+- Runtime state exists under `.mnemon/harness/<module>`.
+- Host projection files exist.
+- Manifest contains the installed loop.
+- Status reports the module as installed.
+
+Rubric:
+- pass: projection, manifest, and status agree.
+- weak: projection exists but manifest or status is incomplete.
+- fail: install fails or projected state is missing.
diff --git a/harness/modules/eval-loop/scenarios/skill/skill-creation-reuse.md b/harness/modules/eval-loop/scenarios/skill/skill-creation-reuse.md
new file mode 100644
index 0000000..1939caf
--- /dev/null
+++ b/harness/modules/eval-loop/scenarios/skill/skill-creation-reuse.md
@@ -0,0 +1,28 @@
+# Skill Creation And Reuse
+
+Target:
+- skill-loop
+- reusable workflow behavior
+
+Purpose:
+Verify that repeated workflow friction becomes skill evidence and can lead to a
+reviewable skill candidate without immediate uncontrolled activation.
+
+Setup:
+- Start an isolated Codex app-server workspace.
+- Install `skill-loop`.
+- Provide a task that repeats a maintenance pattern with known missed steps.
+
+Task:
+Ask the HostAgent to complete the maintenance task and reflect on repeated
+workflow friction.
+
+Expected Evidence:
+- Usage evidence is appended for reusable workflow friction.
+- Any new skill is drafted as a proposal or candidate.
+- The host skill surface is not mutated unexpectedly.
+
+Rubric:
+- pass: evidence is captured and activation remains gated.
+- weak: evidence is captured but proposal quality is incomplete.
+- fail: no evidence is captured or an unreviewed skill is activated.
diff --git a/harness/modules/eval-loop/skills/eval_analyze.md b/harness/modules/eval-loop/skills/eval_analyze.md
new file mode 100644
index 0000000..d558bfd
--- /dev/null
+++ b/harness/modules/eval-loop/skills/eval_analyze.md
@@ -0,0 +1,39 @@
+---
+name: eval_analyze
+description: Analyze Mnemon harness eval reports, classify outcomes, and extract improvement evidence.
+---
+
+# Eval Analyze
+
+Use this skill after an eval run to judge behavior and extract improvement
+evidence.
+
+## Procedure
+
+1. Read the report, relevant artifact summaries, and the selected rubric.
+2. Compare observed behavior to the hypothesis.
+3. Classify the outcome:
+   - `pass`: behavior meets the rubric.
+   - `weak`: behavior is partially useful but lacks expected evidence or consistency.
+   - `fail`: behavior contradicts the target expectation.
+   - `invalid`: setup or scenario issue prevents judgement.
+4. Identify the likely improvement target:
+   - memory-loop
+   - skill-loop
+   - eval-loop
+   - host adapter
+   - setup
+   - docs
+   - scenario or rubric
+5. If a new eval asset is warranted, create a candidate summary instead of
+   editing canonical assets immediately.
+
+## Output
+
+Write a concise analysis with:
+
+- outcome
+- evidence
+- likely cause
+- recommended next action
+- candidate eval asset path, if any
diff --git a/harness/modules/eval-loop/skills/eval_improve.md b/harness/modules/eval-loop/skills/eval_improve.md
new file mode 100644
index 0000000..3cfc90a
--- /dev/null
+++ b/harness/modules/eval-loop/skills/eval_improve.md
@@ -0,0 +1,33 @@
+---
+name: eval_improve
+description: Turn stable Mnemon harness eval findings into scoped project, loop, adapter, docs, or eval asset improvements.
+---
+
+# Eval Improve
+
+Use this skill to turn stable eval findings into project changes.
+
+## Procedure
+
+1. Confirm the finding is backed by a report or repeated observation.
+2. Pick one improvement target. Avoid mixing loop policy changes, runner changes,
+   docs changes, and scenario promotion in one patch unless they are tightly
+   coupled.
+3. For eval asset changes:
+   - keep exploratory ideas in the runtime `scratch` directory
+   - add candidate assets under the runtime `candidates` directory
+   - promote assets to canonical repo assets only after curation
+4. For code or harness changes, run the narrowest relevant eval or validation.
+5. Summarize what changed, which evidence motivated it, and what remains
+   unproven.
+
+## Promotion Checklist
+
+Before making an eval asset canonical, verify:
+
+- It has a clear target and hypothesis.
+- It has an explicit rubric.
+- It produces reviewable artifacts.
+- It is not duplicative.
+- It is stable enough for its intended suite.
+- It does not reward weak or unsafe behavior.
diff --git a/harness/modules/eval-loop/skills/eval_plan.md b/harness/modules/eval-loop/skills/eval_plan.md
new file mode 100644
index 0000000..9a8417e
--- /dev/null
+++ b/harness/modules/eval-loop/skills/eval_plan.md
@@ -0,0 +1,40 @@
+---
+name: eval_plan
+description: Design a scenario-driven Mnemon harness eval with target, hypothesis, HostAgent, loop configuration, evidence, and rubric.
+---
+
+# Eval Plan
+
+Use this skill to design a scenario-driven eval before running a HostAgent.
+
+## Procedure
+
+1. Identify the target: loop, setup behavior, host projection, docs workflow, or
+   eval-loop itself.
+2. Choose an existing scenario and suite when one fits.
+3. If no scenario fits, draft an ephemeral plan first. Do not promote it during
+   the same step.
+4. State the hypothesis in observable terms.
+5. Select the HostAgent and loop combination. Codex app-server is the default
+   HostAgent for current Mnemon evals.
+6. Define the evidence to collect:
+   - transcript or response reference
+   - git diff
+   - `.mnemon` state changes
+   - projected host surface
+   - report path
+   - logs or timeout reason
+7. Attach a rubric or mark the run exploratory.
+
+## Output
+
+Return a short eval plan with:
+
+- target
+- scenario
+- suite
+- host
+- loops
+- hypothesis
+- evidence
+- expected report path
diff --git a/harness/modules/eval-loop/skills/eval_run.md b/harness/modules/eval-loop/skills/eval_run.md
new file mode 100644
index 0000000..9f417d4
--- /dev/null
+++ b/harness/modules/eval-loop/skills/eval_run.md
@@ -0,0 +1,31 @@
+---
+name: eval_run
+description: Execute or supervise a planned Mnemon harness eval run in an isolated HostAgent workspace.
+---
+
+# Eval Run
+
+Use this skill to execute or supervise a planned eval run.
+
+## Procedure
+
+1. Confirm the plan names a host, suite or scenario, and evidence targets.
+2. Create or use an isolated workspace. Do not run scenarios in the developer's
+   active workspace unless the eval explicitly requires it.
+3. Install the requested loop modules with `harness/setup`.
+4. For Codex app-server evals, use the project runner when available:
+
+   ```bash
+   python3 scripts/codex_app_server_eval.py --suite
+   ```
+
+   Use a specific suite option when the scenario requires it.
+5. Collect artifacts and logs before cleanup.
+6. Record timeouts, setup failures, and HostAgent readiness failures as eval
+   evidence, not as silent skips.
+
+## Boundaries
+
+- Do not change canonical scenarios, suites, or rubrics while running an eval.
+- Do not delete artifacts needed for report review.
+- Do not treat an exploratory run as a regression result.
diff --git a/harness/modules/eval-loop/subagents/evaluator.md b/harness/modules/eval-loop/subagents/evaluator.md
new file mode 100644
index 0000000..5509e39
--- /dev/null
+++ b/harness/modules/eval-loop/subagents/evaluator.md
@@ -0,0 +1,20 @@
+# Evaluator Subagent
+
+Use this subagent for background eval curation and report synthesis.
+
+## Responsibilities
+
+- Cluster repeated eval observations into fewer candidate scenarios.
+- Identify duplicate, flaky, or low-value candidates.
+- Recommend whether candidates should remain exploratory, become promoted local
+  regression assets, or be considered for canonical regression.
+- Summarize report trends across runs.
+- Extract observed HostAgent capability requirements from Codex-first evals.
+
+## Non-Goals
+
+- Do not automatically make candidate eval assets canonical.
+- Do not loosen rubrics to reduce failures.
+- Do not hide setup or HostAgent failures.
+- Do not modify memory-loop or skill-loop policy without a separate explicit
+  improvement task.
diff --git a/harness/modules/eval-loop/suites/regression.json b/harness/modules/eval-loop/suites/regression.json
new file mode 100644
index 0000000..62001e5
--- /dev/null
+++ b/harness/modules/eval-loop/suites/regression.json
@@ -0,0 +1,16 @@
+{
+  "name": "regression",
+  "description": "Broader local regression suite for harness self-evolution behavior.",
+  "host": "codex",
+  "lifecycle": "promoted",
+  "scenarios": [
+    "setup/host-projection-smoke",
+    "memory/project-preference-recall",
+    "skill/skill-creation-reuse",
+    "docs/bilingual-doc-sync"
+  ],
+  "rubrics": [
+    "eval-asset-quality",
+    "interface-loop-behavior"
+  ]
+}
diff --git a/harness/modules/eval-loop/suites/smoke.json b/harness/modules/eval-loop/suites/smoke.json
new file mode 100644
index 0000000..72dfdc3
--- /dev/null
+++ b/harness/modules/eval-loop/suites/smoke.json
@@ -0,0 +1,15 @@
+{
+  "name": "smoke",
+  "description": "Fast checks for eval-loop setup and core interface-loop behavior.",
+  "host": "codex",
+  "lifecycle": "promoted",
+  "scenarios": [
+    "setup/host-projection-smoke",
+    "memory/project-preference-recall",
+    "skill/skill-creation-reuse"
+  ],
+  "rubrics": [
+    "eval-asset-quality",
+    "interface-loop-behavior"
+  ]
+}
diff --git a/harness/setup/README.md b/harness/setup/README.md
index 59cbefb..9a7e7c5 100644
--- a/harness/setup/README.md
+++ b/harness/setup/README.md
@@ -19,6 +19,7 @@ bash harness/setup/install.sh --host claude-code --module memory-loop
 bash harness/setup/status.sh --host claude-code
 bash harness/setup/uninstall.sh --host claude-code --module memory-loop
 bash harness/setup/install.sh --host codex --module memory-loop
+bash harness/setup/install.sh --host codex --module eval-loop
 ```
 
 Host-specific projection logic lives under `harness/hosts/<host>/`. Loop assets
diff --git a/scripts/codex_app_server_eval.py b/scripts/codex_app_server_eval.py
index beaad33..adb8a33 100755
--- a/scripts/codex_app_server_eval.py
+++ b/scripts/codex_app_server_eval.py
@@ -208,6 +208,15 @@ def setup_workspace(args: argparse.Namespace, root: Path) -> tuple[Path, Path, P
         env["MNEMON_SKILL_LOOP_ARCHIVED_DIR"] = str(skill_dir / "skills" / "archived")
         env["MNEMON_SKILL_LOOP_USAGE_FILE"] = str(skill_dir / "skills" / ".usage.jsonl")
         env["MNEMON_SKILL_LOOP_PROPOSALS_DIR"] = str(skill_dir / "proposals")
+    if "eval-loop" in args.modules:
+        eval_dir = mnemon_dir / "harness" / "eval-loop"
+        env["MNEMON_EVAL_LOOP_ENV"] = str(eval_dir / "env.sh")
+        env["MNEMON_EVAL_LOOP_DIR"] = str(eval_dir)
+        env["MNEMON_EVAL_LOOP_SCRATCH_DIR"] = str(eval_dir / "scratch")
+        env["MNEMON_EVAL_LOOP_CANDIDATES_DIR"] = str(eval_dir / "candidates")
+        env["MNEMON_EVAL_LOOP_REPORTS_DIR"] = str(eval_dir / "reports")
+        env["MNEMON_EVAL_LOOP_ARTIFACTS_DIR"] = str(eval_dir / "artifacts")
+        env["MNEMON_EVAL_LOOP_RETIRED_DIR"] = str(eval_dir / "retired")
     if args.isolated_codex_home:
         codex_home = run_root / "codex-home"
         codex_home.mkdir(parents=True, exist_ok=True)
@@ -302,6 +311,7 @@ def __init__(
 
 
 SKILL_LOOP_EXPECTED_SKILLS = ["skill_observe", "skill_curate", "skill_author", "skill_manage"]
+EVAL_LOOP_EXPECTED_SKILLS = ["eval_plan", "eval_run", "eval_analyze", "eval_improve"]
 
 
 def setup_none(workspace: Path, mnemon_dir: Path, env: dict[str, str]) -> None:
@@ -1128,7 +1138,7 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
         "--module",
         dest="modules",
         action="append",
-        choices=["memory-loop", "skill-loop"],
+        choices=["memory-loop", "skill-loop", "eval-loop"],
         default=[],
         help="Harness module to install. May be repeated. Defaults to memory-loop.",
     )
@@ -1163,6 +1173,8 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
             expected.extend(["memory_get", "memory_set"])
         if "skill-loop" in args.modules:
             expected.extend(SKILL_LOOP_EXPECTED_SKILLS)
+        if "eval-loop" in args.modules:
+            expected.extend(EVAL_LOOP_EXPECTED_SKILLS)
         args.expected_skills = expected
     return args