From 1623044faab9faf83926cb4e791ca4a9a9ef5590 Mon Sep 17 00:00:00 2001 From: "const.koutsakis@aurecongroup.com" Date: Thu, 30 Apr 2026 20:19:06 +1000 Subject: [PATCH] chore: src/ README audit gate + 5 package READMEs (#126, #152) --- .github/branch-protection/develop.json | 1 + .github/branch-protection/main.json | 1 + .github/scripts/check_src_readmes.py | 142 +++++++++++++++++++++++++ .github/workflows/ci.yml | 13 +++ pyproject.toml | 2 +- src/api/README.md | 18 ++++ src/eval/README.md | 22 ++++ src/models/README.md | 17 +++ src/observability/README.md | 20 ++++ src/tools/README.md | 19 ++++ uv.lock | 2 +- 11 files changed, 255 insertions(+), 2 deletions(-) create mode 100644 .github/scripts/check_src_readmes.py create mode 100644 src/api/README.md create mode 100644 src/eval/README.md create mode 100644 src/models/README.md create mode 100644 src/observability/README.md create mode 100644 src/tools/README.md diff --git a/.github/branch-protection/develop.json b/.github/branch-protection/develop.json index 6187955..9543e69 100644 --- a/.github/branch-protection/develop.json +++ b/.github/branch-protection/develop.json @@ -12,6 +12,7 @@ "Version bump check", "Action pinning audit", "Tests required", + "src/ README audit", "Frontend Build", "Frontend Quality", "Branch-protection contexts sync", diff --git a/.github/branch-protection/main.json b/.github/branch-protection/main.json index fecf6fd..d7bafaa 100644 --- a/.github/branch-protection/main.json +++ b/.github/branch-protection/main.json @@ -12,6 +12,7 @@ "Version bump check", "Action pinning audit", "Tests required", + "src/ README audit", "Frontend Build", "Frontend Quality", "Branch-protection contexts sync", diff --git a/.github/scripts/check_src_readmes.py b/.github/scripts/check_src_readmes.py new file mode 100644 index 0000000..9394271 --- /dev/null +++ b/.github/scripts/check_src_readmes.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +"""Enforce the per-package README rule from `CLAUDE.md`. + +`CLAUDE.md` *Code standards*: *"Each `src/` directory has a README +explaining its purpose and key interfaces."* As `src/` grows, READMEs +go missing silently — and even when one exists, it can degrade into +unstructured prose that no contributor reads. This script audits both +shape and substance. + +Behaviour: + +- Walks every subdirectory under `src/` (recursive, skipping + `__pycache__`). +- A subdirectory must have a `README.md` when it contains at least one + `.py` file other than `__init__.py`. Empty directories and + `__init__.py`-only namespace packages are exempt — they have no + surface to document. +- The README must be non-trivial: at least `MIN_BYTES = 200` bytes + after stripping leading/trailing whitespace. Catches the empty-stub + failure mode (#126). +- The README must contain a `## Key interfaces` heading (or its + documented synonym `## Public surface`). #152 promoted the gate + from presence + size to presence + structure: a 200-byte unrelated + paragraph passed pre-#152, but doesn't actually document the + package's public surface. The Purpose statement is encoded + positionally as the H1 heading + immediately-following paragraph + rather than as an explicit `## Purpose` heading — that's the + existing convention across all seven packages and the natural + markdown shape; a separate `## Purpose` heading would duplicate + what the H1 already establishes. + +There is **no exemption mechanism**, mirroring `check_file_length.py` +(see `feedback_no_noqa`). If a future package legitimately needs a +shorter README (e.g. a single-file helper with a self-explanatory +filename), restructure rather than carve out an allowlist entry. + +Exit codes: + 0 — every package with code has a non-trivial, structured README + 1 — at least one package is missing a README, has a stub one, or + lacks the required heading + 2 — `src/` does not exist (run from the wrong directory?) + +Usage (from repo root): + + python .github/scripts/check_src_readmes.py +""" + +from __future__ import annotations + +import re +import sys +from pathlib import Path + +SRC_ROOT = Path("src") +MIN_BYTES = 200 + +# Required heading shapes. Each tuple is "any-of" — a README satisfies the +# rule if it contains at least one matching heading. Match is anchored to +# line start, case-insensitive, allows `#` levels 2-4 so a deeply nested +# subsection still counts. +KEY_INTERFACES_HEADING = re.compile( + r"^#{2,4}\s+(Key interfaces|Public surface)\b", + re.IGNORECASE | re.MULTILINE, +) + + +def _normalised(path: Path) -> str: + return path.as_posix() + + +def _has_documentable_code(directory: Path) -> bool: + """True when the directory has at least one `.py` file beyond `__init__.py`.""" + for entry in directory.iterdir(): + if entry.is_file() and entry.suffix == ".py" and entry.name != "__init__.py": + return True + return False + + +def _readme_failure(directory: Path) -> str | None: + """Return an error message if the directory's README is missing or stub-sized.""" + readme = directory / "README.md" + if not readme.is_file(): + return ( + f"::error file={_normalised(directory)}::missing README.md. " + "`CLAUDE.md` requires every `src/` package to document its " + "purpose and key interfaces." + ) + body = readme.read_text(encoding="utf-8").strip() + if len(body.encode("utf-8")) < MIN_BYTES: + return ( + f"::error file={_normalised(readme)}::README.md is shorter than " + f"{MIN_BYTES} bytes after stripping whitespace. Add purpose + " + "key-interfaces text — a single heading does not satisfy the rule." + ) + if not KEY_INTERFACES_HEADING.search(body): + return ( + f"::error file={_normalised(readme)}::README.md missing a " + "`## Key interfaces` heading (or the synonym `## Public surface`). " + "Per CLAUDE.md the README must document the package's public " + "surface; the heading anchors that section so contributors can " + "find it. Add the heading and list the package's exported names." + ) + return None + + +def main() -> int: + if not SRC_ROOT.is_dir(): + print(f"::error::{SRC_ROOT.as_posix()} not found; run from repo root") + return 2 + + audited: list[Path] = [] + failures: list[str] = [] + + for directory in sorted(p for p in SRC_ROOT.rglob("*") if p.is_dir()): + if directory.name == "__pycache__": + continue + if not _has_documentable_code(directory): + continue + audited.append(directory) + message = _readme_failure(directory) + if message is not None: + failures.append(message) + + if failures: + for line in failures: + print(line) + print( + f"\n{len(failures)} package(s) failed the README audit. " + "Fix in this PR — there is no exemption mechanism, see the " + "module docstring." + ) + return 1 + + print( + f"src/ README audit OK — {len(audited)} package(s) documented " + f"(min {MIN_BYTES} bytes, `## Key interfaces` heading required)." + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fcda20a..2f0b2cb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -151,6 +151,19 @@ jobs: python-version: "3.14" - run: python .github/scripts/check_tests_present.py + src-readmes: + name: src/ README audit + runs-on: ubuntu-latest + # CLAUDE.md: every `src/` package documents its purpose + key + # interfaces. The audit checks shape (presence + min 200 bytes) and + # structure (`## Key interfaces` heading). No exemption mechanism. + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.14" + - run: python .github/scripts/check_src_readmes.py + frontend-build: name: Frontend Build runs-on: ubuntu-latest diff --git a/pyproject.toml b/pyproject.toml index 27e014a..2e06350 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harness-python-react" -version = "0.2.0" +version = "0.2.1" description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend." readme = "README.md" requires-python = ">=3.14" diff --git a/src/api/README.md b/src/api/README.md new file mode 100644 index 0000000..9bda5e9 --- /dev/null +++ b/src/api/README.md @@ -0,0 +1,18 @@ +# `src/api/` + +FastAPI HTTP surface for the harness scaffold. Owns request handling, route registration, lifespan hooks, and the in-memory session store. Imports from `src/observability/` (instrumentation), `src/models/` (Pydantic contracts) — never the other way around (enforced by `import-linter`). + +## Key interfaces + +- **`main.app`** — the `FastAPI()` instance with CORS middleware, OTel auto-instrumentation, and the v1 router mounted. Exposed for `uvicorn src.api.main:app` and the FastAPI `TestClient` fixtures in `tests/conftest.py`. +- **`main.lifespan(app)`** — async context manager that runs `setup_tracing` → `setup_logging` → `instrument_httpx` → `instrument_fastapi` once on startup, then constructs the `SessionStore`. Single point of "what is process-globally configured?" +- **`routes.router`** — `APIRouter(prefix="/api/v1")`. Every route a real client should hit lives here; un-versioned routes live on `app` directly (only the FastAPI auto-generated docs at `/docs` and `/redoc` qualify). +- **`routes.health`** — `GET /api/v1/health` → `HealthResponse` (status + `importlib.metadata.version`-derived build version). +- **`routes.echo`** — `GET /api/v1/echo?msg=...` → `EchoResponse`. Demonstrates the request/response contract pattern that real domain endpoints follow. +- **`sessions.SessionStore`** — single-process in-memory dict mapping session id → conversation history. Plumbed onto `app.state.session_store` in lifespan; replace with a Redis or DB-backed implementation for multi-process deployments. + +## Conventions + +- Every route returns a `StrictModel` subclass — never a raw dict. The `extra="forbid"` config makes typos and renamed fields fail at construction, not three calls deep. +- Routes are versioned under `/api/v1/`. Adding a v2 endpoint = a new router, never a breaking change to v1. +- The CORS policy is wide-open in the scaffold so the Vite dev server proxy works on first run; tighten via config when a real auth layer arrives. diff --git a/src/eval/README.md b/src/eval/README.md new file mode 100644 index 0000000..088a8db --- /dev/null +++ b/src/eval/README.md @@ -0,0 +1,22 @@ +# `src/eval/` + +Provider-agnostic evaluation harness — golden QA cases driven by a caller-supplied `answer_fn`, three tolerance modes (exact / numeric / semantic), and a `Protocol`-based LLM-judge seam so no SDK leaks into this layer. Designed to be runnable on a fresh clone without LLM credentials (the example case uses `exact_match`); the nightly workflow opt-in is documented in `docs/EVAL_HARNESS.md`. + +## Key interfaces + +- **`models.EvalCase`** — Pydantic shape of an entry in `eval/golden_qa.json`: `id`, `question`, `expected_answer`, `tolerance`, plus optional `category` / `difficulty` / `expected_tools` / `notes`. +- **`models.EvalResult`** — per-case outcome: actual answer, latency, pass/fail, tolerance score, failure reason. +- **`runner.load_golden_dataset(path=None)`** — pure JSON loader. Standalone so call sites can introspect the dataset without paying for an `EvalRunner`. +- **`runner.EvalRunner(answer_fn, judge_client=None, judge_model="")`** — the orchestrator. `answer_fn: Callable[[str], str]` is the single seam — wire your agent loop, direct LLM client, or stub. `judge_client` implements the `judge.LLMClient` Protocol and is only consulted for `semantic_similar` cases. + - **`evaluate(case)`** — run one case; returns `EvalResult`. + - **`evaluate_all()`** — load the golden dataset, evaluate every case, return the list. +- **`judge.LLMClient`** — `Protocol` with one method: `complete_json(*, model, prompt) -> str`. Concrete adapters live in your agent code; the harness never imports OpenAI/Anthropic/Azure SDKs. +- **`judge.evaluate_semantic_similarity(question, expected, actual, client, model)`** — calls the judge, returns `(score, explanation)`. Returns `(None, "no LLM client configured")` when `client is None` (inconclusive — runner treats it as pass with an explanatory note rather than a hard fail). +- **`report.generate_report(results)`** — markdown summary: overall accuracy, per-category, per-difficulty, failure analysis with reason text. +- **`__main__`** — `python -m src.eval` runs the dataset with an identity `answer_fn` (echoes the question) and prints the markdown report. + +## Conventions + +- **`coverage` `omit`s this package** — the eval suite is exercised by `eval/test_golden_qa.py`, not by `tests/`. Counting it would inflate misses on every PR that touches behaviour without re-running the eval workflow. +- **No domain coupling.** The harness ships one trivial echo case; real users replace `eval/golden_qa.json` with their domain dataset and wire `answer_fn` to their agent. +- **Tolerance picks happen per case**, not per runner. A dataset can mix `exact_match`, `numeric_close` (within 1%), and `semantic_similar` (LLM judge ≥ 0.8) cases freely. diff --git a/src/models/README.md b/src/models/README.md new file mode 100644 index 0000000..321e68d --- /dev/null +++ b/src/models/README.md @@ -0,0 +1,17 @@ +# `src/models/` + +Pydantic contracts that cross module or process seams. Import-linter enforces that nothing outside this package matters here — `src.models` depends on **nothing in `src/`** (forbidden contract in `pyproject.toml`). The point: schema bugs surface at construction with clear `ValidationError`s, not as `AttributeError`s deep in the request path. + +## Key interfaces + +- **`_base.StrictModel`** — `BaseModel` subclass with `model_config = ConfigDict(extra="forbid")`. Inherit this for any contract that crosses a boundary; opt into stricter type coercion with `class Foo(StrictModel, strict=True)` when JSON coercion (UUID, int) is undesirable. +- **`health.HealthResponse`** — `status: Literal["ok"]` + `version: str`. Returned by `GET /api/v1/health`. +- **`session.SessionCreate`** / **`session.SessionInfo`** — the create-empty / public-info shapes used by `src.api.sessions.SessionStore`. +- **`config.Settings`** — `pydantic_settings.BaseSettings` reading the four `LLM_*` env vars (`LLM_PROVIDER`, `LLM_API_KEY`, `LLM_BASE_URL`, `LLM_MODEL`). Provider-pluggable seam — wire OpenAI, Anthropic, Azure, or vLLM by swapping the values without touching this code. +- **`config.get_settings()`** — fresh `Settings` constructor (deliberately unmemoised so tests can re-construct after `monkeypatch.setenv`). Real callers should hold a single instance at startup. + +## Conventions + +- One module per logical contract group — `health`, `session`, `config`. Add a new file rather than appending to an existing one. +- Per-class `strict=True` is the call site for any model that should reject `"3.14"` for a float; HTTP-boundary models often skip it because JSON requires the coercion. +- `pyproject.toml` `[tool.ruff.lint.per-file-ignores]` adds `"src/models/**" = ["TCH003"]` because Pydantic needs runtime imports for type annotations. diff --git a/src/observability/README.md b/src/observability/README.md new file mode 100644 index 0000000..d621674 --- /dev/null +++ b/src/observability/README.md @@ -0,0 +1,20 @@ +# `src/observability/` + +OpenTelemetry SDK setup, OTLP exporter wiring, structured-JSON logging with trace correlation, and span helpers built around the OTel GenAI semantic conventions. Wired once from `src.api.main.lifespan` so the rest of the codebase calls into a configured tracer / logger without orchestrating startup. + +## Key interfaces + +- **`tracing.setup_tracing()`** — creates the global `TracerProvider`, attaches a `BatchSpanProcessor` with the OTLP gRPC exporter (or `ConsoleSpanExporter` when `OTEL_EXPORTER=console`). Reads `OTEL_SERVICE_NAME` (default `harness-python-react`) and `OTEL_EXPORTER_OTLP_ENDPOINT` (default `http://localhost:4317`). +- **`tracing.instrument_fastapi(app)`** + **`tracing.instrument_httpx()`** — auto-instrumentation hooks. Each FastAPI request and outbound httpx call gets a span with the standard semantic-convention attributes. +- **`tracing.get_tracer(name)`** — retrieve a tracer by module name when manual instrumentation is needed. +- **`logging.setup_logging(level=None)`** — configures stdlib `logging` to emit single-line JSON via `_JSONFormatter`. Reads `LOG_LEVEL` env when no argument is passed. Loops in `LoggingInstrumentor` so the `otelTraceID` / `otelSpanID` fields land on every record. +- **`logging._JSONFormatter`** — the formatter; tested directly in `tests/test_observability.py` for the trace-correlation contract. +- **`spans.agent_span(name, attributes=None)`** — context manager that opens a span, sets initial attributes from a `Mapping`, yields the live span for further mutation. Use this rather than calling `tracer.start_as_current_span` directly so the attribute-shape stays consistent. +- **`spans.set_span_attributes(span, **kwargs)`** — set multiple attributes; `None` values are silently skipped. +- **`spans.GENAI_*` / `DB_*` constants** — exported attribute-key constants for the OTel GenAI + database semantic conventions. Use these instead of raw strings so a typo is a `NameError`, not a silently-different attribute. + +## Conventions + +- **Semconv keys only.** The constants at the top of `spans.py` are the single source of attribute names. Adding a custom `agent.foo` attribute is a hard no — extend the semconv list (or wait for upstream OTel to bless one). +- **Provider-agnostic exporter.** `OTEL_EXPORTER_OTLP_ENDPOINT` is the standard variable; `docker-compose.yml` sets it to `http://jaeger:4317` in the compose network. +- **Lifespan-only setup.** Don't call `setup_tracing()` from anywhere except `src.api.main.lifespan` — duplicate calls trigger OTel's "Overriding of current TracerProvider" warning, and tests rely on the test-fixture variant that attaches an in-memory exporter to whatever provider exists. diff --git a/src/tools/README.md b/src/tools/README.md new file mode 100644 index 0000000..ad54658 --- /dev/null +++ b/src/tools/README.md @@ -0,0 +1,19 @@ +# `src/tools/` + +Generic tool registry — the dispatcher pattern an LLM-driven agent uses to call typed tools. Each tool is a function `StrictModel -> StrictModel`; the registry maps a name to `(input_schema, callable)` and validates the dict-shaped input against the schema before dispatch. Layer-wise sits below `agent` / `api` / `eval`, above `data` / `models` (enforced by `import-linter`). + +## Key interfaces + +- **`registry.Registry`** — instance class. Construct one per agent/test fixture; the module also exposes a global `registry` singleton that the example `echo_tool` self-registers into at import time. + - **`register(name, input_schema)`** — decorator factory. Wraps a function and inserts the `(input_schema, fn)` pair under `name`. Raises `ValueError` on duplicate registration. + - **`dispatch(name, raw_input)`** — `name`-lookup → `input_schema.model_validate(raw_input)` → call. Pydantic `ValidationError` propagates on bad input (wrong type or unknown keys via `StrictModel.extra="forbid"`). + - **`names()`** — sorted list of registered tool names; cheap introspection for the agent's tool-listing prompt. +- **`registry.UnknownToolError`** — `KeyError` subclass raised by `dispatch` when *name* isn't registered. Caught and rendered as a tool-call-error event by the agent loop. +- **`registry.registry`** — the module-global `Registry` instance. Real agents use this so the `echo_tool` is reachable from any consumer; tests construct private `Registry()` instances to avoid cross-test contamination. +- **`registry.echo_tool`** + **`EchoToolInput`** + **`EchoToolOutput`** — example tool demonstrating the layer. Ships pre-registered under name `"echo"` to give a working dispatch path on a fresh clone. + +## Conventions + +- **Inputs and outputs are `StrictModel`s.** Avoid raw `dict`s — the `extra="forbid"` posture is the entire reason this pattern beats hand-rolled `if/elif` dispatch over JSON. +- **Tools are pure functions** of their input. State the tool needs (DB connection, HTTP client) is injected via partial / closure at registration time, not via module globals. +- **Add a tool = add a module.** Real tools beyond the example get their own file under `src/tools/` and self-register the same way `echo_tool` does. Keep `registry.py` itself focused on the dispatcher. diff --git a/uv.lock b/uv.lock index 9ca9a26..df07bc8 100644 --- a/uv.lock +++ b/uv.lock @@ -328,7 +328,7 @@ wheels = [ [[package]] name = "harness-python-react" -version = "0.2.0" +version = "0.2.1" source = { virtual = "." } dependencies = [ { name = "fastapi" },