diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 76b3a07..c03099a 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -12,6 +12,14 @@
"version": "0.1.0",
"category": "ruby",
"keywords": ["ruby", "bundler", "gem", "dependencies"]
+ },
+ {
+ "name": "security",
+ "source": "./plugins/security",
+ "description": "White-box, dynamically-verified security audit. /security:audit recons a repo, hunts OWASP Top 10:2025 vulnerabilities, proves them with live PoCs in isolated worktrees, and writes a high-signal senior-engineer report.",
+ "version": "0.1.0",
+ "category": "security",
+ "keywords": ["security", "pentest", "vulnerability", "audit", "owasp", "appsec"]
}
]
}
diff --git a/README.md b/README.md
index 5ee7465..0b9e3c1 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@ If the plugin's commands don't show up in the `/` menu, run `/reload-plugins`.
| Plugin | Description |
| --- | --- |
| [gem](plugins/gem) | Ruby gem helpers. Includes `/gem:bump` for changelog-rich dependency bumps. |
+| [security](plugins/security) | Dynamically-verified security audit. `/security:audit` proves vulnerabilities with live PoCs and writes a senior-engineer report. |
## Developing plugins
diff --git a/plugins/security/.claude-plugin/plugin.json b/plugins/security/.claude-plugin/plugin.json
new file mode 100644
index 0000000..0cfc554
--- /dev/null
+++ b/plugins/security/.claude-plugin/plugin.json
@@ -0,0 +1,13 @@
+{
+ "name": "security",
+ "version": "0.1.0",
+ "description": "White-box, dynamically-verified security audit. /security:audit recons a repo, hunts vulnerabilities across the OWASP Top 10:2025 classes, proves them with live PoCs in isolated worktrees, and writes a high-signal senior-engineer report.",
+ "author": {
+ "name": "84codes",
+ "url": "https://github.com/84codes"
+ },
+ "homepage": "https://github.com/84codes/claude-plugins/tree/main/plugins/security",
+ "repository": "https://github.com/84codes/claude-plugins",
+ "license": "MIT",
+ "keywords": ["security", "pentest", "vulnerability", "audit", "owasp", "appsec", "sast"]
+}
diff --git a/plugins/security/AGENTS.md b/plugins/security/AGENTS.md
new file mode 100644
index 0000000..1ab4295
--- /dev/null
+++ b/plugins/security/AGENTS.md
@@ -0,0 +1,199 @@
+# vuln-audit — agent & design spec
+
+A Claude Code **skill + workflow** that runs a white-box, dynamically-verified
+security audit of a target repository: a
+multi-phase pipeline (recon → triage → deep review → adversarial verify →
+dynamic repro → report) that produces **proven, high-signal findings with
+high-level proposed fixes**, not speculative noise.
+
+> Read this file before touching the workflow or prompts. It is the source of
+> truth for the data contracts, taxonomy, severity model, and signal policy.
+
+## Invocation
+
+```
+/security:audit /path/to/target-repo [--no-dynamic] [--classes injection,ssrf] [--out <dir>]
+```
+
+The skill (`skills/audit/SKILL.md`) is the agent-facing entry point. It parses the
+target, picks a writable `outDir`, preflights host capabilities, then calls the
+workflow (`workflows/vuln-audit.js`) with everything assembled in `args` —
+`toolRoot` = `${CLAUDE_PLUGIN_ROOT}` (read-only, holds the prompts), `outDir` =
+where the bundle is written.
+
+## Pipeline
+
+| Phase | What | Primitive |
+|-------|------|-----------|
+| 1. Recon | Detect stack, map attack surface & trust boundaries, pick run strategy, select relevant finder classes | single agent (`prompts/recon.md`) |
+| 2. Triage | One finder per vuln class scans its surface, emits candidate findings (all share `prompts/finder.md`; the workflow injects the class + its OWASP/CWE/ASVS) | `parallel()` finders |
+| 3. Dedup | Collapse same-root-cause findings across call sites | plain JS in the workflow |
+| 4. Deep review | Re-examine each candidate with surrounding context (callers, sanitizers, related files); confirm a reachable source→sink path | `pipeline()` stage |
+| 5. Adversarial verify | Independent skeptics, each a distinct lens, try to **refute** the finding; majority-refute kills it | `parallel()` skeptic panel |
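+| 6. Dynamic repro | Survivors are built & run in an isolated git **worktree** of the target (docker-first, via `prompts/playbook.md`); a real PoC is fired and impact observed | `agent()` that creates its own `git worktree` of the target (the `isolation:'worktree'` option is not used — see Runtime notes) |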
+| 7. Report | Synthesize the senior-engineer report (`prompts/report-template.md`) | single agent |
+
+## Reference evaluation (why we adopt what we adopt)
+
+- **Anthropic security-guidance** (`code.claude.com/docs/en/security-guidance`)
+ — **adopt methodology.** Validates our core moves: (a) review independence —
+ the reviewer is a *fresh-context* agent, never the author, "instructed only to
+ find problems"; (b) read callers/sanitizers/related files before reporting to
+ keep false positives low. Our tool is the deepest layer: in-session plugin →
+ `/security-review` (branch) → Code Review (PR) → **vuln-audit (on-demand,
+ dynamically verified PoCs)**. We honor its extension convention: if the target
+ has a `.claude/claude-security-guidance.md`, we load it as extra threat-model
+ context.
+- **OWASP Top 10:2025** — **adopt as primary taxonomy.** Current edition; new
+ categories A03 Software Supply Chain Failures and A10 Mishandling of
+ Exceptional Conditions; SSRF folded into A01. Every finder maps to a 2025 ID.
+- **OWASP ASVS v5.0** (17 chapters, ~350 reqs) — **reference only, not a walked
+ checklist.** Walking 350 requirements is exactly the low-signal sidetrack we
+ avoid. Used two ways: (a) coverage map so the finder taxonomy has no blind
+ spots; (b) cite a requirement/chapter ID in findings as a terse, authoritative
+ reference for senior readers.
+- **OSSF Scorecard** — **partial adopt, code-exploitable checks only.** Scorecard
+ scores project *hygiene/posture* (Maintained, License, SBOM, Security-Policy,
+ Contributors) — out of scope for findings. But its CI/CD checks ARE real
+ exploitable issues and feed our `supply-chain` finder: Dangerous-Workflow
+ (`pull_request_target` + untrusted checkout, `${{ }}` script injection),
+ Token-Permissions (over-broad `GITHUB_TOKEN`), Pinned-Dependencies (unpinned
+ actions/deps), Vulnerabilities (known-vuln deps via OSV). Posture/process
+ checks are relegated to the Info appendix, never the high-priority body.
+
+## Vuln-class taxonomy (finders)
+
+Each maps to OWASP Top 10:2025 + CWE + an ASVS v5.0 chapter. The mapping is the
+`CLASS_META` table in `workflows/vuln-audit.js` (single source of truth); all
+classes share one method prompt, `prompts/finder.md`, with the per-class context
+injected by the workflow.
+
+| key | title | OWASP 2025 | ASVS |
+|-----|-------|-----------|------|
+| access-control | Broken Access Control & IDOR | A01 | V8 |
+| ssrf | Server-Side Request Forgery | A01 | V4 |
+| injection | Injection (SQL/NoSQL/OS/LDAP) | A05 | V1/V2 |
+| xss-ssti | XSS & Template Injection | A05 | V1/V3 |
+| auth-session | Authentication & Session | A07 | V6/V7/V9/V10 |
+| crypto | Cryptographic Failures | A04 | V11 |
+| deserialization | Insecure Deserialization & Integrity | A08 | V2/V15 |
+| path-file | Path Traversal & File Handling | A01 | V5 |
+| secrets | Hardcoded Secrets & Credentials | A02 | V14 |
+| misconfig | Security Misconfiguration | A02 | V13 |
+| supply-chain | Software Supply Chain & CI/CD | A03 | V15 |
+| logging-errors | Logging, Error & Exception Handling | A09/A10 | V16 |
+| dos-redos | Denial of Service & ReDoS | A06 | V2 |
+| csrf-cors | CSRF, CORS & Clickjacking | A01 | V3 |
+
+Insecure Design (A06) is cross-cutting and handled in recon/synthesis, not a
+grep-able finder.
+
+## Data contracts
+
+### Finding (finders + deep review)
+`id` · `title` · `vuln_class` · `owasp` (A0x:2025) · `cwe` · `asvs` ·
+`severity` (critical|high|medium|low|info) · `status` (confirmed|likely|triage) ·
+`confidence` (low|medium|high) · `file` · `line` · `end_line` ·
+`code_excerpt` · `source` (untrusted origin) · `sink` (dangerous op) ·
+`data_flow` (source→sink, sanitizers noted) · `sanitizers_checked` (mitigations
+verified absent/ineffective — the FP guard) · `rationale` · `exploit_sketch` ·
+`dynamic_poc_plan` · `proposed_fix` (high-level direction of the change, not a
+patch — implementation is left to whoever takes the issue).
+
+After the pipeline, each finding is also stamped with `fp` (stable fingerprint =
+`djb2(vuln_class | file | sink)`, the cross-scan dedup key), `display_id`
+(`<repo>-<CLASS>-<fp4>`, provisional until the courier swaps in the GitHub issue
+number), `status`, `kept`, `reject_reason`, `verdicts`, and `repro`.
+
+### Verdict (adversarial verify)
+`finding_id` · `lens` · `refuted` (bool) · `confidence` · `reasoning`.
+
+### Repro (dynamic verify)
+`finding_id` · `reproduced` (bool) · `method`
+(live-exploit|unit-test|build-only|static-poc) · `environment` ·
+`setup_commands` · `poc` · `observed` (evidence) · `impact` · `notes`.
+
+## Severity model (exploitability × impact)
+
+- **Critical** — remote, unauth → RCE / full data breach / auth bypass; reachable.
+- **High** — low barrier (authenticated or realistic conditions); significant
+ impact (priv-esc, sensitive data, injection with a real sink).
+- **Medium** — unusual conditions or limited impact, or partial mitigations.
+- **Low** — minor info leak, defense-in-depth gap, hard to exploit.
+- **Info** — hygiene/posture, no direct exploit path.
+
+`status` is orthogonal and drives report placement: **confirmed** (dynamically
+reproduced or statically proven + survived verify), **likely** (strong proof, no
+live repro), **triage** (unverified / split verdicts). Only confirmed+likely go
+in the report body; triage goes to an appendix.
+
+## Signal discipline (the anti-noise contract)
+
+The report is for senior engineers. Stay high-signal — enforced in deep review
+and verify:
+
+- Report only issues with a **reachable** path from untrusted input to a
+ dangerous sink. Check for sanitizers/validators/authz on the path first; if
+ present and effective, drop it.
+- No style/lint nits. No generic "defense-in-depth" without a concrete sink. No
+ unreachable/dead code.
+- Posture/process items (missing SECURITY.md, SBOM, license, maintainership) →
+ Info appendix only, never the body.
+- Dedup: one finding per root cause, list N locations.
+- Prefer few proven findings over many speculative ones. Every High+ finding
+ carries a PoC or an explicit source→sink trace.
+
+## Layout
+
+```
+.claude-plugin/plugin.json # plugin manifest (name: security)
+skills/audit/SKILL.md # agent-facing orchestrator (/security:audit)
+workflows/vuln-audit.js # the Workflow script (the engine)
+prompts/recon.md # phase-1 recon prompt
+prompts/finder.md # shared finder method (per-class context injected by the workflow)
+prompts/playbook.md # shared build/run/exploit repro playbook (stack-agnostic)
+prompts/report-template.md # the report format (phase 7)
+docs/issue-tracking.md # output bundle → GitHub issues + naming rules
+```
+(The output bundle is written to a writable `outDir`, NOT into the plugin root,
+which is read-only/ephemeral.)
+
+Schemas live inline in the workflow (the JS sandbox has no filesystem access at
+runtime); prose content lives in `prompts/` so it is editable without touching
+the script, and is passed into the workflow via `args`.
+
+## Output bundle (VM → courier handoff)
+
+The scan runs on a VM and emits a self-contained **bundle** at
+`reports/<slug>/`; a separate "courier" agent SSHes in, fetches it, and files the
+issues (the courier holds the only GitHub creds — the VM holds none). Bundle:
+
+- `report.md` — the human report (findings referenced by `display_id`).
+- `findings.json` — the structured findings array, verbatim; the machine
+ interface the courier reconciles against, **keyed by `fp`**.
+- `manifest.json` — `{ tool, schema, repo (owner/repo), target_path, ref,
+ commit, slug, date, dynamic, classes_assessed, counts }`; `repo` tells the
+ courier where to file.
+- `evidence/` — optional captured PoC output (repro evidence also lives inline
+ in `findings.json`).
+
+**Issue tracking & the vulnerability ID/naming rules** (scan epic → finding
+sub-issues, reconcile by `fp`, `display_id` = `<repo>-<CLASS>-<n>`, the
+courier emitter, and what each host needs) live in
+[`docs/issue-tracking.md`](docs/issue-tracking.md) — the portable source of truth
+that travels with the repo.
+
+## Runtime notes (gotchas)
+
+- **`args` arrives as a JSON string.** The Workflow runtime delivers the `args`
+ payload to the script as a JSON *string*, not a parsed object (verified
+ empirically). `vuln-audit.js` normalizes it (`typeof args === 'string' ?
+ JSON.parse(args) : args`) before reading any input — do not remove this.
+- **Invoke by `scriptPath`, not `name`, mid-session.** Named-workflow discovery
+ only registers files that existed at session start.
+- **Subagents have full tools** (Read/Grep/Bash/Write/ast-grep, and web via
+ ToolSearch) and operate on the *target*; only the orchestration JS is
+ sandboxed. Dynamic repro creates its own `git worktree` of the target — the
+ `isolation:'worktree'` option is about the tool repo and is not used here.
+- **Host adaptivity:** pass `hostNotes` so recon picks a runnable strategy
+ (docker vs native) the host can actually execute.
diff --git a/plugins/security/README.md b/plugins/security/README.md
new file mode 100644
index 0000000..458961a
--- /dev/null
+++ b/plugins/security/README.md
@@ -0,0 +1,85 @@
+# security (vulnerability audit)
+
+A white-box, **dynamically-verified** security-audit plugin for internal
+pentests. `/security:audit` points at a repo you own, recons it, hunts
+vulnerabilities across the OWASP Top 10:2025 classes, **proves them with live
+PoCs in isolated git worktrees**, and writes a terse, senior-engineer report —
+proven findings with a high-level proposed fix, not speculative noise.
+
+## Install
+
+```
+/plugin marketplace add 84codes/claude-plugins
+/plugin install security@84codes
+```
+
+Then run `/reload-plugins` if the command doesn't appear.
+
+## Usage
+
+```
+/security:audit /abs/path/to/target-repo
+/security:audit /abs/path/to/target-repo --no-dynamic
+/security:audit /abs/path/to/target-repo --classes injection,ssrf,access-control --ref v1.2.0
+/security:audit /abs/path/to/target-repo --out /abs/writable/dir
+```
+
+The first argument is the path to the target repo (required). The flags:
+
+| Flag | Meaning |
+|------|---------|
+| `--no-dynamic` | Skip the build/run/PoC phase — static review + adversarial verify only. |
+| `--classes` | Comma-separated vuln-class keys to restrict the audit to (e.g. `injection,ssrf,access-control`; see [`AGENTS.md`](AGENTS.md) for the full taxonomy). Default: classes picked by recon. |
+| `--ref` | Git ref to audit. Default: `HEAD`. |
+| `--out` | Writable directory for the output bundle. Default: `/vuln-audit-reports`. |
+
+The output **bundle** is written to `<out>/<slug>/`: `report.md` +
+`findings.json` + `manifest.json`.
+
+## How it works
+
+```
+recon → triage → consolidate → deep review → adversarial verify → dynamic PoC → report
+```
+
+| Phase | Purpose |
+|-------|---------|
+| Recon | Detect stack, map attack surface, pick relevant vuln classes + run strategy. |
+| Triage | One finder agent per relevant class emits candidates. |
+| Consolidate | Dedup by root cause, assign IDs, drop low-signal noise. |
+| Deep review | Confirm a reachable source→sink path with no mitigation. |
+| Adversarial verify | Independent skeptics try to refute each finding; majority kills it. |
+| Dynamic PoC | Build + run the target in an isolated worktree; fire a real exploit. |
+| Report | Senior-engineer report: severity-first, reference-backed, PoC-evidenced. |
+
+## Requirements
+
+- `git` (target must be a git repo for worktree isolation + the live-PoC phase).
+- `docker` for dynamic verification (works via `sudo` if the daemon needs it);
+ otherwise repro falls back to unit-test/static PoCs (`--no-dynamic` skips it).
+- No security scanners required — the tool is LLM-native and uses
+ `semgrep`/`gitleaks`/`trivy` only opportunistically if present.
+
+## Output & issue tracking
+
+Findings carry a stable fingerprint (`fp`) and a `display_id`
+(`<repo>-<CLASS>-<n>`). The bundle is designed to be filed to GitHub issues by a
+separate courier step (scan epic + per-finding sub-issues for Critical/High/
+Medium, reconciled by `fp`). See [`docs/issue-tracking.md`](docs/issue-tracking.md).
+
+## Design
+
+Full pipeline spec, vuln-class taxonomy (OWASP 2025 + CWE + ASVS), data
+contracts, and the signal-discipline policy are in
+[`AGENTS.md`](AGENTS.md).
+
+## Safety & scope
+
+Authorized testing only — audit repositories you own or are explicitly cleared
+to test. All PoC traffic is contained to local processes/containers; the tool
+never fires exploits at external hosts, uses real credentials, or exfiltrates
+data.
+
+## License
+
+MIT
diff --git a/plugins/security/docs/issue-tracking.md b/plugins/security/docs/issue-tracking.md
new file mode 100644
index 0000000..36e972b
--- /dev/null
+++ b/plugins/security/docs/issue-tracking.md
@@ -0,0 +1,118 @@
+# Output handling — findings → GitHub issues
+
+How a scan's findings become tracked, fixable, closeable GitHub issues. This is
+the source of truth for the **vulnerability ID / naming rules** and the
+scan→courier→GitHub pipeline. (Design locked 2026-06-02.)
+
+## Topology: scanner VM + courier
+
+Scans run on a **VM**; a separate **courier** agent SSHes in, fetches the scan's
+output, and files it to GitHub. The two run on different hosts on purpose:
+
+- The **VM** runs `/security:audit`, handles untrusted code and working exploits, and
+ holds **no GitHub credentials**.
+- The **courier** holds the only GitHub creds, fetches the bundle read-only over
+ SSH, and creates/updates issues. It is a *pure function of the bundle* — it
+ needs no access to the target source or the VM's git state.
+
+## The bundle (the scan→courier interface)
+
+Each scan drops a self-contained bundle at `reports/<slug>/` on the VM:
+
+| File | Purpose |
+|------|---------|
+| `report.md` | Human report (findings headed by `display_id`). |
+| `findings.json` | Structured findings array, **verbatim**; the machine interface, **keyed by `fp`**. |
+| `manifest.json` | `{ tool, schema, repo (owner/repo), target_path, ref, commit, slug, date, dynamic, classes_assessed, counts }`. `repo` tells the courier where to file. |
+| `evidence/` | Optional captured PoC output (repro evidence also lives inline in `findings.json`). |
+
+## Vulnerability ID / naming rules
+
+- **Fingerprint** `fp = djb2(vuln_class | file | sink)` (lowercased; line number
+ excluded to reduce churn). This is the **stable, cross-scan dedup key** — same
+ bug → same `fp`, computed identically on the VM and the courier with no shared
+ state. Stored on each issue as a `fp:<fp>` label.
+- **Display ID** `<repo>-<CLASS>-<n>` — e.g. `training-tool-AC-42`. `<repo>` is the
+ repo name, `<CLASS>` the short class code (AC, SSRF, INJ, XSS, AUTH, CRYPTO,
+ DESER, PATH, SEC, MISC, SUPPLY, LOG, DOS, CSRF), and **`<n>` is the GitHub issue
+ number**. So `training-tool-AC-42` *is* `84codes/training-tool#42` — one number,
+ both meanings, permanent (GitHub never reuses issue numbers).
+- **Provisional form** `<repo>-<CLASS>-<fp4>` (first 4 hex of `fp`, e.g.
+ `training-tool-AC-b4a0`) — used in the VM-side `report.md` *before* an issue
+ exists. The courier stamps the final `-<n>` ID into the issue at filing;
+ `fp` is the glue linking the two forms.
+- Numbers are **not contiguous per class** (GitHub shares the counter with PRs and
+ other issues) — that is fine; the class prefix carries the meaning.
+
+## Issue model
+
+- **Scan issue** (epic), one per run: holds the report (as a comment, see
+ **Report comment** below) + general comments; closes when all its finding
+ sub-issues close.
+- **Finding sub-issue**, one per **Critical / High / Medium** (confirmed+likely).
+ **Low/Info stay in the report appendix — never issues** (same high-signal
+ contract as the report).
+- **Title:** `[Critical] training-tool-AC-42: <title> (access-control)`.
+- **Body:** the report's finding block (refs · location · PoC · impact ·
+ proposed fix) + backlink to the scan issue + the `fp` marker.
+- **Labels:** `security`, `security-scan` (epic), and `fp:<fp>` (the dedup
+ key). Severity and class aren't labels — they live in the title
+ (`[Critical] … (access-control)`) and the display ID, so a `sev:`/`vuln:`
+ label would just duplicate that text.
+- **Two distinct "statuses":** *verification* (confirmed/likely — a scan output,
+ carried as the finding's badge in the title/body) vs *lifecycle* (open/fixed —
+ owned entirely by the GitHub issue). The **report has no status table**; the
+ scan epic and its sub-issues are the live status.
+- **PoC handling:** repos are private/internal, so full PoC commands go in the
+ issues (the remediation is a high-level *proposed fix*, not a patch). (If a target were public, use GitHub Security Advisories for
+ Critical/High instead.)
+- **Report comment:** the full `report.md` is **embedded** in a comment on the
+ scan epic, wrapped in a `<details><summary>…</summary>…</details>` block (collapsed by
+ default) so the long report never buries the epic's sub-issue checklist. Always
+ embed the report text itself — **never** reference a local bundle path
+ (`reports/<slug>/…`) or any filesystem location, which is unreachable from
+ GitHub. The epic body points readers to this comment, not to disk.
+
+## Reconcile algorithm (idempotent, keyed by `fp`)
+
+For each Critical/High/Medium finding in `findings.json`, look up existing issues
+by the `fp:<fp>` label (`gh issue list --search "label:fp:<fp>" --state all`):
+
+- **no match** → create the finding issue, link it under the scan epic.
+- **open match** → comment "still present in scan `<slug>`" (no duplicate).
+- **closed match that still reproduces** → reopen as a regression + comment.
+- **previously open, now absent / not reproduced** (dynamic re-verify) → comment +
+ close.
+
+The **report comment** on the epic is upserted the same way: tag it with a
+hidden marker (an HTML comment, e.g. `<!-- security-audit-report -->`), then find-and-edit that comment
+on re-run instead of posting a new one — so the epic never accumulates
+duplicate report blocks.
+
+Re-running the courier on the same bundle is a no-op. The dynamic-repro phase
+doubles as the fix-verifier, so "everything closed when done" is provable, not
+manual.
+
+## Close loop
+
+Fix PRs use `Fixes #N` to auto-close the finding issue on merge; the next scan
+confirms via dynamic re-verify. When all finding sub-issues are closed, the scan
+epic closes.
+
+## Build status
+
+1. **Done (2026-06-02)** — the workflow emits the bundle and stamps `fp` +
+ provisional `display_id`. See `workflows/vuln-audit.js`.
+2. **Not built yet** — the `/security:track <bundle-path>` courier skill +
+ `gh` emitter. Blocked on `gh` being installed + authed on the courier host.
+3. **Always gated** — creating real issues on a repo needs an explicit go-ahead.
+
+## What each host needs
+
+| Host | Role | Requirements |
+|------|------|--------------|
+| **VM (scanner)** | runs `/security:audit`, produces the bundle | Claude Code · this repo · `git` · `docker` · **no `gh`, no GitHub creds** |
+| **Courier** | fetches bundle, files issues | Claude Code · this repo (for `/security:track`) · **`gh` + `gh auth login`** (token: Issues read/write) · **SSH key to the VM** (`ssh`/`rsync`) · `jq` (optional) |
+
+Sub-issue linking uses GitHub's GraphQL API, which `gh api graphql` covers — no
+extra tooling.
diff --git a/plugins/security/prompts/finder.md b/plugins/security/prompts/finder.md
new file mode 100644
index 0000000..9dab008
--- /dev/null
+++ b/plugins/security/prompts/finder.md
@@ -0,0 +1,76 @@
+
+
+# Finder — method (one vuln class per run)
+
+The workflow tells you which class to hunt and gives its OWASP/CWE/ASVS mapping
+and a one-line focus. You know this class well — apply that knowledge; the focus
+hint scopes it, it is not an exhaustive checklist.
+
+## 1. Taint model (the only thing that makes a finding)
+
+A finding is a REACHABLE path from an untrusted SOURCE to a dangerous SINK with
+NO effective control on the path. Miss any of the three and it is not a finding.
+
+- SOURCE — untrusted input: HTTP query/body/path/header/cookie, JSON/multipart
+ fields & filenames, GraphQL args, queue/webhook payloads, parsed file
+ contents, and DB rows that were originally user-set (second-order). When in
+ doubt, treat input as untrusted until a boundary proves otherwise.
+- SINK — the dangerous operation for this class (the interpreter, renderer,
+ deserializer, file op, outbound call, authz decision, crypto primitive, ...).
+- PATH — the source must actually reach the sink at runtime given routing, auth
+ guards, and feature flags. Dead/unreachable code is not a finding.
+
+## 2. How to hunt
+
+1. Start from recon's prioritized surfaces for this class, then widen.
+2. Grep/ast-grep for this class's sinks; for each hit, trace backward to a
+ source and forward through any control. Read the surrounding code and callers,
+ not just the matched line.
+3. Use your own knowledge of the language/framework for the exact sink and safe
+ APIs — do not assume any list is complete. A sink you know but isn't named
+ anywhere is still a sink.
+
+## 3. False-positive guard (check BEFORE flagging)
+
+Before emitting, prove the control on the path is absent or ineffective. A
+finding survives only if there is no effective:
+
+- parameterization / prepared statement / structural builder (injection),
+- context-correct output encoding / auto-escaping (xss),
+- canonicalize-then-confined-root check (path),
+- allowlist / typed cast that drops dangerous values,
+- authn/authz/ownership check on the route (access-control, csrf),
+- safe deserializer / signature+integrity verification (deserialization),
+- destination allowlist + no-redirect + internal-range block (ssrf).
+
+A control that exists but is bypassable (denylist instead of allowlist, wrong
+context, escaping that misses an encoding, a cast that silently coerces) is NOT
+a mitigation — flag it and name the exact bypass. Record what you checked in
+`sanitizers_checked`; that field is the FP guard made explicit. Posture/process
+items, style nits, and defense-in-depth without a concrete sink are not findings
+(see AGENTS.md signal discipline).
+
+## 4. Severity & status
+
+Score per the AGENTS.md severity model (exploitability x impact): Critical =
+remote unauth high-impact reachable; down to Info = no direct exploit path.
+Set `status`: `likely` for a proven static source->sink trace, `confirmed` only
+after dynamic repro, `triage` if reachability or source is uncertain.
+
+## 5. Emit
+
+Return `{findings:[...]}` (or `{findings:[]}` if nothing real). One object per
+distinct root cause — dedup call sites into `locations[]`, note extras in
+`rationale`. The output schema is enforced by the workflow; fill it accurately.
+Set `owasp`/`cwe`/`asvs` from the class context the workflow gave you (pick the
+most specific CWE for the actual bug). `source`, `sink`, `data_flow`, and
+`sanitizers_checked` must be concrete and true — `data_flow` traces variables
+source->sink and states why no control stops it; `sanitizers_checked` names each
+control checked and why it is absent or bypassable. Include an `exploit_sketch`
+and a `dynamic_poc_plan` (the oracle that would prove it on a running instance),
+and a high-level `proposed_fix` (the direction of the change, not a patch).
diff --git a/plugins/security/prompts/playbook.md b/plugins/security/prompts/playbook.md
new file mode 100644
index 0000000..e176659
--- /dev/null
+++ b/plugins/security/prompts/playbook.md
@@ -0,0 +1,110 @@
+
+
+# Repro playbook — build, run, prove (one finding)
+
+Conventions (substitute per finding so parallel repros never collide):
+
+- `FID` — the finding id; use it to make every name/port unique.
+- `WT=/tmp/va-$FID` — isolated git worktree. `IMG=va-$FID:repro` — image tag.
+ `CN=va-$FID` — container name. `PORT` — a free ephemeral host port.
+- The result must set `method` to one of:
+ `live-exploit | unit-test | build-only | static-poc`.
+
+## 1. Isolate
+
+Never touch the original tree. Create a throwaway worktree at the audited ref:
+
+```sh
+git -C <target> worktree add --detach /tmp/va-$FID "$REF" # $REF default HEAD
+cd /tmp/va-$FID
+```
+
+If `<target>` is not a git repo (rare), `cp -a <target> /tmp/va-$FID` and note
+it. All build/run steps run from `WT`.
+
+## 2. Build & run (docker-first)
+
+Use `recon.run_strategy`, `recon.stack`, and recon's boot notes (run command,
+port, prerequisite services) as your starting point — don't re-derive what recon
+already found.
+
+1. Repo ships Docker → prefer it; it usually wires up DB/env/migrations:
+ `docker compose -p va-$FID up -d --build`, else `docker build -t $IMG .`.
+2. No Dockerfile → write a minimal one for the detected stack: a recent stable
+ base image for the language, install the build deps the native packages need,
+ restore dependencies from the lockfile EXACTLY (never upgrade — that changes
+ the audited dependency set), then run the app's own start command on
+ `0.0.0.0`.
+3. Can't containerize → run natively if the host has the runtime (see HOST
+ CONSTRAINTS passed by the workflow).
+
+Run detached, bound to loopback only, on a finding-keyed port:
+
+```sh
+PORT=$(python3 -c 'import socket;s=socket.socket();s.bind(("",0));print(s.getsockname()[1]);s.close()')
+docker run -d --name $CN -p 127.0.0.1:$PORT:<app-port> $IMG
+```
+
+Bind the host port to `127.0.0.1` so the app is never exposed off-box. Poll for
+health, don't sleep blindly; on failure inspect `docker logs --tail 50 $CN`.
+
+## 3. Seed (only what the PoC needs)
+
+Create the minimum synthetic state — a throwaway user, a row, an auth session —
+using the app's own endpoints/console. Use only fake, local-only credentials;
+never reuse real secrets from the repo beyond what's strictly required to boot.
+
+## 4. Fire the PoC safely
+
+Send the exploit to the LOCAL instance only and capture concrete evidence.
+Tailor the oracle to the finding's source->sink path:
+
+- Injection — error/boolean/time oracle (unbalanced quote, `1=1` vs `1=2`,
+ `pg_sleep`/`SLEEP`), or an OS-command marker (`; sleep 5`, `| id`).
+- Path traversal / file read — pull a host file the app should never serve
+ (`?file=../../../../etc/passwd`).
+- SSRF — point at a CONTAINER-LOCAL canary listener you start, never a real host.
+- Deserialization / RCE — prove exec with a benign in-container side effect
+ (touch a sentinel file), then read it back; never run destructive commands.
+- XSS — confirm the payload is reflected unescaped in the response context.
+- Auth/access-control — perform the action as the wrong (or no) principal and
+ show it succeeds.
+
+Record for the result: the exact request (-> `poc`), the response/log line
+proving impact (leaked row, file contents, sentinel, reflected script, 500 with
+stack -> `observed`), and what it means for the target (-> `impact`). Set
+`reproduced: true`, `method: live-exploit`.
+
+Safety invariants (non-negotiable): traffic stays on `127.0.0.1` / inside `$CN`;
+no outbound connections; no real data; side effects are benign sentinels only.
+
+## 5. Teardown (always, even on failure)
+
+```sh
+docker rm -f $CN 2>/dev/null
+docker compose -p va-$FID down -v 2>/dev/null
+docker image rm -f $IMG 2>/dev/null
+cd / # leave the worktree before removing it
+git -C <target> worktree remove --force /tmp/va-$FID
+git -C <target> worktree prune
+```
+
+## 6. Fallbacks
+
+If a live exploit isn't achievable, downgrade deliberately and set `method`;
+never claim `reproduced: true` without observed runtime evidence.
+
+1. Builds but won't serve (library, or boot blocked) → drive the vulnerable API
+ directly from a focused unit test in the container. `method: unit-test`.
+2. Image builds but the app can't start (missing DB/config) → record that deps
+ install and the vulnerable code is present and reachable, with the
+ source->sink trace as evidence. `method: build-only`.
+3. Can't build at all (toolchain/network blocked) → construct a static PoC: the
+ exact crafted input plus the line-referenced source->sink path showing why it
+ triggers. `method: static-poc`, `reproduced: false`.
diff --git a/plugins/security/prompts/recon.md b/plugins/security/prompts/recon.md
new file mode 100644
index 0000000..97d0a48
--- /dev/null
+++ b/plugins/security/prompts/recon.md
@@ -0,0 +1,195 @@
+
+
+# Recon — Phase 1 (stack, surface, scope, run strategy)
+
+Work the steps in order. Each step's output feeds the recon summary in step 7.
+Be fast and broad first, then precise. When a step is ambiguous, prefer the
+reading that EXPANDS attack surface (assume input is untrusted until proven
+otherwise) but NARROWS finder selection (skip a class only when you can justify
+it). Cite concrete file paths and line numbers for every claim — recon that
+points later phases at real code is worth ten of generic prose.
+
+## 1. Detect stack, frameworks, and build/run system
+
+Identify the primary language(s), frameworks, and how the target builds and runs.
+Record ONE normalized `stack` label (a hint for Phase 6 repro and the report):
+
+`crystal · ruby · node · python · go · php · java-jvm · rust · generic-docker · ci-iac`
+
+Detection signals (read-only; do not install anything):
+
+- **Manifests / lockfiles** — the ground truth for language + package manager:
+ - crystal: `shard.yml`, `shard.lock`
+ - ruby: `Gemfile`, `*.gemspec`, `Gemfile.lock`
+ - node: `package.json` (+ `package-lock.json`/`pnpm-lock.yaml`/`yarn.lock`), `tsconfig.json`
+ - python: `pyproject.toml`, `requirements*.txt`, `Pipfile`, `setup.py`, `poetry.lock`
+ - go: `go.mod`, `go.sum`
+ - php: `composer.json`, `composer.lock`
+ - java-jvm: `pom.xml`, `build.gradle(.kts)`, `settings.gradle`, `*.jar`
+ - rust: `Cargo.toml`, `Cargo.lock`
+- **Build/run tells** — `Dockerfile`, `docker-compose*.yml`, `Procfile`, `Makefile`,
+ `Taskfile.yml`, `bin/`, framework CLIs, and the manifest's scripts/tasks.
+- **Framework** — read deps + entry imports: web (Rails/Sinatra/Lucky/Kemal,
+ Express/Nest/Next/Fastify, Django/Flask/FastAPI, Gin/Echo/chi/net-http,
+ Laravel/Symfony/Slim, Spring/Quarkus/Micronaut, actix/axum/rocket), plus
+ ORMs, template engines, queue/worker libs, and serializers — note each, they
+ steer finder selection.
+
+**Stack label decision:**
+
+- A single dominant app language → that language label.
+- **Polyglot:** label the language that owns the primary attack surface (the
+ network-facing app), note the others in `notes`. A thin shell of one language
+ around a core of another → label the core.
+- No buildable app, just a `Dockerfile`/compose stack to run → `generic-docker`.
+- The repo's PRIMARY artifact is CI/CD pipelines or IaC (GitHub Actions/GitLab
+ CI/Forgejo workflows, Terraform/Pulumi/CloudFormation, k8s/Helm, Ansible) with
+ no app to run → `ci-iac`. (Note: an app repo that ALSO has workflows is labeled
+ by its app language; `ci-iac` is for infra-/pipeline-primary repos.)
+
+Record `stack` (the label) and `frameworks` (list).
+
+## 2. Map the attack surface and trust boundaries
+
+Enumerate every place untrusted input crosses into the system, and the dangerous
+sinks it could reach. For each, capture `file:line`, the kind, and the untrusted
+source. This is the map every finder navigates — be exhaustive on surface,
+precise on location.
+
+- **HTTP routes/handlers** — every route table, controller, middleware, and
+ handler. Capture method, path, auth requirement, and which params/body/headers
+ flow in. Note dynamic/wildcard routes and catch-alls.
+- **CLIs / entrypoints** — `main`/`bin`, argv parsing, subcommands, scripts run
+ with attacker-influenced args or stdin.
+- **Message/queue consumers** — AMQP/Kafka/SQS/Redis/NATS/cron/webhook handlers;
+ the payload is untrusted input.
+- **Deserialization points** — `JSON.parse`/`Marshal`/`pickle`/`yaml.load`/
+ `ObjectInputStream`/`unserialize`/`serde`/MessagePack/protobuf over untrusted
+ bytes; framework auto-binding/mass-assignment.
+- **File/path operations** — reads/writes/joins/globs/zips/uploads/temp files
+ where any path segment is caller-controlled (traversal, symlink, zip-slip).
+- **Outbound network calls** — every server-side HTTP/DB/SMTP/DNS/socket call
+ whose destination or content can be influenced by a caller (SSRF surface).
+- **Auth/authz boundaries** — login, session/token issuance & validation,
+ role/permission checks, tenant isolation, the line between
+ unauthenticated/authenticated/admin. Mark which routes sit on which side.
+- **Secrets/config loading** — env vars, config files, secret managers, key
+ material, connection strings; note defaults and committed values.
+- **Template/HTML rendering** — server-rendered views, string-built HTML, SSTI-
+ capable engines, `dangerouslySetInnerHTML`/`html_safe`/`|safe`/`v-html`.
+- **Trust boundaries** — draw the line for each: where does data go from
+ trusted→untrusted or low-priv→high-priv? An input is only interesting if it
+ reaches a sink ACROSS a boundary.
+
+For each surface entry note any **sanitizer/validator/authz/parameterization**
+already on the path — the FP guard. A sink fronted by an effective control is
+not a lead; record it so later phases don't re-chase it.
+
+## 3. Select relevant finder classes (and justify skips)
+
+For each of the 14 classes, decide RELEVANT or SKIPPED based on the surface from
+step 2. The 14 classes (the workflow injects each one's full context downstream):
+
+`access-control · ssrf · injection · xss-ssti · auth-session · crypto ·
+deserialization · path-file · secrets · misconfig · supply-chain ·
+logging-errors · dos-redos · csrf-cors`
+
+A class is RELEVANT when its source AND its sink both exist in the surface map —
+e.g. ssrf needs a caller-influenced outbound call; injection needs untrusted
+input reaching a SQL/NoSQL/OS/LDAP/XPath interpreter; xss-ssti needs HTML/template
+rendering of untrusted data; auth-session needs the app to issue/validate
+sessions or tokens; deserialization needs untrusted bytes hitting a deserializer.
+Apply the same source-and-sink test to the rest. `secrets` and `misconfig` are
+near-always worth a quick pass. For `supply-chain`, only code-exploitable CI/CD
+and dependency issues count (per AGENTS.md) — posture/SBOM/maintainership is not.
+
+Output two lists. For every RELEVANT class, add a one-line **priority pointer**:
+the specific surfaces/files from step 2 that finder should hit first. For every
+SKIPPED class, add a one-line **justification** (why no reachable source→sink).
+Default to RELEVANT when uncertain — skipping is a claim you must back.
+
+## 4. Decide the dynamic-verification strategy
+
+Determine how Phase 6 will reproduce findings. Docker-first.
+
+- **Runnable?** Check for `Dockerfile`/`docker-compose*.yml` first (preferred,
+ hermetic), then a native run path (manifest scripts, `Procfile`, `Makefile`
+ targets, framework server command).
+- **Entry command + port** — the exact command that starts the app and the port
+ it binds (read it from config/compose/scripts, don't guess; note env vars and
+ dependent services — DB/cache/queue — needed to boot).
+- **Health check** — how to know it's up (a route, a log line, a port listen).
+- **Not runnable** (library, no server, missing deps, infra-only) → repro falls
+ back to a focused **unit-test** that drives the sink, or a **static PoC** /
+ build-only proof. Say which and why.
+
+Record `run_strategy` as one of:
+`docker-compose | docker | native | unit-test | static-poc`,
+plus the entry command, port, and any boot prerequisites in `notes`.
+
+## 5. Fold in target threat-model guidance
+
+Check for `<target>/.claude/claude-security-guidance.md`. If present, read it and
+fold its threat model into scope: crown-jewel assets, known trust boundaries,
+in/out-of-scope paths, prior findings, and any class-specific guidance. Let it
+RAISE priority and tighten scope; it does NOT lower the signal bar. Summarize the
+relevant points in `notes` and reflect any scope/priority changes in steps 2–4.
+If absent, note that and proceed with defaults.
+
+## 6. Signal discipline (binding — carry it into every later phase)
+
+Recon's selections directly gate noise. Enforce the AGENTS.md contract:
+
+- A class is RELEVANT only when there is a plausible REACHABLE path from
+ untrusted input to a dangerous sink with no effective control already on it.
+ No class earns a slot on defense-in-depth grounds alone.
+- No posture/process items (missing SECURITY.md, SBOM, license, maintainership).
+ These never gate a finder; at most they land in the report's Info appendix.
+- No style/lint nits, no unreachable/dead code, no speculative surfaces.
+- Prefer a tight scope that proves a few real issues over a broad scope that
+ drowns them. When you skip a class, you are asserting there is no reachable
+ source→sink — make that assertion only when the surface map backs it.
+
+## 7. Emit the recon summary
+
+Output exactly ONE structured object (this is the phase deliverable; later
+phases consume it). Shape:
+
+```json
+{
+ "stack": "<one normalized label from step 1>",
+ "frameworks": ["<framework>", "..."],
+ "run_strategy": "docker-compose | docker | native | unit-test | static-poc",
+ "entrypoints": [
+ { "kind": "http|cli|queue|cron|webhook", "ref": "file:line",
+ "detail": "GET /x | subcommand | consumer", "auth": "none|user|admin" }
+ ],
+ "attack_surface": [
+ { "kind": "route|cli|consumer|deser|file|outbound|authz|secret|template",
+ "ref": "file:line", "source": "<untrusted source>",
+ "sink": "<dangerous sink>", "existing_control": "<sanitizer/validator/authz on the path, or none>",
+ "classes": ["<class-key>"] }
+ ],
+ "relevant_classes": [
+ { "class": "<class-key>", "priority_surfaces": ["file:line", "..."] }
+ ],
+ "skipped_classes": [
+ { "class": "<class-key>", "reason": "<why no reachable source->sink>" }
+ ],
+ "notes": "run command + port + boot prereqs; polyglot/key rationale; target security-guidance points; blind spots/auth-gated areas; anything Phase 6 needs to boot the app"
+}
+```
+
+Rules for the object: `stack` is exactly one normalized label; `relevant_classes` +
+`skipped_classes` together cover all 14, no overlap; every `attack_surface` entry
+has a real `file:line`; `entrypoints` is the subset of surfaces where untrusted
+input first enters. Keep `notes` operational — it is the bridge to Phase 6.
diff --git a/plugins/security/prompts/report-template.md b/plugins/security/prompts/report-template.md
new file mode 100644
index 0000000..a39b2e8
--- /dev/null
+++ b/plugins/security/prompts/report-template.md
@@ -0,0 +1,94 @@
+
+
+# Security Audit — {{target}} @ {{ref}}
+
+**Scope:** {{paths_in_scope}} · **Out of scope:** {{paths_excluded}}
+**Commit:** {{commit}} · **Date:** {{date}} · **Method:** static + dynamic (isolated worktree, live PoC) · **Tool:** vuln-audit {{version}}
+
+## Summary
+
+{{2–4 sentences: overall posture and the single most important thing to fix
+first. Name the dominant risk theme. Weave the counts into the prose (e.g. "one
+critical and one high, both confirmed; one medium") — no status table, the
+GitHub scan issue is the live status.}}
+
+## Findings
+
+
+
+### [{{id}}] {{title}} · {{Severity}} · {{Confirmed|Likely}}
+
+**Class:** {{vuln_class}} · **Refs:** [{{CWE}}](https://cwe.mitre.org/data/definitions/{{n}}.html) · [{{A0x:2025}}](https://owasp.org/Top10/) · [ASVS {{Vx.y.z}}](https://github.com/OWASP/ASVS)
+**Location:** `{{file}}:{{line}}`{{ · +N other call sites}}
+
+{{2–4 sentences: the specific flaw, the untrusted source, the sink, and why the
+path is reachable (no effective sanitizer/authz). Senior audience — be direct.}}
+
+**PoC**
+```
+{{$ command that reproduced it}}
+{{observed output that proves impact}}
+```
+
+**Impact:** {{one line.}}
+**Proposed fix:** {{1–2 sentences — the high-level direction of the change needed
+and why (e.g. "resolve identity from a server-side session keyed by user id, not
+the client cookie"). NOT a diff or line-level patch — implementation is left to
+whoever picks up the issue.}}
+
+---
+
+## Lower severity (Medium)
+
+
+- `{{file:line}}` — {{one-line description}} — {{ref}} — **fix:** {{one-liner}}
+
+## Appendix
+
+### Low / Info
+| Location | Note | Ref |
+|----------|------|-----|
+| `{{file:line}}` | {{one line}} | {{ref}} |
+
+### Triage — not confirmed
+
+- `{{file:line}}` — {{candidate}} — **why unconfirmed:** {{refuted by verify / could not reproduce / needs prod-like data}}
+
+### Coverage & method
+- **Classes assessed:** {{list}} · **Skipped (not applicable):** {{list}}
+- **ASVS chapters touched:** {{list}}
+- **Dynamic verification:** {{how the target was built/run; what was and wasn't reproducible and why}}
+- **Tools used:** {{semgrep/gitleaks/trivy if present, else "LLM-native"}}
+- **Blind spots:** {{anything not reachable by this audit — auth-gated areas, external services, etc.}}
diff --git a/plugins/security/skills/audit/SKILL.md b/plugins/security/skills/audit/SKILL.md
new file mode 100644
index 0000000..fe83f68
--- /dev/null
+++ b/plugins/security/skills/audit/SKILL.md
@@ -0,0 +1,84 @@
+---
+description: >-
+ Run a white-box, dynamically-verified security audit (internal pentest) of a
+ target code repository. Use when the user asks to audit/pentest a repo for
+ vulnerabilities, find security bugs with proof, or runs /security:audit.
+ Produces a terse, senior-engineer report of proven findings with live PoCs and
+ a high-level proposed fix per finding.
+---
+
+# security:audit
+
+Drives the bundled `vuln-audit` workflow: recon → triage → consolidate → deep
+review → adversarial verify → dynamic PoC → report. Design spec and data
+contracts are in `${CLAUDE_PLUGIN_ROOT}/AGENTS.md`; output handling / issue
+tracking in `${CLAUDE_PLUGIN_ROOT}/docs/issue-tracking.md`. Read the spec before
+changing anything.
+
+## Input
+
+```
+/security:audit <target-path> [--no-dynamic] [--classes a,b,c] [--ref <git-ref>] [--out <dir>]
+```
+
+`$ARGUMENTS` holds the target path and any flags.
+
+- `<target-path>` — absolute path to the repo to audit (required).
+- `--no-dynamic` — skip the build/run/PoC phase (static + adversarial verify only).
+- `--classes` — restrict to specific vuln-class keys (see `AGENTS.md` taxonomy).
+- `--ref` — git ref to audit (default `HEAD`).
+- `--out` — writable directory for the output bundle (default: `/vuln-audit-reports`).
+
+## Steps
+
+1. **Parse `$ARGUMENTS`** into the target path + flags. The bundled tool root is
+ `${CLAUDE_PLUGIN_ROOT}` (expands to the plugin's install dir; it holds
+ `prompts/`, `workflows/`, and the docs — read-only).
+2. **Pick a writable `outDir`** — the plugin root is read-only/ephemeral, so the
+ bundle must go elsewhere. Use `--out` if given, else `/vuln-audit-reports`
+ (absolute path). This is also where a courier later fetches the bundle from.
+3. **Validate the target** — confirm it exists and is a git repo
+ (`git -C <target> rev-parse --git-dir`). Worktree isolation and the live-PoC
+ phase need git. If it isn't a repo, warn and proceed with `--no-dynamic`.
+ Resolve the ref to a concrete commit with
+ `git -C <target> rev-parse --short <ref>` so the run is pinned and
+ reproducible; carry both the ref name and the resolved SHA.
+4. **Preflight host capabilities** → assemble a `hostNotes` string: is `docker`
+ usable non-interactively (note if it needs `sudo`); which native runtimes are
+ present (`python3`, `node`, `ruby`, `go`, `crystal`, ...). If dynamic is on
+ but neither docker nor a usable native runtime exists, fall back to
+ `--no-dynamic` and say repro will be static/unit-test only.
+5. **Check target threat-model** — note whether
+ `<target>/.claude/claude-security-guidance.md` exists; recon folds it in.
+6. **Announce the run** — before invoking, print a one-line startup summary:
+ target name, the resolved commit (short SHA), and the absolute output
+ directory. Name the ref only when it isn't `HEAD` (e.g. `v1.2.0 a1b2c3d`);
+ for a plain `HEAD` run just show the SHA. Drop anything left at its default.
+7. **Invoke the workflow** (it runs in the background and notifies on completion):
+ ```
+ Workflow({ scriptPath: '${CLAUDE_PLUGIN_ROOT}/workflows/vuln-audit.js', args: {
+ toolRoot: '${CLAUDE_PLUGIN_ROOT}',
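+ outDir: '<absolute outDir>',
+ target: '<absolute target path>',
+ ref: '<ref>',
+ dynamic: <true|false>,
+ onlyClasses: <array of class keys, or null>,
+ scope: '<scope path(s); defaults to the target>',
+ hostNotes: '<hostNotes>'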
+ }})
+ ```
+8. **Present the result** — when it completes, read `report_path` and give a
+ tight summary: severity counts and the top 1–3 confirmed findings (title +
+ location + one-line impact). Point to the bundle dir; don't paste the whole
+ report. Surface anything that blocked dynamic verification.
+
+## Notes
+
+- High-signal is the contract: the workflow drops noise, posture/process items,
+ and unreachable findings on purpose. Don't reintroduce them in the summary.
+- The report's remediation is a **high-level proposed fix** (direction, not a
+ patch) — implementation is left to whoever takes the finding.
+- This is the deepest layer of defense-in-depth, complementing the in-session
+ security-guidance plugin, `/security-review`, and PR Code Review.
+- Authorized testing only: target repos you own or are explicitly cleared to
+ audit. All PoC traffic stays local; never fire exploits at external hosts.
diff --git a/plugins/security/workflows/vuln-audit.js b/plugins/security/workflows/vuln-audit.js
new file mode 100644
index 0000000..b7730d2
--- /dev/null
+++ b/plugins/security/workflows/vuln-audit.js
@@ -0,0 +1,246 @@
+export const meta = {
+ name: 'vuln-audit',
+ description: 'White-box, dynamically-verified security audit of a target repo (recon -> triage -> deep review -> adversarial verify -> dynamic PoC -> report)',
+ whenToUse: 'Invoked by the /vuln-audit skill. Runs a multi-phase, high-signal security audit and writes a senior-engineer report.',
+ phases: [
+ { title: 'Recon', detail: 'detect stack, map attack surface, pick run strategy' },
+ { title: 'Triage', detail: 'one finder per relevant vuln class' },
+ { title: 'Consolidate', detail: 'dedup + drop noise + stable IDs' },
+ { title: 'Review', detail: 'deep review: reachable source->sink with no mitigation' },
+ { title: 'Verify', detail: 'adversarial skeptic panel tries to refute' },
+ { title: 'Repro', detail: 'build + run + live PoC in an isolated worktree' },
+ { title: 'Report', detail: 'synthesize the senior-engineer report' },
+ ],
+}
+
+// ---- inputs (assembled by the skill) ----
+// The Workflow runtime may deliver `args` as a JSON string rather than a parsed
+// object; normalize so every input below is read from a real object.
+const A = (typeof args === 'string') ? JSON.parse(args) : (args && typeof args === 'object' ? args : {}) // a non-string, non-object value becomes {} and then fails the required-args check
+if (!A.toolRoot || !A.target) {
+ throw new Error(`vuln-audit: missing required args (toolRoot, target). Got keys: ${Object.keys(A).join(', ') || 'none'}`)
+}
+const TOOL = String(A.toolRoot).replace(/\/+$/, '') // this tool's repo (has prompts/, reports/)
+const TARGET = String(A.target).replace(/\/+$/, '') // repo under audit, trailing slashes stripped
+const REF = A.ref || 'HEAD' // git ref to audit
+const DYNAMIC = A.dynamic !== false // dynamic verification on by default
+const ONLY = (() => { // accept array, JSON-string-of-array, or comma-separated string
+ let v = A.onlyClasses
+ if (typeof v === 'string') { try { v = JSON.parse(v) } catch (_) { v = v.split(',') } } // not valid JSON -> treat as a comma-separated list
+ if (Array.isArray(v)) { const a = v.map(s => String(s).trim()).filter(Boolean); return a.length ? a : null } // trim entries, drop blanks; empty list -> null
+ return null // anything else means "no explicit class restriction"
+})()
+const TARGET_NAME = TARGET.split('/').pop() || 'target' // last path segment of the target
+const SCOPE = A.scope || TARGET
+const HOST = A.hostNotes || '' // host capability/constraint notes (e.g. "docker needs sudo; python3 native available")
+const OUT = A.outDir ? String(A.outDir).replace(/\/+$/, '') : TOOL // writable bundle output dir; defaults to toolRoot (standalone), but a plugin MUST pass a writable outDir — the plugin root is read-only/ephemeral
+
+// Vuln-class taxonomy — single source of truth (mirrors the AGENTS.md table).
+// `code` is the human-facing display-id token; owasp/cwe/asvs are the canonical
+// mapping injected into the generic finder prompt; `focus` scopes the hunt.
+const CLASS_META = { // keyed by class key (the values accepted by --classes)
+ 'access-control': { code: 'AC', title: 'Broken Access Control & IDOR', owasp: 'A01:2025', cwe: 'CWE-639/862/863/601', asvs: 'V8', focus: 'missing/incorrect authz, IDOR by object key, tenant isolation, open redirect' },
+ ssrf: { code: 'SSRF', title: 'Server-Side Request Forgery', owasp: 'A01:2025', cwe: 'CWE-918', asvs: 'V4', focus: 'caller-influenced destination of a server-side outbound request reaching internal/metadata endpoints' },
+ injection: { code: 'INJ', title: 'Injection (SQL/NoSQL/OS/LDAP/XPath)', owasp: 'A05:2025', cwe: 'CWE-89/78/943/90/74', asvs: 'V1/V2', focus: 'untrusted input crossing into the command/query structure with no parameterization/escaping/allowlist' },
+ 'xss-ssti': { code: 'XSS', title: 'XSS & Template Injection', owasp: 'A05:2025', cwe: 'CWE-79/1336/116', asvs: 'V1/V3', focus: 'untrusted data emitted into an HTML/JS context unescaped, or controlling template source (SSTI)' },
+ 'auth-session': { code: 'AUTH', title: 'Authentication & Session', owasp: 'A07:2025', cwe: 'CWE-287/384/620/640/521', asvs: 'V6/V7/V9/V10', focus: 'login/session/token issuance & validation, password reset, MFA, JWT/OAuth flaws' },
+ crypto: { code: 'CRYPTO', title: 'Cryptographic Failures', owasp: 'A04:2025', cwe: 'CWE-327/328/326/330/916/295', asvs: 'V11', focus: 'weak/broken algos, fast or unsalted password hashing, weak RNG for security, missing cert validation' },
+ deserialization: { code: 'DESER', title: 'Insecure Deserialization & Integrity', owasp: 'A08:2025', cwe: 'CWE-502/494/345', asvs: 'V2/V15', focus: 'untrusted bytes to a native/object deserializer; unverified code/data integrity' },
+ 'path-file': { code: 'PATH', title: 'Path Traversal & File Handling', owasp: 'A01:2025', cwe: 'CWE-22/98/73/434', asvs: 'V5', focus: 'caller-controlled path segment reaching a file op (traversal, RFI, zip-slip, dangerous upload)' },
+ secrets: { code: 'SEC', title: 'Hardcoded Secrets & Credentials', owasp: 'A02:2025', cwe: 'CWE-798/259/321/547', asvs: 'V14', focus: 'live credentials/keys/connection strings committed in code or config' },
+ misconfig: { code: 'MISC', title: 'Security Misconfiguration', owasp: 'A02:2025', cwe: 'CWE-16/614/942/1004/611', asvs: 'V13', focus: 'debug flags, permissive CORS, insecure cookie flags, exposed admin, XXE, default creds' },
+ 'supply-chain': { code: 'SUPPLY', title: 'Software Supply Chain & CI/CD', owasp: 'A03:2025', cwe: 'CWE-1104/1357/829/506', asvs: 'V15', focus: 'dangerous CI workflows (pull_request_target + untrusted checkout, script injection, over-broad tokens, unpinned actions), malicious lifecycle scripts, known-vuln/typosquat deps — code-exploitable only' },
+ 'logging-errors': { code: 'LOG', title: 'Logging, Error & Exception Handling', owasp: 'A09/A10:2025', cwe: 'CWE-532/209/755/703/396', asvs: 'V16', focus: 'sensitive data in logs, stack traces/state leaked in errors, fail-open exception handling, log injection' },
+ 'dos-redos': { code: 'DOS', title: 'Denial of Service & ReDoS', owasp: 'A06:2025', cwe: 'CWE-1333/400/770/834', asvs: 'V2', focus: 'user input to a catastrophic-backtracking regex, unbounded alloc/loop, decompression bomb, expensive parse' },
+ 'csrf-cors': { code: 'CSRF', title: 'CSRF, CORS & Clickjacking', owasp: 'A01:2025', cwe: 'CWE-352/1021/942', asvs: 'V3', focus: 'state-changing cookie-auth routes lacking CSRF defense, reflective/permissive CORS, missing framing protection' },
+}
+const ALL_CLASSES = Object.keys(CLASS_META) // the 14 class keys, in declaration order
+const CLASS_CODE = Object.fromEntries(Object.entries(CLASS_META).map(([k, v]) => [k, v.code])) // class key -> display code, e.g. 'ssrf' -> 'SSRF'
+// Stable dedup key across scans: a djb2-style (multiply by 33, XOR) hash of class|file|sink, rendered as
+// 8 hex digits. It depends only on the finding, so the VM and the courier derive the same value with no shared allocator.
+function fpHash(s) { let acc = 5381; let i = 0; while (i < s.length) { acc = ((acc * 33) ^ s.charCodeAt(i)) >>> 0; i += 1 } return acc.toString(16).padStart(8, '0') }
+function fingerprint(f) { const file = (f.file || '').toLowerCase(); const sink = (f.sink || '').toLowerCase(); return fpHash(`${f.vuln_class}|${file}|${sink}`) }
+
+const SIGNAL = 'SIGNAL DISCIPLINE: audience is senior engineers; stay high-signal. Only treat as real an issue with a REACHABLE path from untrusted input to a dangerous sink, with no effective sanitizer/validator/authz on the path. No style nits, no generic defense-in-depth without a concrete sink, no unreachable/dead code, no posture/process items. Prefer a few proven findings over many speculative ones.' // appended to the recon, finder, consolidate and deep-review prompts
+
+const LENSES = { // lens key -> instruction; the Verify stage runs one skeptic agent per key
+ exploitability: 'Can a real attacker trigger this with realistic access, and is the impact as claimed? If it needs implausible preconditions, refute.',
+ reachability: 'Is the sink actually reachable from untrusted input at runtime given routing, auth guards, and feature flags? If the path is gated or dead, refute.',
+ correctness: 'Is the technical claim accurate — is this API/pattern genuinely dangerous here, or has the code been misread (safe wrapper, parameterized, framework-escaped)? If misread, refute.',
+}
+
+// ---- schemas ----
+const FINDING_PROPS = { // known finding fields; FINDING still permits additional keys
+ id: { type: 'string' },
+ title: { type: 'string' },
+ vuln_class: { type: 'string' },
+ owasp: { type: 'string' },
+ cwe: { type: 'string' },
+ asvs: { type: 'string' },
+ severity: { enum: ['critical', 'high', 'medium', 'low', 'info'] },
+ status: { enum: ['confirmed', 'likely', 'triage'] }, // normalized by the workflow after verify/repro
+ confidence: { enum: ['low', 'medium', 'high'] },
+ file: { type: 'string' },
+ line: { type: 'integer' },
+ code_excerpt: { type: 'string' },
+ source: { type: 'string' },
+ sink: { type: 'string' },
+ data_flow: { type: 'string' },
+ sanitizers_checked: { type: 'string' },
+ rationale: { type: 'string' },
+ exploit_sketch: { type: 'string' },
+ dynamic_poc_plan: { type: 'string' },
+ proposed_fix: { type: 'string' },
+ locations: { type: 'array', items: { type: 'string' } }, // call sites collapsed into one finding by the consolidate step
+}
+const FINDING = { type: 'object', properties: FINDING_PROPS, required: ['title', 'vuln_class', 'severity', 'file', 'rationale'], additionalProperties: true } // only the core fields are required
+const FINDINGS = { type: 'object', properties: { findings: { type: 'array', items: FINDING } }, required: ['findings'], additionalProperties: true } // envelope used by the finder and consolidate agents
+// Matches the contract emitted by prompts/recon.md: `stack` is a label,
+// `run_strategy` is an enum string, `relevant_classes` is [{class, priority_surfaces}].
+const RECON = {
+ type: 'object',
+ properties: {
+ stack: { type: 'string' },
+ frameworks: { type: 'array', items: { type: 'string' } },
+ run_strategy: { enum: ['docker-compose', 'docker', 'native', 'unit-test', 'static-poc'] }, // the first three count as runnable for live repro
+ entrypoints: { type: 'array', items: { type: 'object', additionalProperties: true } },
+ attack_surface: { type: 'array', items: { type: 'object', additionalProperties: true } },
+ relevant_classes: { type: 'array', items: { type: 'object', properties: { class: { type: 'string' }, priority_surfaces: { type: 'array', items: { type: 'string' } } }, required: ['class'], additionalProperties: true } },
+ skipped_classes: { type: 'array', items: { type: 'object', additionalProperties: true } },
+ notes: { type: 'string' },
+ },
+ required: ['stack', 'run_strategy', 'relevant_classes'], // the fields the workflow reads to pick finders and the repro mode
+ additionalProperties: true,
+}
+const DEEP = { type: 'object', properties: { keep: { type: 'boolean' }, reject_reason: { type: 'string' }, finding: FINDING }, required: ['keep', 'finding'], additionalProperties: true } // deep-review result: keep/reject plus the finding
+const VERDICT = { type: 'object', properties: { lens: { type: 'string' }, refuted: { type: 'boolean' }, confidence: { enum: ['low', 'medium', 'high'] }, reasoning: { type: 'string' } }, required: ['refuted', 'reasoning'], additionalProperties: true } // one skeptic's vote for one lens
+const REPRO = { type: 'object', properties: { reproduced: { type: 'boolean' }, method: { enum: ['live-exploit', 'unit-test', 'build-only', 'static-poc'] }, environment: { type: 'string' }, setup_commands: { type: 'array', items: { type: 'string' } }, poc: { type: 'string' }, observed: { type: 'string' }, impact: { type: 'string' }, notes: { type: 'string' } }, required: ['reproduced', 'method'], additionalProperties: true } // dynamic-repro result; `reproduced` separates confirmed from likely
+const SYNTH = { type: 'object', properties: { report: { type: 'string' }, path: { type: 'string' }, stats: { type: 'object' } }, required: ['report'], additionalProperties: true } // presumably the Report-phase output — its use is outside this view
+
+const AGENT = { agentType: 'general-purpose' } // shared options spread into each agent(...) call below
+
+// ---- phase 1: recon ----
+phase('Recon')
+const recon = await agent( // single agent; structured output requested via the RECON schema
+ `Follow the recon instructions in ${TOOL}/prompts/recon.md. Read that file first, then perform PHASE-1 recon on the target repository at ${TARGET} (ref ${REF}). Use Read/Grep/Bash/ast-grep to inspect it. Output the structured recon summary.\nHOST CONSTRAINTS (factor into run_strategy — do not pick a strategy the host can't execute): ${HOST || 'none noted'}.\n${SIGNAL}`,
+ { label: 'recon', phase: 'Recon', schema: RECON, ...AGENT },
+)
+
+// Explicit --classes is authoritative; otherwise use recon's relevant set (objects -> keys).
+let classes
+if (ONLY) {
+ classes = ONLY.filter(c => ALL_CLASSES.includes(c))
+} else {
+ classes = (recon.relevant_classes || []).map(c => (typeof c === 'string' ? c : c && c.class)).filter(c => ALL_CLASSES.includes(c))
+ if (!classes.length) classes = ALL_CLASSES
+}
+const runnable = DYNAMIC && ['docker-compose', 'docker', 'native'].includes(recon.run_strategy)
+log(`recon: ${recon.stack} | strategy: ${recon.run_strategy} | classes: ${classes.join(', ')} | dynamic: ${runnable ? 'yes' : 'no'}`)
+
+// ---- phase 2: triage finders ----
+phase('Triage')
+const finderResults = (await parallel(classes.map(k => () => agent( // one finder agent per selected class
+ `Hunt the "${k}" vulnerability class (${CLASS_META[k].title}) in the target. FIRST read the finder method at ${TOOL}/prompts/finder.md and follow it. Class context — OWASP ${CLASS_META[k].owasp}, CWE ${CLASS_META[k].cwe}, ASVS ${CLASS_META[k].asvs}; focus: ${CLASS_META[k].focus}. Target: ${TARGET} (ref ${REF}). Prioritize these surfaces surfaced by recon: ${JSON.stringify((recon.attack_surface || []).slice(0, 40))}. Inspect code with Read/Grep/Bash/ast-grep. ${SIGNAL} Return {findings:[...]}; each candidate must fill source, sink, data_flow, sanitizers_checked, and set owasp/cwe/asvs from the class context. Return {findings:[]} if nothing real.`,
+ { label: `find:${k}`, phase: 'Triage', schema: FINDINGS, ...AGENT },
+)))).filter(Boolean) // drop finders that returned nothing
+const raw = finderResults.flatMap(r => (r && r.findings) || []) // flatten every finder's candidates into one list
+log(`triage: ${raw.length} raw candidates from ${classes.length} finders`)
+
+// ---- phase 3: consolidate (barrier: needs all candidates at once) ----
+phase('Consolidate')
+let consolidated = []
+if (raw.length) { // no candidates -> skip the agent call entirely
+ const c = await agent(
+ `You are the triage lead for a security audit of ${TARGET}. Raw candidate findings from per-class finders:\n${JSON.stringify(raw)}\n\nDeduplicate: collapse the same root cause across multiple call sites into ONE finding with a locations[] list. Assign stable ids by class (AC-1, SSRF-1, INJ-1, ...). Drop noise per the signal policy. Order by severity. ${SIGNAL} Return {findings:[...]}.`,
+ { label: 'consolidate', phase: 'Consolidate', schema: FINDINGS, ...AGENT },
+ )
+ consolidated = (c && c.findings) || [] // a missing/empty result leaves the list empty
+}
+log(`consolidated: ${consolidated.length} candidate findings`)
+
+// ---- phases 4-6: per-finding pipeline (deep review -> verify -> repro) ----
+const processed = consolidated.length ? await pipeline( // presumably each stage receives the previous stage's result — pipeline() comes from the runtime
+ consolidated,
+ // 4. deep review
+ (f) => agent(
+ `Deep-review this candidate against the target ${TARGET} (ref ${REF}). Finding:\n${JSON.stringify(f)}\n\nRead the surrounding code: the sink, its callers, any sanitizers/validators/authz on the path, and related files — as a careful reviewer would. Decide if there is a REACHABLE path from untrusted input to the sink with no effective mitigation. If it is a false positive, unreachable, mitigated, or out of scope, set keep=false with a short reject_reason. Otherwise keep=true and return the finding enriched with accurate severity, confidence, data_flow, sanitizers_checked, and a high-level proposed_fix (the DIRECTION of the change and why — not a diff or line-level patch; implementation is left to whoever takes the issue). ${SIGNAL}`,
+ { label: `review:${f.id || f.title}`, phase: 'Review', schema: DEEP, ...AGENT },
+ ),
+ // 5. adversarial verify (skeptic panel)
+ async (rev) => {
+ if (!rev || !rev.keep) return rev // missing or already rejected in deep review: skip the panel
+ const votes = (await parallel(Object.keys(LENSES).map(lens => () => agent(
+ `You are an INDEPENDENT security skeptic. Try to REFUTE this finding for target ${TARGET}, using the "${lens}" lens. Read the actual code to check. Finding:\n${JSON.stringify(rev.finding)}\n\nLens: ${LENSES[lens]}\nDefault to refuted=true if you cannot establish a concrete, reachable exploit. Return your verdict.`,
+ { label: `verify:${rev.finding.id || 'f'}:${lens}`, phase: 'Verify', schema: VERDICT, ...AGENT },
+ )))).filter(Boolean)
+ const refutes = votes.filter(v => v.refuted).length
+ return { ...rev, keep: refutes < 2, refuted: refutes >= 2, verdicts: votes } // dropped only when at least two verdicts refute
+ },
+ // 6. dynamic repro (only survivors, only if runnable)
+ async (rev) => {
+ if (!rev) return rev
+ if (!rev.keep || !runnable) return { ...rev, repro: null } // repro: null marks "not attempted"
+ const repro = await agent(
+ `Reproduce this finding dynamically against a RUNNING instance of the target, to prove it. Finding:\n${JSON.stringify(rev.finding)}\nRun strategy: ${recon.run_strategy}. Stack: ${recon.stack}; frameworks: ${JSON.stringify(recon.frameworks || [])}. Boot notes from recon: ${JSON.stringify(recon.notes || '')}.\nFollow the repro playbook at ${TOOL}/prompts/playbook.md. Create a git worktree of ${TARGET} at ${REF} so the original tree is untouched; build & run it (docker-first). Use a UNIQUE container name and an ephemeral host port keyed to "${rev.finding.id || 'f'}" to avoid collisions with parallel repros. Fire the PoC and capture the observed result as evidence. Keep ALL traffic local — no external targets, no real credentials, no exfiltration. Tear down containers/processes and the worktree when done. HOST CONSTRAINTS (honor when choosing how to run — e.g. if docker is unavailable, run the app natively instead): ${HOST || 'none noted'}. If it genuinely cannot run live, fall back to a unit-test or static PoC and set method accordingly. Return the repro result.`,
+ { label: `repro:${rev.finding.id || 'f'}`, phase: 'Repro', schema: REPRO, ...AGENT },
+ )
+ return { ...rev, repro }
+ },
+) : []
+
+const results = processed.filter(Boolean) // drop entries for which the pipeline produced no result
+
+// normalize status: confirmed (live repro) > likely (kept, no repro) > triage (rejected)
+const finalFindings = results.map(r => {
+ const f = { ...r.finding }
+ if (!r.keep) f.status = 'triage'
+ else if (r.repro && r.repro.reproduced) f.status = 'confirmed'
+ else f.status = 'likely'
+ const fp = fingerprint(f)
+ const display_id = `${TARGET_NAME}-${CLASS_CODE[f.vuln_class] || 'GEN'}-${fp.slice(0, 4)}` // provisional; courier swaps the suffix for the GitHub issue number
+ return { ...f, fp, display_id, kept: !!r.keep, reject_reason: r.reject_reason || null, verdicts: r.verdicts || null, repro: r.repro || null }
+})
+
+// tally
+const sevOrder = ['critical', 'high', 'medium', 'low', 'info']
+const counts = { bySeverity: {}, byStatus: { confirmed: 0, likely: 0, triage: 0 }, total: finalFindings.length }
+for (const s of sevOrder) counts.bySeverity[s] = 0
+for (const f of finalFindings) {
+ if (counts.bySeverity[f.severity] !== undefined) counts.bySeverity[f.severity]++
+ if (counts.byStatus[f.status] !== undefined) counts.byStatus[f.status]++
+}
+log(`results: ${counts.byStatus.confirmed} confirmed, ${counts.byStatus.likely} likely, ${counts.byStatus.triage} triage`)
+
+// ---- phase 7: synthesize the bundle (report.md + findings.json + manifest.json) ----
+phase('Report')
+const BUNDLE = `${OUT}/reports/${TARGET_NAME}`
+const synth = await agent(
+ `Produce the audit BUNDLE — the self-contained artifact a separate "courier" agent will fetch and file to GitHub. Create the directory ${BUNDLE}/ and write THREE files.
+
+SOURCE DATA — the finalized findings (each carries fp, display_id, severity, status, source/sink/data_flow, PoC via repro.observed, and proposed_fix):
+${JSON.stringify(finalFindings)}
+
+Severity tally: ${JSON.stringify(counts)}
+Recon: ${JSON.stringify({ stack: recon.stack, frameworks: recon.frameworks, run_strategy: recon.run_strategy, relevant_classes: classes, skipped: recon.skipped_classes })}
+
+1. ${BUNDLE}/report.md — the human report. FIRST read ${TOOL}/prompts/report-template.md and follow it EXACTLY. Use each finding's "display_id" as its [ID] in the headings. Target: ${TARGET} (ref ${REF}). Scope: ${SCOPE}. Fill {{commit}} with the resolved commit SHA (the same value you compute for manifest.json via git rev-parse) rendered BARE — no backticks — so GitHub auto-links it. Body = confirmed/likely Critical/High/Medium only; appendix = Low/Info + triage (with why) + coverage & method. Terse, senior-oriented; let CWE/OWASP-2025/ASVS refs carry the explanation; show PoC evidence for confirmed findings.
+
+2. ${BUNDLE}/findings.json — write the SOURCE DATA array above VERBATIM as JSON. Preserve every field and all PoC/observed/fix text exactly; do NOT summarize, reorder, or drop fields. This is the machine interface the courier reconciles against (keyed by "fp").
+
+3. ${BUNDLE}/manifest.json — a JSON object describing the scan. Get real values via Bash: \`date -u +%Y-%m-%dT%H:%M:%SZ\` for date; \`git -C ${TARGET} rev-parse HEAD\` for commit; \`git -C ${TARGET} remote get-url origin\` for the repo (normalize an SSH/HTTPS URL to "owner/repo"). Shape: { "tool": "vuln-audit", "schema": 1, "repo": "", "target_path": "${TARGET}", "ref": "${REF}", "commit": "", "slug": "${TARGET_NAME}", "date": "", "dynamic": ${runnable}, "classes_assessed": ${JSON.stringify(classes)}, "counts": ${JSON.stringify(counts)} }.
+
+Return {report: "", path: "${BUNDLE}/report.md", stats: ${JSON.stringify(counts)}}.`,
+ { label: 'synthesize', phase: 'Report', schema: SYNTH, ...AGENT },
+)
+
+return {
+ bundle_dir: BUNDLE,
+ report_path: (synth && synth.path) || `${BUNDLE}/report.md`,
+ findings_path: `${BUNDLE}/findings.json`,
+ manifest_path: `${BUNDLE}/manifest.json`,
+ report: synth && synth.report,
+ counts,
+ stack: recon.stack,
+ classes_assessed: classes,
+ runnable: !!runnable,
+}
]