diff --git a/.env.example b/.env.example new file mode 100644 index 000000000..7d053cd50 --- /dev/null +++ b/.env.example @@ -0,0 +1,111 @@ +# ───────────────────────────────────────────────────────────────────────────── +# REKALL — Environment Variables Template +# Copy this file to .env and fill in your values. +# Never commit .env to version control. +# +# Architecture: DB-free, flat-file vault (no PostgreSQL, no ChromaDB required) +# ───────────────────────────────────────────────────────────────────────────── + + +# ── LLM (required) ──────────────────────────────────────────────────────────── +# Groq — free tier at console.groq.com (no billing required) +# Used by all agents and the RLM REPL reasoning engine. +# +# Key rotation (recommended for heavy use / demo): +# Add up to 10 keys — the engine round-robins and skips 429'd keys automatically. +# Option A: numbered keys +GROQ_API_KEY=gsk_... +# GROQ_API_KEY_2=gsk_... +# GROQ_API_KEY_3=gsk_... +# Option B: comma-separated list (takes precedence if set) +# GROQ_API_KEYS=gsk_key1,gsk_key2,gsk_key3 + + +# ── Flat-file Vault ─────────────────────────────────────────────────────────── +# Path to the vault directory. 
+# Docker: /app/vault (mounted via docker-compose volume) +# Local dev: vault (relative to repo root — the default) +VAULT_PATH=vault + + +# ── Service URLs ────────────────────────────────────────────────────────────── +# Go backend — Python engine POSTs callback events here +GO_BACKEND_URL=http://localhost:8000 +# Python engine — Go backend triggers pipeline here +ENGINE_URL=http://localhost:8002 + + +# ── Go Backend ──────────────────────────────────────────────────────────────── +PORT=8000 +GIN_MODE=debug +CORS_ORIGINS=http://localhost:3000 + + +# ── Python Engine ───────────────────────────────────────────────────────────── +LOG_LEVEL=INFO + + +# ── Org Vault (optional) ────────────────────────────────────────────────────── +# Enable cross-repo shared memory (vault/org/ directory) +ORG_VAULT_ENABLED=false + + +# ── Simulation Sandbox (optional) ───────────────────────────────────────────── +# Run counterfactual dry-run between Fix and Governance agents +SIMULATION_ENABLED=false + + +# ── Integrations (optional) ─────────────────────────────────────────────────── +# Set INTEGRATIONS_ENABLED=true to activate Slack and Notion logging. +INTEGRATIONS_ENABLED=false + +# Slack — incoming webhook URL for governance and outcome notifications +SLACK_WEBHOOK_URL=https://hooks.slack.com/services/... + +# Notion — log every incident to a Notion database +NOTION_TOKEN=secret_... +NOTION_DATABASE_ID=... + + +# ── Frontend ────────────────────────────────────────────────────────────────── +# URL the browser uses to reach the Go backend API +NEXT_PUBLIC_API_URL=http://localhost:8000 + + +# ── GitHub Live PR Integration (production only) ────────────────────────────── +# In demo mode (default), PR creation is simulated — no GitHub API calls made. +# The orchestrator emits a "Pull request opened" trace event to the UI only. 
+#
+# Set GITHUB_LIVE_PR=true to enable real PR creation via PyGithub:
+#   pip install PyGithub (add to engine/requirements.txt)
+# The orchestrator's execute step will:
+#   1. git branch auto-fix-{incident_id[:8]}
+#   2. Commit fix_commands as .rekall/fix-{id}.sh
+#   3. Call repo.create_pull() and log the PR URL
+#
+# GITHUB_TOKEN: fine-grained PAT with repo → Contents (write) + Pull requests (write)
+GITHUB_TOKEN=ghp_...
+GITHUB_REPO=abjt01/sample-ci-sad
+# Demo mode is the default — set to true only for real PR creation.
+GITHUB_LIVE_PR=false
+
+
+# ── Minikube Sandbox (optional) ──────────────────────────────────────────────
+# When SANDBOX_ENABLED=true, REKALL deploys each fix into an ephemeral Minikube
+# namespace, runs the CI test suite, validates the fix works, and only THEN
+# raises the GitHub PR — with test evidence attached to the PR body.
+#
+# Requires: minikube + kubectl on PATH inside the engine container (or host).
+# Set to false for demo mode: sandbox is simulated (no real k8s calls).
+#
+SANDBOX_ENABLED=false
+
+# Minikube cluster settings (only used when SANDBOX_ENABLED=true)
+MINIKUBE_PROFILE=rekall       # minikube profile name
+MINIKUBE_CPUS=4               # CPUs allocated to Minikube
+MINIKUBE_MEMORY=8192          # MB of RAM for Minikube
+
+# Sandbox runtime limits
+SANDBOX_TIMEOUT=300           # Max seconds for sandbox validation
+
+# Valkey image used for Redis-dependent services inside the sandbox namespace
+SANDBOX_VALKEY_IMAGE=valkey/valkey:7.2-alpine
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..5c3b86828
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,63 @@
+# ─────────────────────────────────────────────
+# REKALL — .gitignore
+# ─────────────────────────────────────────────
+
+# Environment
+.env
+.env.local
+.env.*.local
+
+# ── Go ───────────────────────────────────────
+backend/bin/
+backend/coverage.out
+backend/coverage.html
+*.exe
+*.test
+.gocache/
+.gomodcache/
+
+# ── Python ───────────────────────────────────
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+.Python
+*.egg-info/ +dist/ +build/ +.eggs/ +.venv/ +venv/ +env/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +htmlcov/ +.coverage +*.cover + +# ── Node / Next.js ─────────────────────────── +frontend/node_modules/ +frontend/.next/ +frontend/out/ +frontend/.vercel/ +frontend/playwright-report/ +frontend/test-results/ +frontend/coverage/ + +# ── ChromaDB ───────────────────────────────── +chroma_data/ +.chroma/ + +# ── Docker ─────────────────────────────────── +*.log + +# ── OS ─────────────────────────────────────── +.DS_Store +Thumbs.db + +# ── IDE ────────────────────────────────────── +.idea/ +.vscode/ +*.swp +*.swo diff --git a/CONTEXT.txt b/CONTEXT.txt new file mode 100644 index 000000000..b79a42eeb --- /dev/null +++ b/CONTEXT.txt @@ -0,0 +1,1087 @@ +╔══════════════════════════════════════════════════════════════════════════════════╗ +║ R E K A L L — C O N T E X T ║ +║ Agentic CI/CD Failure Remediation System ║ +║ Last updated: 2026-04-16 ║ +╚══════════════════════════════════════════════════════════════════════════════════╝ + +This document is the canonical, highly-detailed reference for the REKALL system. +It covers: what it is, what every file does, how data flows, what is implemented, +what is planned, and architectural decisions made along the way. + +Keep this file updated whenever a new feature lands or a significant design +decision is made. It is read by the AI coding assistant at the start of every +new conversation to provide full project context. + +─────────────────────────────────────────────────────────────────────────────────── + 1. WHAT IS REKALL? +─────────────────────────────────────────────────────────────────────────────────── + +REKALL is an Agentic CI/CD Orchestration system. When a CI/CD pipeline (GitHub +Actions, GitLab CI, Jenkins) fails, REKALL: + + 1. Detects the failure (webhook or polling GitHub API) + 2. Diagnoses the root cause using an LLM-backed agent pipeline + 3. Searches a memory vault for known fixes (T1 → T2 → T3 fallback) + 4. 
Proposes a fix with confidence scoring + 5. Scores governance risk across 6 dimensions + 6. Decides: auto-apply, create PR immediately, or block for human review + 7. [UPCOMING] If block_await_human: deploy the proposed fix into a Minikube + sandbox environment, run the CI suite against it, validate the fix works, + THEN raise a final GitHub PR with full test evidence attached + 8. If human approves (or tests pass in sandbox): opens a real GitHub PR + 9. Learns from the outcome — updates vault confidence via reward signals + +REKALL surfaces everything in a real-time Next.js dashboard via SSE streaming. +Every agent step (monitor, diagnostic, fix, governance, execute, learning) emits +a "running → done/error" event that the UI renders as a live agent timeline. + +Built for the RNSIT Hackathon. Demo-capable on a laptop. Production-capable with +GITHUB_LIVE_PR=true and the upcoming Minikube sandbox integration. + +─────────────────────────────────────────────────────────────────────────────────── + 2. 
ARCHITECTURE OVERVIEW +─────────────────────────────────────────────────────────────────────────────────── + + ┌───────────────────────────────────────────────────────────────────────────┐ + │ GitHub / GitLab / Jenkins / Simulator │ + │ (webhook POST or polling via GITHUB_TOKEN) │ + └────────────────────────────────┬──────────────────────────────────────────┘ + │ POST /webhooks/github (or /simulate) + ▼ + ┌─────────────────────────────────────────────────────────────────────────┐ + │ Go Backend (Gin) port 8000 │ + │ │ + │ Handlers: │ + │ POST /webhooks/github — real webhook receiver │ + │ POST /webhooks/gitlab — GitLab CI webhook │ + │ POST /webhooks/simulate — demo simulator (5 scenarios) │ + │ POST /webhooks/trigger-github — manual: fetch latest GH failure │ + │ GET /incidents — list all incidents (newest first) │ + │ GET /incidents/:id — full incident detail │ + │ GET /stream/:id — SSE event stream for one incident │ + │ POST /incidents/:id/approve — human approves a blocked fix │ + │ POST /incidents/:id/reject — human rejects a blocked fix │ + │ GET /vault/entries — list vault entries │ + │ GET /vault/stats — vault summary metrics │ + │ GET /metrics — dashboard headline figures │ + │ GET /health — liveness probe │ + │ POST /internal/engine-callback — receives async events from engine │ + │ │ + │ Store: in-memory (sync.RWMutex map) + incidents.json persistence │ + │ SSE: per-incident event broker (fan-out to multiple browsers) │ + └───────────┬──────────────────────────────────────────────────────────────┘ + │ POST /pipeline/run-from-github (or /pipeline/run) + │ POST /pipeline/learn + │ POST /pipeline/create-pr + ▼ + ┌─────────────────────────────────────────────────────────────────────────┐ + │ Python Engine (FastAPI + uvicorn) port 8002 │ + │ │ + │ Endpoints: │ + │ POST /pipeline/run — start pipeline from raw payload │ + │ POST /pipeline/run-from-github — fetch GH failure & run pipeline │ + │ POST /pipeline/learn — LearningAgent outcome callback │ + │ POST 
/pipeline/create-pr — open GitHub PR (human-approved) │ + │ GET /health — liveness probe │ + │ │ + │ Agent Pipeline (rekall_engine/graph/orchestrator.py): │ + │ MonitorAgent → DiagnosticAgent → FixAgent → [SimulationAgent] → │ + │ GovernanceAgent → PublishGuardAgent → execute / block_await_human │ + │ → LearningAgent │ + │ │ + │ Async callback: POST http://backend:8000/internal/engine-callback │ + │ Event types: agent_log | status | fix_proposal │ + └───────────┬──────────────────────────────────────────────────────────────┘ + │ shared volume /app/vault/ + ▼ + ┌─────────────────────────────────────────────────────────────────────────┐ + │ Flat-File Vault (./vault/) │ + │ │ + │ vault/local/ — per-repo fix entries (JSON files) │ + │ vault/org/ — cross-repo shared memory (optional) │ + │ vault/incidents.json — persisted incident records │ + │ │ + │ NO PostgreSQL. NO ChromaDB. Pure JSON on disk. │ + │ Engine writes. Backend reads (read-only mount in Docker). │ + └─────────────────────────────────────────────────────────────────────────┘ + │ + │ SSE events → browser + ▼ + ┌─────────────────────────────────────────────────────────────────────────┐ + │ Next.js Frontend port 3000 │ + │ │ + │ Pages: │ + │ /dashboard — headline metrics + incident list │ + │ /incidents/[id] — live agent timeline + fix proposal + approve │ + │ │ + │ Key components: │ + │ agent-timeline.tsx — renders live step log with status badges │ + │ fix-proposal-card.tsx— shows fix description, tier, commands │ + │ approval-panel.tsx — approve / reject buttons + status │ + │ risk-gauge.tsx — 0–1 risk score visual │ + │ vault-explorer.tsx — browse vault entries │ + │ sidebar.tsx — navigation │ + └─────────────────────────────────────────────────────────────────────────┘ + +─────────────────────────────────────────────────────────────────────────────────── + 3. 
DIRECTORY STRUCTURE (annotated) +─────────────────────────────────────────────────────────────────────────────────── + +REKALL1/ +├── .env ← Real secrets (gitignored). DO NOT COMMIT. +├── .env.example ← Template. Safe to commit. +├── CONTEXT.txt ← THIS FILE. Canonical project reference. +├── DEMO_VS_PRODUCTION.md ← Explains demo vs real GitHub API paths. +├── HARDCODED.md ← Values that must change before real prod. +├── QUICKSTART.md ← Getting started guide. +├── Makefile ← make dev / make test / make seed / etc. +├── docker-compose.yml ← 3 services: backend, engine, frontend. +├── run.sh ← Convenience wrapper for docker compose. +├── pyrightconfig.json ← Python type checking config. +│ +├── backend/ ← Go (Gin) REST API + SSE broker +│ ├── Dockerfile +│ ├── go.mod / go.sum ← Module: github.com/rekall/backend +│ ├── Makefile +│ ├── cmd/server/main.go ← Entry point: wires all routes +│ └── internal/ +│ ├── config/config.go ← Reads env vars (ENGINE_URL, VAULT_PATH, etc.) +│ ├── engine/ +│ │ ├── client.go ← HTTP client for Python engine endpoints +│ │ └── client_test.go +│ ├── handlers/ +│ │ ├── approvals.go ← POST /approve, /reject → engine.Learn + engine.CreatePR +│ │ ├── callback.go ← POST /internal/engine-callback (agent_log|status|fix_proposal) +│ │ ├── incidents.go ← GET /incidents, GET /incidents/:id +│ │ ├── metrics.go ← GET /metrics +│ │ ├── stream.go ← GET /stream/:id (SSE) +│ │ ├── vault.go ← GET /vault/entries, /vault/stats +│ │ └── webhooks.go ← POST /webhooks/* + simulate + trigger-github +│ ├── middleware/ +│ │ ├── cors.go ← CORS (CORS_ORIGINS env var) +│ │ └── logger.go ← Structured request logging +│ ├── models/models.go ← All shared Go structs (Incident, FixProposal, etc.) 
+│ ├── sse/broker.go ← Fan-out SSE broker (one channel per incident ID) +│ ├── vault/reader.go ← Reads flat-file vault JSON from disk +│ └── store/store.go ← In-memory store; persists to incidents.json +│ +├── engine/ ← Python FastAPI engine container +│ ├── Dockerfile +│ ├── main.py ← FastAPI app, endpoints, pipeline dispatch +│ ├── requirements.txt ← groq, fastapi, uvicorn, httpx, PyGithub, etc. +│ └── pytest.ini +│ +├── rekall_engine/ ← Core AI agent library (importable package) +│ ├── __init__.py +│ ├── config.py ← EngineConfig dataclass (groq, vault, thresholds) +│ ├── types.py ← All shared Python types (dataclasses + Pydantic) +│ ├── agents/ +│ │ ├── base.py ← BaseAgent: call_llm(), shared Groq client +│ │ ├── monitor.py ← MonitorAgent: normalise raw webhook → FailureEvent +│ │ ├── diagnostic.py ← DiagnosticAgent: build DiagnosticBundle (LLM sig) +│ │ ├── fix.py ← FixAgent: T1→T2→T3 vault retrieval +│ │ ├── simulation.py ← SimulationAgent: counterfactual dry-run (optional) +│ │ ├── governance.py ← GovernanceAgent: risk score → decision +│ │ ├── publish_guard.py ← PublishGuardAgent: supply-chain safety check +│ │ ├── learning.py ← LearningAgent: update vault confidence +│ │ ├── repl.py ← RLM REPL: reasoning loop (iterative investigation) +│ │ └── rlm_engine.py ← RLM Zoom & Scan: deep log analysis +│ ├── graph/ +│ │ └── orchestrator.py ← Sequential pipeline runner (LangGraph-optional) +│ ├── integrations/ +│ │ ├── notion.py ← Notion database logger (optional, fire-and-forget) +│ │ └── slack.py ← Slack webhook notifier (optional, fire-and-forget) +│ └── vault/ +│ └── store.py ← Flat-file vault read/write (local/ and org/) +│ +├── frontend/ ← Next.js 15 + TypeScript + TailwindCSS +│ ├── app/ +│ │ ├── layout.tsx ← Root layout, font, theme provider +│ │ ├── page.tsx ← Redirects / → /dashboard +│ │ └── (app)/ +│ │ ├── dashboard/page.tsx ← Dashboard with headline stats +│ │ └── incidents/[id]/page.tsx ← Per-incident detail with live timeline +│ ├── components/ ← All 
UI components (see Section 7) +│ ├── lib/ +│ │ ├── api-client.ts ← All API calls (fetch wrapper) +│ │ ├── types.ts ← TypeScript type definitions +│ │ ├── utils.ts ← Shared utilities +│ │ └── hooks/ +│ │ ├── use-agent-stream.ts ← SSE subscription hook +│ │ └── use-incidents.ts ← Incident list polling hook +│ ├── __tests__/ ← Jest unit tests +│ └── e2e/ ← Playwright E2E tests +│ +├── scripts/ +│ ├── seed-vault.py ← Populate vault/local/ with sample entries +│ ├── simulate.sh ← Fire webhook simulate requests +│ ├── health-check.sh ← Check all 3 services are healthy +│ ├── test-all.sh ← Run Go tests + Python tests + Jest +│ ├── generate_context.py ← Regenerate CONTEXT.txt from file tree +│ └── dev-start.sh ← Start all services locally (no Docker) +│ +└── vault/ + ├── local/ ← Per-repo vault entries (JSON files) + └── org/ ← Cross-repo shared vault (optional) + +─────────────────────────────────────────────────────────────────────────────────── + 4. AGENT PIPELINE — DETAILED +─────────────────────────────────────────────────────────────────────────────────── + +The REKALL AI pipeline runs sequentially through the following agents. Each agent +receives a shared `state: Dict[str, Any]` dict and returns an updated version. +The pipeline is driven by `rekall_engine/graph/orchestrator.py::run_pipeline()`. 
+ +────────────────────────────────────────── + 4.1 MonitorAgent +────────────────────────────────────────── +File: rekall_engine/agents/monitor.py + +Receives: state["raw_webhook"] — raw dict from Go backend +Produces: state["failure_event"] (FailureEvent dataclass) + state["failure_object"] (FailureObject Pydantic model) + + - Validates the incoming payload against known schemas + - Normalises source-specific fields (GitHub Actions vs GitLab vs Jenkins) + - Extracts: incident_id, source, failure_type, log_excerpt, branch, commit_sha + - Maps raw failure_type strings to canonical values: + test | deploy | infra | security | oom | unknown + - If validation fails: marks failure_type as "unknown" and continues + - LLM call: brief prompt to classify if failure type is ambiguous + +────────────────────────────────────────── + 4.2 DiagnosticAgent +────────────────────────────────────────── +File: rekall_engine/agents/diagnostic.py + +Receives: state["failure_object"] +Produces: state["diagnostic_bundle"] (DiagnosticBundle dataclass) + + - Builds the context that will be used for vault lookup and fix generation + - Runs RLM Zoom & Scan (rlm_engine.py) on the full log to extract signal: + - Depth-0 scan: broad overview, 1024 tokens + - Depth-1 investigate: targeted deep-dive, 2048 tokens + - Produces a failure_signature: compact string like "infra:github_actions:oom" + Used as the primary key for vault lookup (substring and exact match first, + vector embedding fallback) + - LLM call: "compress this log into a 3-sentence context_summary for the UI" + - fields: failure_signature, log_excerpt (< 12000 chars), git_diff, context_summary + +────────────────────────────────────────── + 4.3 FixAgent +────────────────────────────────────────── +File: rekall_engine/agents/fix.py + +Receives: state["diagnostic_bundle"] +Produces: state["fix_proposal"] (FixProposal dataclass) + + Three-tier retrieval waterfall: + + T1 — Human vault (vault/local/*.json) + - Exact signature match 
first (O(1) string lookup) + - If no exact: substring/partial match on failure_signature + - Threshold: similarity_score >= 0.85 to use T1 + - Fast: no LLM call + + T2 — Synthetic vault (vault/local/ synthetic entries) + - Same lookup, lower threshold: >= 0.75 + - Entries created automatically when T3 fixes are promoted after success + + T3 — LLM first principles (RLM REPL) + - Called when vault is empty or below threshold + - repl.py runs an iterative reasoning loop: + Step 1: scan log → identify root cause candidates + Step 2: select top candidate → generate fix steps + Step 3: validate fix steps against log context + - Produces FixSuggestion with fix_commands (bash commands) + reasoning + - vault_candidates=0 when RLM REPL is used (no vault hit) + - The generated fix is stored as T3_llm fix proposal + + The FixProposal contains: + - tier: "T1_human" | "T2_synthetic" | "T3_llm" + - fix_description: human-readable summary + - fix_commands: list of bash commands + - fix_diff: optional unified diff string + - confidence: 0.0 – 1.0 (from vault entry or LLM estimate) + - reasoning: why this fix was selected + +────────────────────────────────────────── + 4.4 SimulationAgent (optional) +────────────────────────────────────────── +File: rekall_engine/agents/simulation.py +Controlled by: SIMULATION_ENABLED=true in .env + +Receives: state["fix_proposal"] +Produces: state["simulation_result"] + + - Counterfactual dry-run: "if this fix had been applied, would the pipeline + have passed?" 
+ - Currently a stub — raises NotImplementedError (non-fatal, pipeline continues) + - Planned: feed fix_commands into the Minikube sandbox (see Section 9) + +────────────────────────────────────────── + 4.5 GovernanceAgent +────────────────────────────────────────── +File: rekall_engine/agents/governance.py + +Receives: state["fix_proposal"], state["diagnostic_bundle"] +Produces: state["governance_decision"] (GovernanceDecision dataclass) + + Risk scoring — weighted sum of active factors: + Factor Weight Trigger condition + llm_generated 0.25 tier == T3_llm + low_confidence 0.20 fix.confidence < 0.50 + no_vault_history 0.15 vault_candidates == 0 + touches_secrets 0.30 fix_commands mention env vars / secret paths + production_branch 0.20 branch in {main, master, production, prod, release} + infra_failure 0.15 failure_type == "infra" + security_failure 0.30 failure_type == "security" + negative_reward 0.20 vault entry has negative cumulative reward + low_similarity 0.10 similarity_score < 0.80 + + Decision thresholds: + risk_score < 0.40 → auto_apply (safe: apply immediately) + risk_score < 0.70 → create_pr (moderate: open PR without human gate) + risk_score >= 0.70 → block_await_human (risky: wait for human approval) + + The governance decision is what drives the rest of the pipeline route. + risk_factors is a list of strings for the UI to display. + +────────────────────────────────────────── + 4.6 PublishGuardAgent +────────────────────────────────────────── +File: rekall_engine/agents/publish_guard.py + +Receives: state["fix_proposal"], state["governance_decision"] +Produces: state["publish_guard_flags"] + + Supply-chain safety gate. 
Scans fix_commands for dangerous patterns: + - curl | bash / wget | sh (unsandboxed remote execution) + - rm -rf (destructive) + - secret/key/token mutations in env vars + - npm install --global (global package mutation) + - Any command touching /etc/ or /root/ + + If flags are found AND governance decision is auto_apply or create_pr: + - Escalates decision to block_await_human (safety override) + - Adds flags to the governance decision risk_factors list + + This is a pure rule-based check — no LLM call. + +────────────────────────────────────────── + 4.7 Execute step (in orchestrator, not a separate agent) +────────────────────────────────────────── +File: rekall_engine/graph/orchestrator.py (lines ~187-260) + + Decision routing after PublishGuardAgent: + + auto_apply / create_pr: + - If GITHUB_LIVE_PR=true: create branch, commit fix script, open GitHub PR + - If GITHUB_LIVE_PR=false: emit "Pull request opened" trace (demo mode) + - Run LearningAgent immediately (no human in the loop) + - Pipeline → resolved + + block_await_human: + - CRITICAL: serialize the FixProposal and POST it to Go backend as a + "fix_proposal" callback event BEFORE posting the awaiting_approval status. + (This is what the Go Approve handler reads from store to build the CreatePR call.) + - Emit "Awaiting human review" trace to UI + - Set state["paused"] = True + - Pipeline pauses here — LearningAgent runs later via the callback + +────────────────────────────────────────── + 4.8 LearningAgent +────────────────────────────────────────── +File: rekall_engine/agents/learning.py + +Receives: state["outcome"] (Outcome dataclass), state["fix_proposal"] +Produces: (updates vault confidence in-place) + + Called in two contexts: + 1. Auto (after auto_apply / create_pr): outcome.result = "success" + 2. 
Human callback (after POST /approve or /reject): result = "success" or "rejected" + + Updates the vault entry's confidence using the reward signal: + - success → confidence += reward_step_size (default +1.0) + - rejected → confidence -= 0.5 + - failure → confidence -= 1.0, skip_count += 1 + + If confidence >= skip_threshold (-2.0 default) and fix was T3: promote to T2 + (synthetic entry created in vault). This is the "learning" part of REKALL's RL loop. + +────────────────────────────────────────── + 4.9 RLM Engine (Recursive Language Model) +────────────────────────────────────────── +Files: rekall_engine/agents/rlm_engine.py + rekall_engine/agents/repl.py + + The RLM is invoked when T1 and T2 vault miss (no known fix for this signature). + It is REKALL's "think from first principles" engine. + + Zoom & Scan: + - Segments the log into overlapping windows (8000 chars each) + - Runs Depth-0 scan on each window: "what went wrong here?" + - Ranks segments by failure signal confidence + - Top-ranked segment gets Depth-1 investigation: root cause + fix suggestion + - Token budget: 1024 (depth-0) / 2048 (depth-1) + + REPL loop (repl.py): + - Iterative reasoning: starts with scan result, refines over N steps + - Each step: LLM call → extract action → update context + - Terminates when fix steps are specific enough or step limit hit + - Returns FixSuggestion with reasoning trace (rlm_trace stored in FixProposal) + + RLM is rate-limited by Groq. Uses retry logic (1s → 10s backoff). + Groq 429 errors are normal under load — the retry loop handles them. + +─────────────────────────────────────────────────────────────────────────────────── + 5. 
DATA FLOW — END TO END +─────────────────────────────────────────────────────────────────────────────────── + + [A] Incident Created + POST /webhooks/simulate {"scenario": "postgres_refused"} + → webhooks.go creates Incident in store (status=processing) + → calls engine.RunFromGitHub() or engine.RunPipeline() + → 200 OK immediately (async) + + [B] SSE Stream Opens + Browser subscribes: GET /stream/{incident_id} + → SSE broker creates channel for this incident ID + → Browser receives events in real time + + [C] Agent Pipeline Runs (async in Python) + For each agent step: + → orchestrator emits AgentLogEntry to asyncio.Queue + → engine/main.py drains queue → POST /internal/engine-callback + → callback.go stores log in memory + publishes to SSE broker + → browser renders step badge in agent-timeline.tsx + + [D] Pipeline Pauses (block_await_human) + → orchestrator POSTs fix_proposal callback (NEW — see Section 6.1) + → callback.go calls store.UpsertFixProposal() + → orchestrator POSTs status=awaiting_approval callback + → callback.go calls store.UpdateIncidentStatus() + → browser shows "Awaiting human review" + Approve / Reject buttons + + [E] Human Approves + POST /incidents/{id}/approve + → approvals.go marks incident resolved in store + → logs "Fix approved by human" + → in a goroutine (background, 2-min timeout): + 1. engine.Learn() → POST /pipeline/learn → LearningAgent + 2. engine.CreatePR() → POST /pipeline/create-pr → _create_pr_async() + → Python creates GitHub branch + commits fix script + opens PR + → POSTs agent_log "execute/done: Pull request opened: " + → browser renders PR URL in agent timeline + + [F] Post-Approval SSE + The "execute" step timeline entry updates in real time as the goroutine + progresses. The SSE stream for this incident is still open (it wasn't + closed by the approval, only by a resolved/failed status event). 
+ + [G] Human Rejects + POST /incidents/{id}/reject + → approvals.go marks incident failed + → goroutine: engine.Learn() with result="rejected" + → No PR is created + +─────────────────────────────────────────────────────────────────────────────────── + 6. KEY FIXES APPLIED (session log) +─────────────────────────────────────────────────────────────────────────────────── + + 6.1 fix_proposal callback bridge (2026-04-16) ← MOST RECENT + ───────────────────────────────────────────────── + Problem: + The Go Approve handler called store.GetLatestFixProposal() → nil every time. + The fix_proposal existed only in Python in-memory state (orchestrator.py). + The callback handler only knew "agent_log" and "status" event types. + Since fix == nil, the engine.CreatePR() call was silently skipped entirely. + PRs were never raised after human approval. + + Fix: + engine/main.py (_run_pipeline_async): + After final_state = await pipeline_task, if paused=True: + Serialize fix_proposal fields → POST "fix_proposal" callback event to Go + BEFORE posting the awaiting_approval status event. + + backend/internal/handlers/callback.go: + Added fixProposalData struct. + Added "fix_proposal" switch case → deserializes → store.UpsertFixProposal(). + + Result: Approve → GetLatestFixProposal finds the fix → CreatePR fires → PR opens. + + 6.2 Approve handler goroutine context (prev session) + ────────────────────────────────────────────────────── + Problem: + engine.Learn() and engine.CreatePR() were called with c.Request.Context(). + The HTTP response is sent before the goroutine finishes. + Go cancels c.Request.Context() when the response is sent → calls failed. + + Fix: + Use context.WithTimeout(context.Background(), 2*time.Minute) for the goroutine. + This detaches from the request lifecycle entirely. + +─────────────────────────────────────────────────────────────────────────────────── + 7. 
FRONTEND COMPONENTS +─────────────────────────────────────────────────────────────────────────────────── + + agent-timeline.tsx + Renders the live pipeline step list. Subscribes to SSE via use-agent-stream.ts. + Each step has a status badge: running (spinner) | done (green) | error (red). + Steps: monitor → diagnostic → fix → governance → publish_guard → execute → learning. + + fix-proposal-card.tsx + Shows the FixProposal after FixAgent completes: + - Fix description + - Tier badge (T1 Human / T2 Synthetic / T3 LLM) + - Confidence bar (0–100%) + - Fix commands (code block) + - Reasoning text + + approval-panel.tsx + Rendered only when incident.status == "awaiting_approval". + Shows the governance risk score, risk factors, and Approve / Reject buttons. + On click: calls POST /incidents/:id/approve or /reject. + + risk-gauge.tsx + Animated semi-circle gauge showing risk_score (0.0–1.0). + Color: green < 0.40, yellow < 0.70, red >= 0.70. + + incident-card.tsx + Summary card on the dashboard list. Shows source, failure_type, status, age. + + vault-explorer.tsx + Paginated list of vault entries. Shows signature, confidence, tier, retrieval count. + + sidebar.tsx + Navigation: Dashboard | Incidents | Vault | RL Metrics. + + use-agent-stream.ts (hook) + EventSource subscription to GET /stream/:id. + Dispatches "agent_log" and "status" events to component state. + Handles reconnect on disconnect. + + use-incidents.ts (hook) + Polls GET /incidents every 5 seconds for the dashboard list. + + api-client.ts + Wraps all API calls. Base URL from NEXT_PUBLIC_API_URL. + Methods: getIncidents(), getIncident(), getVaultEntries(), getMetrics(), + approveIncident(), rejectIncident(), simulateIncident(). + +─────────────────────────────────────────────────────────────────────────────────── + 8. 
DATA MODELS +─────────────────────────────────────────────────────────────────────────────────── + + Go Models (backend/internal/models/models.go) + ────────────────────────────────────────────── + Incident {id, source, failure_type, raw_payload, status, created_at, updated_at} + DiagnosticBundle {id, incident_id, failure_signature, log_excerpt, git_diff, test_report, context_summary} + FixProposal {id, incident_id, tier, vault_entry_id, similarity_score, fix_description, fix_commands, + fix_diff, confidence, reasoning, rlm_trace, created_at} + GovernanceDecision{id, incident_id, risk_score, decision, risk_factors, created_at} + AgentLog {id, incident_id, step_name, status, detail, created_at} + VaultEntry {id, failure_signature, failure_type, fix_description, source, confidence, + retrieval_count, success_count, created_at, updated_at} + Outcome (request only) + ApprovalRequest {reviewed_by, notes} + + IncidentStatus values: processing | awaiting_approval | resolved | failed + FixTier values: T1_human | T2_synthetic | T3_llm + GovernanceDecision: auto_apply | create_pr | block_await_human + + Python Types (rekall_engine/types.py) + ────────────────────────────────────── + Section A (dataclasses — Go contract): + FailureEvent, DiagnosticBundle, VaultEntry, FixProposal, GovernanceDecision, + Outcome, AgentLogEntry + + Section B (Pydantic — AI layer): + FailureObject, VaultQuery, VaultCandidate, VaultResponse, FixDetail, + FixSuggestion (with to_fix_proposal() converter) + + Callback event types (POST /internal/engine-callback): + "agent_log" → {incident_id, step_name, status, detail} + "status" → {incident_id, status} + "fix_proposal" → {id, incident_id, tier, fix_description, fix_commands, + fix_diff, vault_entry_id, confidence, reasoning} + +─────────────────────────────────────────────────────────────────────────────────── + 9. 
MINIKUBE SANDBOX INTEGRATION (PLANNED — NOT YET IMPLEMENTED) +─────────────────────────────────────────────────────────────────────────────────── + + OVERVIEW + ───────── + Currently, when governance decides block_await_human, the pipeline pauses and + waits for a human to click Approve before any PR is raised. The fix is never + actually tested before the PR goes out. + + The Minikube sandbox changes this fundamentally: + + Instead of: detect → diagnose → fix → governance → PAUSE → human approve → PR + + Future flow: detect → diagnose → fix → governance → PAUSE → deploy to Minikube + → run CI test suite inside Minikube → validate fix works + → THEN raise a final GitHub PR with test evidence attached + + This makes REKALL move from "proposes fixes" to "proves fixes work before asking + for human review." The human approves a validated, tested fix — not just a + suggestion. + + MINIKUBE SANDBOX ARCHITECTURE + ─────────────────────────────── + The sandbox is a local Minikube cluster that mirrors the target repository's CI + environment. It is provisioned per-incident and torn down after validation. + + Key design decisions: + - NO Redis. The sandbox uses Valkey (the Redis-fork, Apache 2.0 licence) for + any queue/state needs within the sandbox environment. + - Valkey runs as a pod inside Minikube, not as a host service. + - The Python engine controls the sandbox via kubectl and the Kubernetes Python + client (kubernetes-client/python). + - Each sandbox is ephemeral: created for one incident_id, destroyed after tests. + + SANDBOX AGENT (planned — new agent) + ───────────────────────────────────── + File: rekall_engine/agents/sandbox.py (TO BE CREATED) + + Inputs: state["fix_proposal"], state["diagnostic_bundle"] + Outputs: state["sandbox_result"] {passed: bool, test_log: str, pr_evidence: str} + + Steps: + 1. 
Provision: + - minikube start (or reuse existing cluster) + - kubectl create namespace rekall-sandbox-{incident_id[:8]} + - helm install / kubectl apply the target repo's base manifests + - Deploy Valkey pod for any Redis-dependent services in the repo + + 2. Apply fix: + - Apply fix_commands as a ConfigMap or Job in the sandbox namespace + - If fix_diff is available: apply as a patch + + 3. Run CI: + - kubectl apply the CI job (GitHub Actions workflow translated to a + Kubernetes Job, or run the test runner directly as a Pod) + - Stream Pod logs back to the engine via kubectl logs --follow + - Post agent_log events to the Go callback in real time: + step_name="sandbox" status="running" detail="Running test suite..." + + 4. Collect evidence: + - Parse exit code of the test Job/Pod + - Collect test XML / stdout + - Build pr_evidence string: summary of pass/fail + key log lines + + 5. Teardown: + - kubectl delete namespace rekall-sandbox-{incident_id[:8]} + - (Optionally keep Minikube cluster running for next incident) + + 6. Result: + - sandbox_result.passed == True → proceed to PR creation + - sandbox_result.passed == False → block, notify human, do NOT create PR + (possibly retry with a revised fix via another RLM REPL round) + + ORCHESTRATOR CHANGES (planned) + ────────────────────────────── + In rekall_engine/graph/orchestrator.py, the block_await_human path becomes: + + block_await_human: + → [existing] POST fix_proposal callback to Go + → [NEW] run SandboxAgent + → if sandbox passes: auto-create PR with pr_evidence in body + → if sandbox fails: pause for human, report failure evidence + → POST status=awaiting_approval if still paused + + VALKEY (not Redis) — rationale + ────────────────────────────── + The Minikube sandbox environment uses Valkey instead of Redis because: + 1. Valkey is the open-source, Apache 2.0 fork of Redis (post SSPL) + 2. No licence compliance concerns in hackathon or production contexts + 3. 
API-compatible with Redis client libraries (no code changes needed) + 4. Runs as a lightweight pod inside Minikube with the official Valkey image: + image: valkey/valkey:7.2-alpine + 5. Used for: job queuing within the sandbox, short-lived state between + sandbox steps, pub/sub for streaming test log lines back to engine + + ENV VARS FOR SANDBOX (to be added to .env and docker-compose.yml) + ────────────────────────────────────────────────────────────────── + SANDBOX_ENABLED=true # Enable Minikube sandbox validation + MINIKUBE_PROFILE=rekall # Minikube profile name + MINIKUBE_CPUS=4 # CPUs allocated to Minikube + MINIKUBE_MEMORY=8192 # MB RAM for Minikube + SANDBOX_TIMEOUT=300 # Max seconds for sandbox validation + SANDBOX_VALKEY_IMAGE=valkey/valkey:7.2-alpine + + NEW DEPENDENCIES (engine/requirements.txt additions) + ────────────────────────────────────────────────────── + kubernetes # Python k8s client for cluster control + valkey # Valkey client (Redis-compatible API) + + THE FINAL PR BODY (planned) + ──────────────────────────── + When SandboxAgent passes, the GitHub PR body will include: + + ## 🤖 REKALL Auto-Fix (Sandbox Validated) + + **Incident ID:** `{incident_id}` + **Fix tier:** `{fix_tier}` + **Sandbox validation:** ✅ PASSED ({N} tests, 0 failures) + + ### What happened + {context_summary from DiagnosticAgent} + + ### Fix applied + {fix_description} + + ```bash + {fix_commands} + ``` + + ### Test Evidence (Minikube sandbox) + ``` + {pr_evidence — key test output lines} + ``` + + *Fix was automatically validated in a Minikube sandbox before this PR was opened.* + *Auto-generated by REKALL. Approved by: human | sandbox-validated.* + +─────────────────────────────────────────────────────────────────────────────────── + 10. 
ENVIRONMENT VARIABLES — COMPLETE REFERENCE +─────────────────────────────────────────────────────────────────────────────────── + + Required: + GROQ_API_KEY Groq API key (console.groq.com, free tier) + GITHUB_TOKEN GitHub fine-grained PAT (Contents+PRs write) + GITHUB_REPO Target repo slug: "owner/repo" + + Service connectivity: + GO_BACKEND_URL Python engine → Go callback URL (default: http://backend:8000) + ENGINE_URL Go backend → Python engine URL (default: http://engine:8002) + PORT Go backend port (default: 8000) + CORS_ORIGINS Allowed CORS origins (default: http://localhost:3000) + GIN_MODE "debug" | "release" + NEXT_PUBLIC_API_URL Browser → Go backend URL (default: http://localhost:8000) + + Vault: + VAULT_PATH Path to vault dir. Docker: /app/vault. Local: vault + ORG_VAULT_ENABLED Enable vault/org/ cross-repo memory (default: false) + + AI & LLM: + LOG_LEVEL Python engine log level (default: INFO) + GROQ_MODEL Groq model name (default: llama-3.3-70b-versatile) + + GitHub PR: + GITHUB_LIVE_PR "true" = create real PRs. "false" = demo trace only. + + Simulation: + SIMULATION_ENABLED Run SimulationAgent between Fix and Governance (default: false) + + Integrations: + INTEGRATIONS_ENABLED Enable Slack + Notion logging (default: false) + SLACK_WEBHOOK_URL Slack incoming webhook URL + NOTION_TOKEN Notion API secret + NOTION_DATABASE_ID Notion database ID for incident logging + + Minikube Sandbox (PLANNED): + SANDBOX_ENABLED Enable Minikube sandbox validation (default: false) + MINIKUBE_PROFILE Minikube profile (default: rekall) + MINIKUBE_CPUS CPUs for Minikube (default: 4) + MINIKUBE_MEMORY MB RAM for Minikube (default: 8192) + SANDBOX_TIMEOUT Max sandbox validation time in seconds (default: 300) + SANDBOX_VALKEY_IMAGE Valkey image for sandbox (default: valkey/valkey:7.2-alpine) + +─────────────────────────────────────────────────────────────────────────────────── + 11. 
DOCKER COMPOSE SERVICES +─────────────────────────────────────────────────────────────────────────────────── + + engine (Python FastAPI) port 8002 → 8002 + - Dockerfile: engine/Dockerfile + - Volume: ./vault → /app/vault (owns writes) + - Env: GROQ_API_KEY, GO_BACKEND_URL, VAULT_PATH, LOG_LEVEL, integrations + - Healthcheck: HTTP GET localhost:8002/health every 15s + + backend (Go Gin) port 8000 → 8000 + - Dockerfile: backend/Dockerfile + - Volume: ./vault → /app/vault:ro (read-only — engine owns writes) + - Env: ENGINE_URL, VAULT_PATH, CORS_ORIGINS, GIN_MODE, PORT + - depends_on: engine (healthy) + - Healthcheck: wget localhost:8000/health every 10s + + frontend (Next.js) port 3000 → 3000 + - Dockerfile: frontend/Dockerfile + - Env: NEXT_PUBLIC_API_URL, NODE_ENV=production + - depends_on: backend (healthy) + + NO postgres. NO chromadb. NO redis. Flat files only (+ Valkey in sandbox). + +─────────────────────────────────────────────────────────────────────────────────── + 12. VAULT SYSTEM +─────────────────────────────────────────────────────────────────────────────────── + + The vault is a directory of JSON files. No vector database, no SQL. + + vault/local/ + One JSON file per vault entry. Filename: {failure_signature}.json + Schema: + { + "id": "uuid", + "failure_signature": "infra:github_actions:oom", + "failure_type": "oom", + "fix_description": "...", + "fix_commands": ["kubectl delete pod ..."], + "fix_diff": null, + "confidence": 0.85, + "retrieval_count": 12, + "success_count": 10, + "source": "human", + "created_at": "2026-04-01T00:00:00Z", + "reward_score": 2.0, + "skip_count": 0 + } + + vault/org/ + Cross-repo shared vault. Same schema. Enabled by ORG_VAULT_ENABLED=true. + + vault/incidents.json + Persisted snapshot of the Go in-memory store. Loaded on startup, saved on + each incident update. Allows incidents to survive backend restarts. + + Vault lookup strategy (FixAgent): + 1. Exact match on failure_signature + 2. 
Substring match (signature contains failure_signature or vice versa) + 3. No match → T3 LLM (RLM REPL) + + Confidence update (LearningAgent): + On success: entry.confidence += 0.1 (capped at 1.0) + On reject: entry.confidence -= 0.05 (floored at 0.0) + On failure: entry.confidence -= 0.1 + Decay: 0.995 per day (applied on retrieval) + + Ranker (FixAgent): + Entries with reward_score < -2.0 are excluded from T1/T2 retrieval. + Excluded entries are shown in the UI as "skipped by RL ranker". + +─────────────────────────────────────────────────────────────────────────────────── + 13. SSE (SERVER-SENT EVENTS) PROTOCOL +─────────────────────────────────────────────────────────────────────────────────── + + Every incident gets a dedicated SSE channel via GET /stream/{incident_id}. + The channel is created on first subscription and closed when the incident + reaches a terminal state (resolved or failed). + + Event format (text/event-stream): + data: {"type":"agent_log","data":{...}}\n\n + data: {"type":"status","data":{"status":"resolved"}}\n\n + data: [DONE]\n\n ← terminal sentinel + + agent_log payload: + {incident_id, step_name, status, detail, created_at} + + status payload: + {status: "processing" | "awaiting_approval" | "resolved" | "failed"} + + Internal flow: + engine/main.py → POST /internal/engine-callback + → callback.go stores in memory + sse.Broker.Publish() + → broker fans out to all listening EventSource clients for that incident + + The broker has a per-channel buffer (default 100 events) to handle slow clients. + Channels are garbage-collected after a short TTL following terminal state. + +─────────────────────────────────────────────────────────────────────────────────── + 14. 
WEBHOOK SIMULATOR (DEMO TOOL) +─────────────────────────────────────────────────────────────────────────────────── + + POST /webhooks/simulate {"scenario": "..."} + + Available scenarios (hardcoded in webhooks.go): + postgres_refused — Database connection refused at startup + oom_kill — Container OOM-killed (exit code 137) + test_failure — JUnit test suite failure + secret_leak — GitGuardian-style secret leak detection + image_pull_backoff — Docker image pull failure (ImagePullBackOff) + + Each scenario includes a realistic log_excerpt + git_diff + metadata. + The payload is fed directly to the Python engine as if it came from a real webhook. + This is the primary demo mode — no GitHub API calls, instant, reproducible. + + POST /webhooks/trigger-github + Fetches the latest real failed GitHub Actions run from GITHUB_REPO and runs + the full pipeline on real data. Requires GITHUB_TOKEN. + +─────────────────────────────────────────────────────────────────────────────────── + 15. GOVERNANCE DECISION LOG — EXAMPLE +─────────────────────────────────────────────────────────────────────────────────── + + From the logs (2026-04-16): + + [governance] incident=f385cd09 risk=0.90 decision=block_await_human + factors=['llm_generated', 'no_vault_history', 'touches_secrets', 'production_branch'] + + Breakdown: + llm_generated: +0.25 (T3_llm fix — no vault hit) + no_vault_history: +0.15 (vault_candidates=0) + touches_secrets: +0.30 (fix commands reference env vars) + production_branch: +0.20 (branch=main) + ───────────────────────── + Total risk: 0.90 → block_await_human ✓ + + [publish_guard] passed — no supply-chain flags (the fix commands were clean) + Decision stands: block_await_human → human approval required → goroutine → PR + +─────────────────────────────────────────────────────────────────────────────────── + 16. KNOWN ISSUES / LIMITATIONS (current) +─────────────────────────────────────────────────────────────────────────────────── + + 1. 
In-memory store is NOT durable across backend restarts (incidents.json is + saved but the FixProposal is not serialized to that file yet). + If backend restarts between pipeline completion and human approval, + store.GetLatestFixProposal() returns nil → PR skipped. + FIX NEEDED: serialize FixProposal into incidents.json on UpsertFixProposal(). + + 2. Groq rate limiting (429) during heavy pipeline runs delays the RLM REPL. + The retry logic (1s → 10s backoff) handles it but introduces latency. + MITIGATION: use llama-3.1-8b-instant (faster, lower TPM usage) for scans. + + 3. The engine container takes ~30s to restart (Python dependency import time). + Backend wait: depends_on engine healthy. If engine is restarting, + new incidents are queued until it's back. + + 4. The duplicate log line issue in docker-compose output: + Each engine log line appears twice. This is because uvicorn's root logger + and the named rekall.* logger both emit. Not a bug — just cosmetic noise. + FIX OPTION: set uvicorn access log to disabled and use rekall.* only. + + 5. The SimulationAgent raises NotImplementedError — this is intentional. + It is a stub pending the Minikube sandbox implementation (see Section 9). + + 6. GITHUB_TOKEN and GROQ_API_KEY are committed in .env (gitignored locally). + For the hackathon this is acceptable. Rotate all keys before any public demo. + +─────────────────────────────────────────────────────────────────────────────────── + 17. 
MAKEFILE TARGETS +─────────────────────────────────────────────────────────────────────────────────── + + make dev Start all services locally (no Docker) + make docker docker compose up --build + make down docker compose down + make test Run all tests (Go + Python + Jest) + make test-go Go tests only + make test-py Python tests only + make test-js Jest tests only + make seed Seed vault with sample entries (scripts/seed-vault.py) + make simulate Fire all 5 simulator scenarios + make health Check all 3 services are running + make context Regenerate CONTEXT.txt (scripts/generate_context.py) + make logs docker compose logs --follow + +─────────────────────────────────────────────────────────────────────────────────── + 18. TESTING STRATEGY +─────────────────────────────────────────────────────────────────────────────────── + + Go (backend/): + go test ./... + Tests cover: webhook handler, SSE broker, engine client mock, store operations + + Python (rekall_engine/): + pytest engine/ rekall_engine/ + Tests cover: individual agents, vault reader, orchestrator routing + + TypeScript (frontend/): + Jest unit tests: components (agent-timeline, fix-proposal-card, incident-card, + risk-gauge), hooks, api-client + Playwright E2E: dashboard, vault, RL metrics pages + + CI (sample-ci-sad repo): + Target repo for demo. Intentionally has failing workflows. + REKALL watches it via GITHUB_REPO=abjt01/sample-ci-sad. + +─────────────────────────────────────────────────────────────────────────────────── + 19. INTEGRATIONS (OPTIONAL) +─────────────────────────────────────────────────────────────────────────────────── + + Slack (rekall_engine/integrations/slack.py) + Sends a message to SLACK_WEBHOOK_URL when: + - Governance blocks (block_await_human): "@team PR ready for review" + - Incident resolved: "Fix applied: {description}" + - Incident failed: "Fix failed: {description}" + Fire-and-forget: failure logged but never raises. 
+ + Notion (rekall_engine/integrations/notion.py) + Logs every incident to a Notion database (NOTION_DATABASE_ID). + Fields: Incident ID, Source, Failure Type, Status, Risk Score, Decision, + Fix Tier, Confidence, Fix Description, Created At. + Fire-and-forget: failure logged but never raises. + + Both integrations are disabled by default (INTEGRATIONS_ENABLED=false). + Enable with INTEGRATIONS_ENABLED=true + respective secrets. + +─────────────────────────────────────────────────────────────────────────────────── + 20. THINGS THAT MUST NOT CHANGE WITHOUT THOUGHT +─────────────────────────────────────────────────────────────────────────────────── + + 1. The failure_signature format ("source:system:type" e.g. "infra:github_actions:oom") + is the primary vault lookup key. Changing the format breaks all existing vault + entries. If you change it, you must migrate all vault JSON files. + + 2. The callback event types ("agent_log", "status", "fix_proposal") are a + protocol between Python and Go. Adding or removing fields is safe (JSON ignores + unknowns). Renaming event type strings breaks the callback handler switch. + + 3. The Groq model (llama-3.3-70b-versatile) affects RLM reasoning quality. + Downgrading to 8b changes fix quality noticeably. Don't change without testing. + + 4. The governance threshold values (0.40, 0.70) directly control how often + REKALL sends fixes without human review. Raising 0.70 → 0.80 means fewer + human reviews. Lowering 0.40 → 0.30 means fewer auto-applies. Think about + this carefully before changing. + + 5. store.UpsertFixProposal() returns ErrNotFound if the incident doesn't exist. + The fix_proposal callback must arrive AFTER the incident is created in store. + The ordering guarantee: Go creates the incident before calling RunPipeline(), + so the incident always exists by the time the callback arrives. + + 6. The SSE broker closes the incident channel on receiving status=resolved or + status=failed. 
Do NOT send these statuses until all agent_log callbacks are + complete, or the browser will miss the final log entries. + +─────────────────────────────────────────────────────────────────────────────────── + 21. ROADMAP (priority order) +─────────────────────────────────────────────────────────────────────────────────── + + P0 (critical — must fix before demo) + [ ] Persist FixProposal in incidents.json so it survives backend restarts + [ ] Rotate GITHUB_TOKEN and GROQ_API_KEY before public demo + + P1 (Minikube sandbox — the next big feature) + [ ] SandboxAgent (rekall_engine/agents/sandbox.py) + [ ] Minikube provisioning + cleanup logic + [ ] Valkey pod deployment in sandbox namespace + [ ] CI job translation: GitHub Actions → Kubernetes Job + [ ] Streaming sandbox logs to engine callback + [ ] PR body with test evidence + [ ] SANDBOX_ENABLED env var + orchestrator routing change + [ ] SandboxResult as new callback event type + + P2 (quality of life) + [ ] Deduplicate uvicorn log output (uvicorn access_log=False) + [ ] Move governance risk weights to env/config (no code edit to tune) + [ ] Make PROD_BRANCH_PATTERNS configurable (env var) + [ ] Fix duplicate log lines in docker-compose output + [ ] Add fix_proposal to incidents.json serialization (durability fix) + + P3 (production hardening) + [ ] Rate limit webhook endpoint (prevent spam) + [ ] Webhook signature verification (GITHUB_WEBHOOK_SECRET) + [ ] Org vault sync (cron or webhook-triggered) + [ ] Multi-repo support (per-repo vault namespacing) + [ ] Metrics: MTTR (mean time to recovery), fix success rate over time + + P4 (demo polish) + [ ] RL Metrics page: reward history chart per vault entry + [ ] Vault confidence decay visualization + [ ] Sandbox step in agent timeline UI + [ ] Show PR URL as clickable link in agent timeline after creation + +═══════════════════════════════════════════════════════════════════════════════════ + END OF CONTEXT 
+═══════════════════════════════════════════════════════════════════════════════════ \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..027aa992a --- /dev/null +++ b/Makefile @@ -0,0 +1,82 @@ +.PHONY: setup dev build test lint clean docker-up docker-down \ + db-migrate seed simulate health-check + +# ───────────────────────────────────────────────────────────────────────────── +# REKALL — Root Makefile +# Usage: make +# ───────────────────────────────────────────────────────────────────────────── + +## setup: first-time bootstrap of the entire dev environment +setup: + @./scripts/setup.sh + +## dev: start all services in development mode (Go + Python + Next.js) +dev: + @./scripts/dev.sh + +## seed: seed 5 human vault entries for demo scenarios +seed: + @VAULT_PATH=vault python3 scripts/seed-vault.py + +## vault-reset: delete all vault entries and re-seed +vault-reset: + @rm -rf vault/local/ vault/org/ vault/episodes.json + @VAULT_PATH=vault python3 scripts/seed-vault.py + +## simulate: inject a simulated failure (default: postgres_refused) +## Usage: make simulate SCENARIO=oom_kill +simulate: + @./scripts/simulate.sh $(or $(SCENARIO),postgres_refused) + +## health: check all service health endpoints +health: + @./scripts/health-check.sh + +## test: run the full test suite (Go + Python + frontend unit) +test: + @./scripts/test-all.sh + +## test-go: run only the Go backend tests +test-go: + @cd backend && go test -race ./... + +## test-py: run only the Python engine tests +test-py: + @cd engine && python3 -m pytest tests/ -v + +## test-fe: run only the frontend unit tests +test-fe: + @cd frontend && npm test -- --passWithNoTests --ci + +## test-e2e: run Playwright end-to-end tests +test-e2e: + @cd frontend && npx playwright test + +## lint-go: run Go linter +lint-go: + @cd backend && go vet ./... 
&& echo "go vet passed" + +## build-go: compile the Go binary +build-go: + @cd backend && make build + +## docker-up: build and start all containers +docker-up: + @docker compose up --build -d + +## docker-down: stop and remove all containers +docker-down: + @docker compose down + +## docker-logs: tail logs from all containers +docker-logs: + @docker compose logs -f + +## clean: remove build artefacts +clean: + @cd backend && rm -rf bin/ coverage.out coverage.html + @cd frontend && rm -rf .next out node_modules/.cache playwright-report + +help: + @echo "REKALL available targets:" + @grep -E '^## ' $(MAKEFILE_LIST) | sed 's/## / make /' | column -t -s ':' diff --git a/README.md b/README.md deleted file mode 100644 index c5c886b3e..000000000 --- a/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# HackToFuture 4.0 — Template - -Welcome to your official HackToFuture 4 repository. - -This repository template will be used for development, tracking progress, and final submission of your project. Ensure that all work is committed here within the allowed hackathon duration. - ---- - -### Instructions for the teams: - -- Fork the Repository and name the forked repo in this convention: hacktofuture4-team_id (for eg: hacktofuture4-A01) - ---- - -## Rules - -- Work must be done ONLY in the forked repository -- Only Four Contributors are allowed. -- After 36 hours, Please make PR to the Main Repository. A Form will be sent to fill the required information. -- Do not copy code from other teams -- All commits must be from individual GitHub accounts -- Please provide meaningful commits for tracking. -- Do not share your repository with other teams -- Final submission must be pushed before the deadline -- Any violation may lead to disqualification - ---- - -# The Final README Template - -## Problem Statement / Idea - -Clearly describe the problem you are solving. - -- What is the problem? -- Why is it important? -- Who are the target users? 
- ---- - -## Proposed Solution - -Explain your approach: - -- What are you building? -- How does it solve the problem? -- What makes your solution unique? - ---- - -## Features - -List the core features of your project: - -- Feature 1 -- Feature 2 -- Feature 3 - ---- - -## Tech Stack - -Mention all technologies used: - -- Frontend: -- Backend: -- Database: -- APIs / Services: -- Tools / Libraries: - ---- - -## Project Setup Instructions - -Provide clear steps to run your project: - -```bash -# Clone the repository -git clone - -# Install dependencies -... - -# Run the project -... -``` diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 000000000..0c67c48d7 --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,23 @@ +# ── Build stage ────────────────────────────────────────────────────────────── +FROM golang:1.22-alpine AS builder + +RUN apk add --no-cache git ca-certificates + +WORKDIR /app + +COPY go.mod go.sum ./ +RUN go mod download + +COPY . . +RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /rekall-backend ./cmd/server + +# ── Runtime stage ───────────────────────────────────────────────────────────── +FROM alpine:3.20 + +RUN apk add --no-cache ca-certificates wget + +COPY --from=builder /rekall-backend /rekall-backend + +EXPOSE 8000 + +ENTRYPOINT ["/rekall-backend"] diff --git a/backend/Makefile b/backend/Makefile new file mode 100644 index 000000000..665ff50f6 --- /dev/null +++ b/backend/Makefile @@ -0,0 +1,47 @@ +.PHONY: build run test lint clean tidy + +BIN := rekall-backend +CMD := ./cmd/server +GOFLAGS := -ldflags="-s -w" + +## build: compile the binary +build: + go build $(GOFLAGS) -o bin/$(BIN) $(CMD) + +## run: run in development mode (auto-reload not included; use air externally) +run: + GIN_MODE=debug go run $(CMD)/main.go + +## test: run all tests with race detector +test: + go test -race -count=1 ./... + +## test-verbose: run tests with verbose output +test-verbose: + go test -race -v -count=1 ./... 
+ +## test-cover: run tests and produce a coverage report +test-cover: + go test -race -coverprofile=coverage.out ./... + go tool cover -html=coverage.out -o coverage.html + @echo "Coverage report: coverage.html" + +## lint: run golangci-lint (must be installed separately) +lint: + golangci-lint run ./... + +## tidy: tidy and verify go modules +tidy: + go mod tidy + go mod verify + +## clean: remove build artifacts +clean: + rm -rf bin/ coverage.out coverage.html + +## docker-build: build the production Docker image +docker-build: + docker build -t rekall-backend:latest . + +help: + @grep -E '^## ' $(MAKEFILE_LIST) | sed 's/## //' diff --git a/backend/cmd/server/main.go b/backend/cmd/server/main.go new file mode 100644 index 000000000..260f78424 --- /dev/null +++ b/backend/cmd/server/main.go @@ -0,0 +1,147 @@ +package main + +import ( + "context" + "log" + "net/http" + "os" + "os/signal" + "syscall" + "time" + + "github.com/gin-gonic/gin" + "github.com/joho/godotenv" + "github.com/rekall/backend/internal/config" + "github.com/rekall/backend/internal/engine" + "github.com/rekall/backend/internal/handlers" + "github.com/rekall/backend/internal/middleware" + "github.com/rekall/backend/internal/sse" + "github.com/rekall/backend/internal/store" + "github.com/rekall/backend/internal/vault" +) + +func main() { + // Load .env from repo root (best-effort; production uses real env vars) + _ = godotenv.Load("../../.env") + _ = godotenv.Load("../.env") + _ = godotenv.Load(".env") + + cfg := config.Load() + gin.SetMode(cfg.GinMode) + + // ── Vault (flat-file, no DB) ──────────────────────────────────────────── + vault.Init(cfg.VaultPath) + log.Printf("[REKALL] vault loaded from %s", cfg.VaultPath) + + // ── Store (in-memory) ─────────────────────────────────────────────────── + if err := store.Load(cfg.VaultPath); err != nil { + log.Printf("[REKALL] store load warning: %v", err) + } else { + log.Println("[REKALL] store loaded incidents successfully") + } + + // ── SSE broker 
───────────────────────────────────────────────────────── + broker := sse.NewBroker() + + // ── Engine client ─────────────────────────────────────────────────────── + eng := engine.NewClient(cfg.EngineURL) + + // ── Handlers ──────────────────────────────────────────────────────────── + webhookHandler := handlers.NewWebhookHandler(broker, eng) + approvalHandler := handlers.NewApprovalHandler(eng) + streamHandler := handlers.NewStreamHandler(broker) + callbackHandler := handlers.NewCallbackHandler(broker) + + // ── Router ────────────────────────────────────────────────────────────── + r := gin.New() + r.Use(gin.Recovery()) + r.Use(middleware.Logger()) + r.Use(middleware.CORS(cfg.CORSOrigins)) + + // Health + r.GET("/health", func(c *gin.Context) { + engineUp := eng.Healthy(c.Request.Context()) + c.JSON(http.StatusOK, gin.H{ + "status": "ok", + "service": "rekall-backend", + "engine": engineUp, + }) + }) + + // Webhooks + wh := r.Group("/webhook") + { + wh.POST("/github", webhookHandler.HandleGitHub) + wh.POST("/gitlab", webhookHandler.HandleGitLab) + wh.POST("/simulate", webhookHandler.HandleSimulate) + wh.POST("/fetch-live", webhookHandler.HandleFetchLive) + } + + // Incidents + inc := r.Group("/incidents") + { + inc.GET("", handlers.ListIncidents) + inc.GET("/:id", handlers.GetIncident) + inc.POST("/:id/approve", approvalHandler.Approve) + inc.POST("/:id/reject", approvalHandler.Reject) + } + + // SSE stream + r.GET("/stream/:id", streamHandler.Stream) + + // Vault (reads flat files) + v := r.Group("/vault") + { + v.GET("", handlers.ListVault) + v.GET("/stats", handlers.VaultStats) + } + + // Metrics + m := r.Group("/metrics") + { + m.GET("/summary", handlers.Summary) + m.GET("/episodes", handlers.Episodes) + } + + // Internal — called by the Python engine service only + internal := r.Group("/internal") + { + internal.POST("/engine-callback", callbackHandler.Handle) + } + + // ── HTTP server with graceful shutdown ────────────────────────────────── + srv := 
&http.Server{ + Addr: ":" + cfg.Port, + Handler: r, + ReadTimeout: 10 * time.Second, + WriteTimeout: 0, // 0 = no timeout for SSE streams + IdleTimeout: 120 * time.Second, + } + + go func() { + log.Printf("[REKALL] listening on :%s (mode=%s)", cfg.Port, cfg.GinMode) + if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatalf("listen: %v", err) + } + }() + + quit := make(chan os.Signal, 1) + signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM) + <-quit + + log.Println("[REKALL] shutting down…") + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := srv.Shutdown(ctx); err != nil { + log.Printf("[REKALL] shutdown error: %v", err) + } + + if err := store.Save(cfg.VaultPath); err != nil { + log.Printf("[REKALL] store save error: %v", err) + } else { + log.Println("[REKALL] store saved to disk") + } + + log.Println("[REKALL] stopped") +} diff --git a/backend/go.mod b/backend/go.mod new file mode 100644 index 000000000..bd2378ab5 --- /dev/null +++ b/backend/go.mod @@ -0,0 +1,43 @@ +module github.com/rekall/backend + +go 1.22 + +require ( + github.com/gin-contrib/cors v1.7.2 + github.com/gin-gonic/gin v1.10.0 + github.com/google/uuid v1.6.0 + github.com/joho/godotenv v1.5.1 + github.com/stretchr/testify v1.9.0 +) + +require ( + github.com/bytedance/sonic v1.11.6 // indirect + github.com/bytedance/sonic/loader v0.1.1 // indirect + github.com/cloudwego/base64x v0.1.4 // indirect + github.com/cloudwego/iasm v0.2.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/gabriel-vasile/mimetype v1.4.3 // indirect + github.com/gin-contrib/sse v0.1.0 // indirect + github.com/go-playground/locales v0.14.1 // indirect + github.com/go-playground/universal-translator v0.18.1 // indirect + github.com/go-playground/validator/v10 v10.22.0 // indirect + github.com/goccy/go-json v0.10.3 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/cpuid/v2 v2.2.7 // 
indirect + github.com/kr/text v0.2.0 // indirect + github.com/leodido/go-urn v1.4.0 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/pelletier/go-toml/v2 v2.2.2 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/twitchyliquid64/golang-asm v0.15.1 // indirect + github.com/ugorji/go/codec v1.2.12 // indirect + golang.org/x/arch v0.8.0 // indirect + golang.org/x/crypto v0.27.0 // indirect + golang.org/x/net v0.28.0 // indirect + golang.org/x/sys v0.25.0 // indirect + golang.org/x/text v0.18.0 // indirect + google.golang.org/protobuf v1.34.2 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/backend/go.sum b/backend/go.sum new file mode 100644 index 000000000..abd01d460 --- /dev/null +++ b/backend/go.sum @@ -0,0 +1,103 @@ +github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0= +github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4= +github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM= +github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= +github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y= +github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= +github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg= +github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
+github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0= +github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk= +github.com/gin-contrib/cors v1.7.2 h1:oLDHxdg8W/XDoN/8zamqk/Drgt4oVZDvaV0YmvVICQw= +github.com/gin-contrib/cors v1.7.2/go.mod h1:SUJVARKgQ40dmrzgXEVxj2m7Ig1v1qIboQkPDTQ9t2E= +github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= +github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= +github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU= +github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= +github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= +github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= +github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= +github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= +github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= +github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY= +github.com/go-playground/validator/v10 v10.22.0 h1:k6HsTZ0sTnROkhS//R0O+55JgM8C4Bx7ia+JlgcnOao= +github.com/go-playground/validator/v10 v10.22.0/go.mod h1:dbuPbCMFw/DrkbEynArYaCwl3amGuJotoKCe95atGMM= +github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA= +github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= +github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.6.0 
h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= +github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= +github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= +github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= +github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= +github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod 
h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= +github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8= +github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI= +github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= +github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE= +github.com/ugorji/go/codec v1.2.12/go.mod 
h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= +golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= +golang.org/x/arch v0.8.0 h1:3wRIsP3pM4yUptoR96otTUOXI367OS0+c9eeRi9doIc= +golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys= +golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= +golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= +golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= +golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go new file mode 100644 index 000000000..beaeff77d --- /dev/null +++ b/backend/internal/config/config.go @@ -0,0 +1,61 @@ +package config + +import ( + "os" +) + +// Config holds all runtime configuration loaded from environment variables. +type Config struct { + // Server + Port string + GinMode string + + // Python engine service + EngineURL string + + // Flat-file vault path (shared with Python engine via volume mount) + VaultPath string + + // CORS + CORSOrigins []string +} + +// Load reads environment variables and returns a populated Config. +// No longer requires DATABASE_URL or ChromaDB. +func Load() *Config { + return &Config{ + Port: getEnv("PORT", "8000"), + GinMode: getEnv("GIN_MODE", "debug"), + EngineURL: getEnv("ENGINE_URL", "http://localhost:8002"), + VaultPath: getEnv("VAULT_PATH", "vault"), + CORSOrigins: getEnvSlice("CORS_ORIGINS", []string{"http://localhost:3000"}), + } +} + +func getEnv(key, fallback string) string { + if v := os.Getenv(key); v != "" { + return v + } + return fallback +} + + + +func getEnvSlice(key string, fallback []string) []string { + v := os.Getenv(key) + if v == "" { + return fallback + } + result := make([]string, 0) + start := 0 + for i := 0; i <= len(v); i++ { + if i == len(v) || v[i] == ',' { + part := v[start:i] + if part != "" { + result = append(result, part) + } + start = i + 1 + } + } + return result +} diff --git a/backend/internal/engine/client.go b/backend/internal/engine/client.go new file mode 100644 index 000000000..a98c4b8d9 --- /dev/null +++ b/backend/internal/engine/client.go @@ -0,0 +1,139 @@ +// Package engine provides an HTTP client for communicating with the Python +// rekall_engine 
microservice. +package engine + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "time" +) + +// Client talks to the Python engine service. +type Client struct { + baseURL string + httpClient *http.Client +} + +// NewClient constructs a Client pointing at the given engine service URL. +func NewClient(baseURL string) *Client { + return &Client{ + baseURL: baseURL, + httpClient: &http.Client{ + Timeout: 120 * time.Second, + }, + } +} + +// PipelineRequest is sent to POST /pipeline/run. +type PipelineRequest struct { + IncidentID string `json:"incident_id"` + Payload map[string]any `json:"payload"` +} + +// PipelineResponse is returned by the engine service. +type PipelineResponse struct { + OK bool `json:"ok"` + Message string `json:"message,omitempty"` +} + +// LearnRequest is sent to POST /pipeline/learn. +type LearnRequest struct { + IncidentID string `json:"incident_id"` + FixProposalID string `json:"fix_proposal_id"` + Result string `json:"result"` // success | failure | rejected + ReviewedBy string `json:"reviewed_by,omitempty"` + Notes *string `json:"notes,omitempty"` + FixTier string `json:"fix_tier,omitempty"` // T1_human | T2_synthetic | T3_llm + VaultEntryID string `json:"vault_entry_id,omitempty"` // vault entry that was used +} + +// RunPipeline instructs the engine to begin processing an incident. +// This call returns immediately; the engine executes asynchronously and +// reports progress by posting agent-log events back to the callback URL. +func (c *Client) RunPipeline(ctx context.Context, req PipelineRequest) (*PipelineResponse, error) { + return c.post(ctx, "/pipeline/run", req) +} + +// FetchFromGitHubRequest is sent to POST /pipeline/run-from-github. 
+type FetchFromGitHubRequest struct { + IncidentID string `json:"incident_id"` + Repo string `json:"repo,omitempty"` +} + +// RunFromGitHub instructs the engine to fetch the latest failed GitHub Actions +// run from the given repo and process it through the full AI pipeline. +func (c *Client) RunFromGitHub(ctx context.Context, req FetchFromGitHubRequest) (*PipelineResponse, error) { + return c.post(ctx, "/pipeline/run-from-github", req) +} + +// Learn submits an outcome so the engine can update vault confidence. +func (c *Client) Learn(ctx context.Context, req LearnRequest) (*PipelineResponse, error) { + return c.post(ctx, "/pipeline/learn", req) +} + +// CreatePRRequest is sent to POST /pipeline/create-pr. +type CreatePRRequest struct { + IncidentID string `json:"incident_id"` + FixCommands []string `json:"fix_commands"` + FixDescription string `json:"fix_description"` + FixTier string `json:"fix_tier"` + FixDiff string `json:"fix_diff,omitempty"` +} + +// CreatePR instructs the engine to open a GitHub PR for a human-approved fix. +// This is called from the Approve handler after governance blocked the pipeline. +func (c *Client) CreatePR(ctx context.Context, req CreatePRRequest) (*PipelineResponse, error) { + return c.post(ctx, "/pipeline/create-pr", req) +} + +// Healthy returns true if the engine service responds to its health endpoint. 
+func (c *Client) Healthy(ctx context.Context) bool { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+"/health", nil) + if err != nil { + return false + } + resp, err := c.httpClient.Do(req) + if err != nil { + return false + } + defer resp.Body.Close() + return resp.StatusCode == http.StatusOK +} + +func (c *Client) post(ctx context.Context, path string, body any) (*PipelineResponse, error) { + b, err := json.Marshal(body) + if err != nil { + return nil, fmt.Errorf("marshal: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(b)) + if err != nil { + return nil, fmt.Errorf("build request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("do request: %w", err) + } + defer resp.Body.Close() + + raw, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read body: %w", err) + } + + if resp.StatusCode >= 400 { + return nil, fmt.Errorf("engine returned %d: %s", resp.StatusCode, string(raw)) + } + + var pr PipelineResponse + if err := json.Unmarshal(raw, &pr); err != nil { + return nil, fmt.Errorf("unmarshal response: %w", err) + } + return &pr, nil +} diff --git a/backend/internal/engine/client_test.go b/backend/internal/engine/client_test.go new file mode 100644 index 000000000..dde5740ae --- /dev/null +++ b/backend/internal/engine/client_test.go @@ -0,0 +1,108 @@ +package engine_test + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/rekall/backend/internal/engine" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// mockEngineServer creates a test HTTP server that responds to engine endpoints. 
+func mockEngineServer(t *testing.T, handlers map[string]http.HandlerFunc) *httptest.Server { + t.Helper() + mux := http.NewServeMux() + for path, h := range handlers { + mux.HandleFunc(path, h) + } + return httptest.NewServer(mux) +} + +func TestClient_Healthy_ReturnsTrue(t *testing.T) { + srv := mockEngineServer(t, map[string]http.HandlerFunc{ + "/health": func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }, + }) + defer srv.Close() + + c := engine.NewClient(srv.URL) + assert.True(t, c.Healthy(context.Background())) +} + +func TestClient_Healthy_ReturnsFalseOnError(t *testing.T) { + c := engine.NewClient("http://127.0.0.1:0") // unreachable + assert.False(t, c.Healthy(context.Background())) +} + +func TestClient_RunPipeline_Success(t *testing.T) { + srv := mockEngineServer(t, map[string]http.HandlerFunc{ + "/pipeline/run": func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "POST", r.Method) + assert.Equal(t, "application/json", r.Header.Get("Content-Type")) + + var req engine.PipelineRequest + require.NoError(t, json.NewDecoder(r.Body).Decode(&req)) + assert.Equal(t, "test-incident-1", req.IncidentID) + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(engine.PipelineResponse{OK: true}) + }, + }) + defer srv.Close() + + c := engine.NewClient(srv.URL) + resp, err := c.RunPipeline(context.Background(), engine.PipelineRequest{ + IncidentID: "test-incident-1", + Payload: map[string]any{"scenario": "test_failure"}, + }) + require.NoError(t, err) + assert.True(t, resp.OK) +} + +func TestClient_RunPipeline_ServerError(t *testing.T) { + srv := mockEngineServer(t, map[string]http.HandlerFunc{ + "/pipeline/run": func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte(`{"error": "engine exploded"}`)) + }, + }) + defer srv.Close() + + c := engine.NewClient(srv.URL) + resp, err := c.RunPipeline(context.Background(), 
engine.PipelineRequest{ + IncidentID: "inc-err", + Payload: map[string]any{}, + }) + assert.Nil(t, resp) + assert.Error(t, err) + assert.Contains(t, err.Error(), "500") +} + +func TestClient_Learn_Success(t *testing.T) { + srv := mockEngineServer(t, map[string]http.HandlerFunc{ + "/pipeline/learn": func(w http.ResponseWriter, r *http.Request) { + var req engine.LearnRequest + require.NoError(t, json.NewDecoder(r.Body).Decode(&req)) + assert.Equal(t, "success", req.Result) + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(engine.PipelineResponse{OK: true}) + }, + }) + defer srv.Close() + + c := engine.NewClient(srv.URL) + resp, err := c.Learn(context.Background(), engine.LearnRequest{ + IncidentID: "inc-learn", + FixProposalID: "fp-1", + Result: "success", + ReviewedBy: "engineer@example.com", + }) + require.NoError(t, err) + assert.True(t, resp.OK) +} diff --git a/backend/internal/handlers/approvals.go b/backend/internal/handlers/approvals.go new file mode 100644 index 000000000..c46ef570c --- /dev/null +++ b/backend/internal/handlers/approvals.go @@ -0,0 +1,135 @@ +package handlers + +import ( + "context" + "net/http" + "time" + + "github.com/gin-gonic/gin" + "github.com/rekall/backend/internal/engine" + "github.com/rekall/backend/internal/models" + "github.com/rekall/backend/internal/store" +) + +// ApprovalHandler holds the engine client needed to trigger learning. +type ApprovalHandler struct { + engine *engine.Client +} + +func NewApprovalHandler(eng *engine.Client) *ApprovalHandler { + return &ApprovalHandler{engine: eng} +} + +// Approve marks an incident as resolved and triggers the LearningAgent. 
+func (h *ApprovalHandler) Approve(c *gin.Context) { + id := c.Param("id") + + var req models.ApprovalRequest + if err := c.ShouldBindJSON(&req); err != nil { + req = models.ApprovalRequest{ReviewedBy: "human"} + } + if req.ReviewedBy == "" { + req.ReviewedBy = "human" + } + + incident, err := store.GetIncident(c.Request.Context(), id) + if err != nil || incident == nil { + c.JSON(http.StatusNotFound, gin.H{"error": "incident not found"}) + return + } + + if err := store.UpdateIncidentStatus(c.Request.Context(), id, models.StatusResolved); err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + _, _ = store.AppendAgentLog(c.Request.Context(), id, "learning", "done", + "Fix approved by "+req.ReviewedBy+". Vault confidence updated.") + + if fix, err := store.GetLatestFixProposal(c.Request.Context(), id); err == nil && fix != nil { + vaultID := "" + if fix.VaultEntryID != nil { + vaultID = *fix.VaultEntryID + } + fixDiff := "" + if fix.FixDiff != nil { + fixDiff = *fix.FixDiff + } + go func() { + // Use a detached context — c.Request.Context() is cancelled as soon + // as the HTTP response is sent, which would kill these calls. + bgCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + + _, _ = h.engine.Learn(bgCtx, engine.LearnRequest{ + IncidentID: id, + FixProposalID: fix.ID, + Result: "success", + ReviewedBy: req.ReviewedBy, + Notes: req.Notes, + FixTier: string(fix.Tier), + VaultEntryID: vaultID, + }) + // After learning, open the GitHub PR now that a human has approved. + _, _ = h.engine.CreatePR(bgCtx, engine.CreatePRRequest{ + IncidentID: id, + FixCommands: fix.FixCommands, + FixDescription: fix.FixDescription, + FixTier: string(fix.Tier), + FixDiff: fixDiff, + }) + }() + } + + c.JSON(http.StatusOK, gin.H{"ok": true, "incident_id": id, "action": "approved"}) +} + + +// Reject marks an incident as failed and notifies the LearningAgent. 
+func (h *ApprovalHandler) Reject(c *gin.Context) { + id := c.Param("id") + + var req models.ApprovalRequest + if err := c.ShouldBindJSON(&req); err != nil { + req = models.ApprovalRequest{ReviewedBy: "human"} + } + if req.ReviewedBy == "" { + req.ReviewedBy = "human" + } + + incident, err := store.GetIncident(c.Request.Context(), id) + if err != nil || incident == nil { + c.JSON(http.StatusNotFound, gin.H{"error": "incident not found"}) + return + } + + if err := store.UpdateIncidentStatus(c.Request.Context(), id, models.StatusFailed); err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + _, _ = store.AppendAgentLog(c.Request.Context(), id, "learning", "done", + "Fix rejected by "+req.ReviewedBy+". Vault confidence decayed.") + + if fix, err := store.GetLatestFixProposal(c.Request.Context(), id); err == nil && fix != nil { + vaultID := "" + if fix.VaultEntryID != nil { + vaultID = *fix.VaultEntryID + } + go func() { + bgCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + _, _ = h.engine.Learn(bgCtx, engine.LearnRequest{ + IncidentID: id, + FixProposalID: fix.ID, + Result: "rejected", + ReviewedBy: req.ReviewedBy, + Notes: req.Notes, + FixTier: string(fix.Tier), + VaultEntryID: vaultID, + }) + }() + } + + c.JSON(http.StatusOK, gin.H{"ok": true, "incident_id": id, "action": "rejected"}) +} diff --git a/backend/internal/handlers/callback.go b/backend/internal/handlers/callback.go new file mode 100644 index 000000000..a68b86e78 --- /dev/null +++ b/backend/internal/handlers/callback.go @@ -0,0 +1,165 @@ +package handlers + +import ( + "encoding/json" + "net/http" + + "github.com/gin-gonic/gin" + "github.com/rekall/backend/internal/models" + "github.com/rekall/backend/internal/sse" + "github.com/rekall/backend/internal/store" +) + +// CallbackHandler receives async events POSTed by the Python engine service. 
// CallbackHandler receives async events POSTed by the Python engine service
// and fans them out to the in-process SSE broker (and, best-effort, the store).
type CallbackHandler struct {
	broker *sse.Broker
}

// NewCallbackHandler constructs a CallbackHandler publishing to broker.
func NewCallbackHandler(broker *sse.Broker) *CallbackHandler {
	return &CallbackHandler{broker: broker}
}

// callbackEvent is the envelope the engine POSTs; Data is decoded per Type.
type callbackEvent struct {
	Type string          `json:"type"` // agent_log | status | fix_proposal | sandbox_result
	Data json.RawMessage `json:"data"`
}

// agentLogData is the payload for "agent_log" events (one pipeline step update).
type agentLogData struct {
	IncidentID string `json:"incident_id"`
	StepName   string `json:"step_name"`
	Status     string `json:"status"` // running | done | error
	Detail     string `json:"detail"`
}

// statusData is the payload for "status" events (incident lifecycle change).
type statusData struct {
	IncidentID string `json:"incident_id"`
	Status     string `json:"status"` // processing | awaiting_approval | resolved | failed
}

// fixProposalData is the payload for "fix_proposal" events, posted when the
// pipeline pauses for human review.
type fixProposalData struct {
	ID             string   `json:"id"`
	IncidentID     string   `json:"incident_id"`
	Tier           string   `json:"tier"`
	VaultEntryID   *string  `json:"vault_entry_id"`
	FixDescription string   `json:"fix_description"`
	FixCommands    []string `json:"fix_commands"`
	FixDiff        *string  `json:"fix_diff"`
	Confidence     float64  `json:"confidence"`
	Reasoning      string   `json:"reasoning"`
}

// sandboxResultData is the payload for "sandbox_result" events (sandbox
// validation outcome reported by the engine).
type sandboxResultData struct {
	IncidentID      string  `json:"incident_id"`
	Passed          bool    `json:"passed"`
	TestCount       int     `json:"test_count"`
	FailureCount    int     `json:"failure_count"`
	TestLog         string  `json:"test_log"`
	PREvidence      string  `json:"pr_evidence"`
	Namespace       string  `json:"namespace"`
	DurationSeconds float64 `json:"duration_seconds"`
	ValKeyDeployed  bool    `json:"valkey_deployed"`
	DemoMode        bool    `json:"demo_mode"`
}
+func (h *CallbackHandler) Handle(c *gin.Context) { + var ev callbackEvent + if err := c.ShouldBindJSON(&ev); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + ctx := c.Request.Context() + + switch ev.Type { + case "agent_log": + var d agentLogData + if err := json.Unmarshal(ev.Data, &d); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "bad agent_log payload"}) + return + } + + logEntry, err := store.AppendAgentLog(ctx, d.IncidentID, d.StepName, d.Status, d.Detail) + if err != nil { + // Store write failed — still publish to SSE so UI stays live. + h.broker.Publish(d.IncidentID, sse.Event{ + Type: "agent_log", + Data: map[string]string{ + "incident_id": d.IncidentID, + "step_name": d.StepName, + "status": d.Status, + "detail": d.Detail, + }, + }) + } else { + h.broker.Publish(d.IncidentID, sse.Event{Type: "agent_log", Data: logEntry}) + } + + case "status": + var d statusData + if err := json.Unmarshal(ev.Data, &d); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "bad status payload"}) + return + } + + status := models.IncidentStatus(d.Status) + _ = store.UpdateIncidentStatus(ctx, d.IncidentID, status) + + h.broker.Publish(d.IncidentID, sse.Event{ + Type: "status", + Data: map[string]string{"status": d.Status}, + }) + + if status == models.StatusResolved || status == models.StatusFailed { + h.broker.PublishDone(d.IncidentID) + } + + case "fix_proposal": + // The engine posts this when the pipeline pauses for human review + // so the Approve handler can retrieve fix details from the store. 
+ var d fixProposalData + if err := json.Unmarshal(ev.Data, &d); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "bad fix_proposal payload"}) + return + } + fix := &models.FixProposal{ + ID: d.ID, + IncidentID: d.IncidentID, + Tier: models.FixTier(d.Tier), + FixDescription: d.FixDescription, + FixCommands: d.FixCommands, + Confidence: d.Confidence, + Reasoning: d.Reasoning, + VaultEntryID: d.VaultEntryID, + FixDiff: d.FixDiff, + } + _ = store.UpsertFixProposal(ctx, fix) + + case "sandbox_result": + // Minikube sandbox validation result — store and publish to SSE. + var d sandboxResultData + if err := json.Unmarshal(ev.Data, &d); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "bad sandbox_result payload"}) + return + } + r := &models.SandboxResult{ + IncidentID: d.IncidentID, + Passed: d.Passed, + TestCount: d.TestCount, + FailureCount: d.FailureCount, + TestLog: d.TestLog, + PREvidence: d.PREvidence, + Namespace: d.Namespace, + DurationSeconds: d.DurationSeconds, + ValKeyDeployed: d.ValKeyDeployed, + DemoMode: d.DemoMode, + } + _ = store.UpsertSandboxResult(ctx, r) + h.broker.Publish(d.IncidentID, sse.Event{ + Type: "sandbox_result", + Data: r, + }) + } + + c.Status(http.StatusOK) +} diff --git a/backend/internal/handlers/incidents.go b/backend/internal/handlers/incidents.go new file mode 100644 index 000000000..d7663f66a --- /dev/null +++ b/backend/internal/handlers/incidents.go @@ -0,0 +1,65 @@ +package handlers + +import ( + "net/http" + "strconv" + + "github.com/gin-gonic/gin" + "github.com/rekall/backend/internal/models" + "github.com/rekall/backend/internal/store" +) + +// ListIncidents returns paginated incidents sorted by created_at DESC. 
+func ListIncidents(c *gin.Context) { + limit, _ := strconv.Atoi(c.DefaultQuery("limit", "50")) + offset, _ := strconv.Atoi(c.DefaultQuery("offset", "0")) + + if limit < 1 || limit > 200 { + limit = 50 + } + + incidents, err := store.ListIncidents(c.Request.Context(), limit, offset) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + if incidents == nil { + incidents = []*models.Incident{} + } + c.JSON(http.StatusOK, gin.H{ + "incidents": incidents, + "limit": limit, + "offset": offset, + }) +} + +// GetIncident returns the full detail view of a single incident. +func GetIncident(c *gin.Context) { + id := c.Param("id") + ctx := c.Request.Context() + + incident, err := store.GetIncident(ctx, id) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + if incident == nil { + c.JSON(http.StatusNotFound, gin.H{"error": "incident not found"}) + return + } + + bundle, _ := store.GetDiagnosticBundle(ctx, id) + fix, _ := store.GetLatestFixProposal(ctx, id) + gov, _ := store.GetLatestGovernanceDecision(ctx, id) + sandbox, _ := store.GetSandboxResult(ctx, id) + logs, _ := store.GetAgentLogs(ctx, id) + + c.JSON(http.StatusOK, models.IncidentDetail{ + Incident: incident, + DiagnosticBundle: bundle, + FixProposal: fix, + GovernanceDecision: gov, + SandboxResult: sandbox, + AgentLogs: logs, + }) +} diff --git a/backend/internal/handlers/incidents_test.go b/backend/internal/handlers/incidents_test.go new file mode 100644 index 000000000..1fdc34a2e --- /dev/null +++ b/backend/internal/handlers/incidents_test.go @@ -0,0 +1,69 @@ +package handlers_test + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + "github.com/rekall/backend/internal/handlers" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// buildIncidentRouter sets up a router for incident endpoints. 
+// Uses in-memory store — no database required; all calls succeed. +func buildIncidentRouter() *gin.Engine { + r := gin.New() + r.GET("/incidents", handlers.ListIncidents) + r.GET("/incidents/:id", handlers.GetIncident) + return r +} + +func TestListIncidents_DefaultsReturnJSON(t *testing.T) { + r := buildIncidentRouter() + req := httptest.NewRequest(http.MethodGet, "/incidents", nil) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + // In-memory store always succeeds + assert.Equal(t, http.StatusOK, w.Code) +} + +func TestListIncidents_InvalidLimitClamped(t *testing.T) { + r := buildIncidentRouter() + req := httptest.NewRequest(http.MethodGet, "/incidents?limit=9999", nil) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) +} + +func TestGetIncident_NotFound(t *testing.T) { + r := buildIncidentRouter() + req := httptest.NewRequest(http.MethodGet, "/incidents/00000000-0000-0000-0000-000000000000", nil) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + // In-memory returns 404 for non-existent IDs + assert.Equal(t, http.StatusNotFound, w.Code) +} + +func TestGetIncident_RouteParamExtracted(t *testing.T) { + r := buildIncidentRouter() + req := httptest.NewRequest(http.MethodGet, "/incidents/abc-123", nil) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + assert.NotEqual(t, http.StatusMethodNotAllowed, w.Code) + assert.NotEqual(t, http.StatusBadRequest, w.Code) +} + +func TestListIncidents_ResponseShape(t *testing.T) { + r := buildIncidentRouter() + req := httptest.NewRequest(http.MethodGet, "/incidents", nil) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + var body map[string]any + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &body)) + assert.Contains(t, body, "incidents") +} diff --git a/backend/internal/handlers/metrics.go b/backend/internal/handlers/metrics.go new file mode 100644 index 000000000..8551ed3e7 --- /dev/null +++ 
b/backend/internal/handlers/metrics.go @@ -0,0 +1,42 @@ +package handlers + +import ( + "net/http" + "strconv" + + "github.com/gin-gonic/gin" + "github.com/rekall/backend/internal/store" + "github.com/rekall/backend/internal/vault" +) + +// Summary returns headline dashboard metrics. +func Summary(c *gin.Context) { + stats, _ := vault.Stats() + vaultSize := 0 + var avgConf *float64 + if stats != nil { + vaultSize = stats.Total + avgConf = stats.AvgConfidence + } + + m, err := store.GetMetricsSummary(c.Request.Context(), vaultSize, avgConf) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + c.JSON(http.StatusOK, m) +} + +// Episodes returns the last N RL episodes from vault/episodes.json. +func Episodes(c *gin.Context) { + limit, _ := strconv.Atoi(c.DefaultQuery("limit", "100")) + if limit < 1 || limit > 500 { + limit = 100 + } + episodes, err := vault.ListEpisodes(limit) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + c.JSON(http.StatusOK, gin.H{"episodes": episodes, "total": len(episodes)}) +} diff --git a/backend/internal/handlers/stream.go b/backend/internal/handlers/stream.go new file mode 100644 index 000000000..5c2cbace3 --- /dev/null +++ b/backend/internal/handlers/stream.go @@ -0,0 +1,72 @@ +package handlers + +import ( + "fmt" + "net/http" + "time" + + "github.com/gin-gonic/gin" + "github.com/rekall/backend/internal/sse" +) + +const keepaliveInterval = 15 * time.Second + +// StreamHandler serves SSE streams for individual incidents. +type StreamHandler struct { + broker *sse.Broker +} + +func NewStreamHandler(broker *sse.Broker) *StreamHandler { + return &StreamHandler{broker: broker} +} + +// Stream opens an SSE connection and fans out agent log events for incidentID. +// The connection is held open until the pipeline completes (done event) or the +// client disconnects. 
+func (h *StreamHandler) Stream(c *gin.Context) { + incidentID := c.Param("id") + + ch := h.broker.Subscribe(incidentID) + defer h.broker.Unsubscribe(incidentID, ch) + + c.Writer.Header().Set("Content-Type", "text/event-stream") + c.Writer.Header().Set("Cache-Control", "no-cache") + c.Writer.Header().Set("Connection", "keep-alive") + c.Writer.Header().Set("X-Accel-Buffering", "no") // disable nginx buffering + c.Writer.WriteHeader(http.StatusOK) + c.Writer.Flush() + + ticker := time.NewTicker(keepaliveInterval) + defer ticker.Stop() + + for { + select { + case <-c.Request.Context().Done(): + return + + case <-ticker.C: + // Heartbeat comment keeps the TCP connection alive through proxies + fmt.Fprintf(c.Writer, ": heartbeat\n\n") + c.Writer.Flush() + + case ev, ok := <-ch: + if !ok { + return + } + + b, err := ev.Marshal() + if err != nil { + continue + } + + if ev.Type == "done" { + fmt.Fprintf(c.Writer, "event: done\ndata: {}\n\n") + c.Writer.Flush() + return + } + + fmt.Fprintf(c.Writer, "data: %s\n\n", b) + c.Writer.Flush() + } + } +} diff --git a/backend/internal/handlers/vault.go b/backend/internal/handlers/vault.go new file mode 100644 index 000000000..374ec8d87 --- /dev/null +++ b/backend/internal/handlers/vault.go @@ -0,0 +1,42 @@ +package handlers + +import ( + "net/http" + "strconv" + + "github.com/gin-gonic/gin" + "github.com/rekall/backend/internal/vault" +) + +// ListVault returns paginated vault entries, optionally filtered by source. 
+func ListVault(c *gin.Context) { + source := c.Query("source") // human | synthetic | "" + limit, _ := strconv.Atoi(c.DefaultQuery("limit", "100")) + offset, _ := strconv.Atoi(c.DefaultQuery("offset", "0")) + + if limit < 1 || limit > 500 { + limit = 100 + } + + var src *string + if source != "" { + src = &source + } + + entries, err := vault.ListAll(src, limit, offset) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + c.JSON(http.StatusOK, gin.H{"entries": entries, "limit": limit, "offset": offset}) +} + +// VaultStats returns aggregate statistics for the vault. +func VaultStats(c *gin.Context) { + stats, err := vault.Stats() + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + c.JSON(http.StatusOK, stats) +} diff --git a/backend/internal/handlers/webhooks.go b/backend/internal/handlers/webhooks.go new file mode 100644 index 000000000..c5d08ba96 --- /dev/null +++ b/backend/internal/handlers/webhooks.go @@ -0,0 +1,305 @@ +package handlers + +import ( + "context" + "net/http" + "strings" + "time" + + "github.com/gin-gonic/gin" + "github.com/rekall/backend/internal/engine" + "github.com/rekall/backend/internal/models" + "github.com/rekall/backend/internal/sse" + "github.com/rekall/backend/internal/store" +) + +// ───────────────────────────────────────────────────────────────────────────── +// DEMO vs PRODUCTION — CI Failure Fetching +// ───────────────────────────────────────────────────────────────────────────── +// +// CURRENT (Demo / Hackathon mode): +// Instead of waiting for a slow GitHub Actions run to fail and rate-limiting +// the GitHub API to fetch massive log files, REKALL uses the Webhook Simulator +// below. When you hit POST /webhooks/simulate with a scenario name (e.g. +// "postgres_refused" or "secret_leak"), this handler injects a pre-constructed +// payload that already contains the log_excerpt and git_diff. 
The LangGraph +// engine ingests the exact shape of a CI failure instantly — zero API calls, +// zero rate-limit risk, perfectly reproducible demo loops. +// +// PRODUCTION (how a fully deployed version would work): +// The MonitorAgent would catch a real GitHub webhook (event: workflow_run, +// action: completed, conclusion: failure). HandleGitHub() already receives +// and validates this payload. The missing step is log extraction: +// +// runID := payload.WorkflowRun.ID +// url := fmt.Sprintf( +// "https://api.github.com/repos/%s/actions/runs/%d/logs", +// payload.Repository.FullName, runID, +// ) +// req, _ := http.NewRequest("GET", url, nil) +// req.Header.Set("Authorization", "Bearer "+os.Getenv("GITHUB_TOKEN")) +// req.Header.Set("Accept", "application/vnd.github+json") +// // Response is a zip archive — unzip and concatenate step logs. +// // Inject the extracted log_excerpt into the raw map before runPipeline(). +// +// This log bytes download is then merged into the raw payload map so the +// Python engine's DiagnosticAgent receives populated log_excerpt/git_diff +// fields, matching the shape the simulator already provides. +// ───────────────────────────────────────────────────────────────────────────── + +// webhookSimulatorScenarios defines pre-built failure payloads for demo use. 
+var webhookSimulatorScenarios = map[string]map[string]any{ + "postgres_refused": { + "failure_type": "infra", + "description": "PostgreSQL ECONNREFUSED on port 5432", + "log_excerpt": "Error: connect ECONNREFUSED postgres:5432\n at TCPConnectWrap.afterConnect", + "git_diff": "--- a/config/database.yml\n+++ b/config/database.yml\n@@ -2 +2 @@\n- host: db.internal\n+ host: postgres", + "simulated": true, + }, + "oom_kill": { + "failure_type": "oom", + "description": "Container killed by OOM — JVM heap exhausted", + "log_excerpt": "FATAL: Terminating due to java.lang.OutOfMemoryError: Java heap space\nContainer killed by OOM killer", + "simulated": true, + }, + "test_failure": { + "failure_type": "test", + "description": "Auth test suite: 1 failure after middleware change", + "log_excerpt": "FAIL: test_user_auth\nAssertionError: 401 != 200\nRan 47 tests in 3.2s — FAILED (failures=1)", + "git_diff": "--- a/src/auth/middleware.py\n+++ b/src/auth/middleware.py\n@@ -12 +12 @@\n- if token and verify(token):\n+ if token:", + "simulated": true, + }, + "secret_leak": { + "failure_type": "security", + "description": "Secret detected in committed .env file", + "log_excerpt": "gitleaks: secret detected\n Rule: generic-api-key\n File: .env\n Line: 7", + "simulated": true, + }, + "image_pull_backoff": { + "failure_type": "deploy", + "description": "Image tag v2.1.0 not found in registry", + "log_excerpt": "Warning: Failed to pull image 'registry.io/app:v2.1.0': manifest not found\nImagePullBackOff", + "git_diff": "--- a/.github/workflows/deploy.yml\n+++ b/.github/workflows/deploy.yml\n@@ -8 +8 @@\n- IMAGE_TAG: v2.0.9\n+ IMAGE_TAG: v2.1.0", + "simulated": true, + }, +} + +// WebhookHandler holds dependencies used by all webhook endpoints. 
+type WebhookHandler struct { + broker *sse.Broker + engine *engine.Client +} + +func NewWebhookHandler(broker *sse.Broker, eng *engine.Client) *WebhookHandler { + return &WebhookHandler{broker: broker, engine: eng} +} + +// HandleGitHub receives GitHub Actions workflow_run failure webhooks. +func (h *WebhookHandler) HandleGitHub(c *gin.Context) { + var payload models.GitHubWebhookPayload + if err := c.ShouldBindJSON(&payload); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + if payload.WorkflowRun == nil { + c.JSON(http.StatusOK, gin.H{"accepted": false, "reason": "no workflow_run"}) + return + } + if payload.WorkflowRun.Conclusion != "failure" && payload.WorkflowRun.Conclusion != "cancelled" { + c.JSON(http.StatusOK, gin.H{"accepted": false, "reason": "conclusion=" + payload.WorkflowRun.Conclusion}) + return + } + + failureType := classifyGitHubRun(payload.WorkflowRun.Name) + raw := map[string]any{ + "action": payload.Action, + "workflow_run": payload.WorkflowRun, + "repository": payload.Repository, + } + + incident, err := store.CreateIncident(c.Request.Context(), "github_actions", failureType, raw) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "create incident: " + err.Error()}) + return + } + + go h.runPipeline(incident.ID, raw) + c.JSON(http.StatusOK, gin.H{"accepted": true, "incident_id": incident.ID}) +} + +// HandleGitLab receives GitLab CI pipeline failure webhooks. 
+func (h *WebhookHandler) HandleGitLab(c *gin.Context) { + var payload models.GitLabWebhookPayload + if err := c.ShouldBindJSON(&payload); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + if payload.Status != "failed" && payload.Status != "canceled" { + c.JSON(http.StatusOK, gin.H{"accepted": false, "reason": "status=" + payload.Status}) + return + } + + raw := map[string]any{"object_kind": payload.ObjectKind, "status": payload.Status} + incident, err := store.CreateIncident(c.Request.Context(), "gitlab", "deploy", raw) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + go h.runPipeline(incident.ID, raw) + c.JSON(http.StatusOK, gin.H{"accepted": true, "incident_id": incident.ID}) +} + +// HandleSimulate injects a pre-built failure scenario. +// Kept for local testing — prefer HandleFetchLive for real CI monitoring. +func (h *WebhookHandler) HandleSimulate(c *gin.Context) { + var req models.SimulateRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + scenario, ok := webhookSimulatorScenarios[req.Scenario] + if !ok { + c.JSON(http.StatusBadRequest, gin.H{"error": "unknown scenario: " + req.Scenario}) + return + } + + ft, _ := scenario["failure_type"].(string) + incident, err := store.CreateIncident(c.Request.Context(), "simulator", ft, scenario) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + go h.runPipeline(incident.ID, scenario) + + c.JSON(http.StatusOK, gin.H{ + "accepted": true, + "incident_id": incident.ID, + "scenario": req.Scenario, + }) +} + +// HandleFetchLive fetches the latest failed GitHub Actions run in the configured +// repo and runs the real AI pipeline (Monitor → Diagnostic → Fix → PR). 
+func (h *WebhookHandler) HandleFetchLive(c *gin.Context) { + var body struct { + Repo string `json:"repo"` // optional override, defaults to GITHUB_REPO env + } + _ = c.ShouldBindJSON(&body) + + raw := map[string]any{ + "source": "github_actions", + "failure_type": "unknown", + "description": "Fetching latest CI failure from GitHub", + "live": true, + "repo": body.Repo, + } + + incident, err := store.CreateIncident(c.Request.Context(), "github_actions", "unknown", raw) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "create incident: " + err.Error()}) + return + } + + go h.runFetchLivePipeline(incident.ID, body.Repo) + c.JSON(http.StatusOK, gin.H{"accepted": true, "incident_id": incident.ID}) +} + +// runPipeline is called in a goroutine to drive the agent pipeline. +// Tries the Python engine first; falls back to emulation if unavailable. +func (h *WebhookHandler) runPipeline(incidentID string, payload map[string]any) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + engineOK := h.engine.Healthy(ctx) + + if engineOK { + _, err := h.engine.RunPipeline(ctx, engine.PipelineRequest{ + IncidentID: incidentID, + Payload: payload, + }) + if err != nil { + engineOK = false + } + } + + if !engineOK { + h.emulatedPipeline(ctx, incidentID) + } +} + +// runFetchLivePipeline delegates to the engine's /pipeline/run-from-github +// endpoint so it can fetch real CI failure logs from GitHub and run the full +// AI agent pipeline. Falls back to emulation only if engine is unreachable. 
+func (h *WebhookHandler) runFetchLivePipeline(incidentID string, repo string) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + + if !h.engine.Healthy(ctx) { + h.emulatedPipeline(ctx, incidentID) + return + } + + _, err := h.engine.RunFromGitHub(ctx, engine.FetchFromGitHubRequest{ + IncidentID: incidentID, + Repo: repo, + }) + if err != nil { + h.emulatedPipeline(ctx, incidentID) + } +} + +// emulatedPipeline replays a step-by-step simulation when the engine is offline. +func (h *WebhookHandler) emulatedPipeline(ctx context.Context, incidentID string) { + steps := []struct { + name string + detail string + }{ + {"monitor", "Normalising failure event payload"}, + {"diagnostic", "Fetching logs, git diff, and test reports"}, + {"fix", "Searching memory vault: T1 → T2 → T3 fallback"}, + {"governance", "Computing risk score across 6 dimensions"}, + {"publish_guard", "Supply-chain safety gate"}, + {"learning", "Updating vault confidence and logging RL episode"}, + } + + for _, step := range steps { + select { + case <-ctx.Done(): + return + default: + } + + if logEntry, err := store.AppendAgentLog(ctx, incidentID, step.name, "running", step.detail); err == nil { + h.broker.Publish(incidentID, sse.Event{Type: "agent_log", Data: logEntry}) + } + + time.Sleep(1200 * time.Millisecond) + + if logEntry, err := store.AppendAgentLog(ctx, incidentID, step.name, "done", step.detail); err == nil { + h.broker.Publish(incidentID, sse.Event{Type: "agent_log", Data: logEntry}) + } + } + + _ = store.UpdateIncidentStatus(ctx, incidentID, models.StatusResolved) + h.broker.Publish(incidentID, sse.Event{Type: "status", Data: map[string]string{"status": "resolved"}}) + h.broker.PublishDone(incidentID) +} + +func classifyGitHubRun(name string) string { + lower := strings.ToLower(name) + switch { + case strings.Contains(lower, "test"): + return "test" + case strings.Contains(lower, "deploy"): + return "deploy" + case strings.Contains(lower, 
"build"): + return "deploy" + default: + return "unknown" + } +} diff --git a/backend/internal/handlers/webhooks_test.go b/backend/internal/handlers/webhooks_test.go new file mode 100644 index 000000000..9010248d2 --- /dev/null +++ b/backend/internal/handlers/webhooks_test.go @@ -0,0 +1,117 @@ +package handlers_test + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + "github.com/rekall/backend/internal/engine" + "github.com/rekall/backend/internal/handlers" + "github.com/rekall/backend/internal/sse" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func init() { + gin.SetMode(gin.TestMode) +} + +// buildTestRouter wires a minimal Gin router with the webhook handler. +// It uses a real SSE broker and a no-op engine client (engine service offline). +func buildTestRouter() *gin.Engine { + broker := sse.NewBroker() + eng := engine.NewClient("http://127.0.0.1:0") // intentionally unreachable + wh := handlers.NewWebhookHandler(broker, eng) + + r := gin.New() + r.POST("/webhook/github", wh.HandleGitHub) + r.POST("/webhook/gitlab", wh.HandleGitLab) + r.POST("/webhook/simulate", wh.HandleSimulate) + return r +} + +// TestSimulate_UnknownScenario ensures an invalid scenario returns 400. +func TestSimulate_UnknownScenario(t *testing.T) { + // Skip if no DB — this test only validates routing logic (no DB call reaches + // the simulate handler before scenario validation). + r := buildTestRouter() + body, _ := json.Marshal(map[string]string{"scenario": "nonexistent_chaos"}) + req := httptest.NewRequest(http.MethodPost, "/webhook/simulate", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + // Unknown scenario returns 400 without touching the DB. 
+ assert.Equal(t, http.StatusBadRequest, w.Code) + + var resp map[string]string + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &resp)) + assert.Contains(t, resp["error"], "unknown scenario") +} + +// TestGitHub_IgnoresNonFailure verifies that successful workflow_run events +// are ignored without creating an incident. +func TestGitHub_IgnoresNonFailure(t *testing.T) { + r := buildTestRouter() + body, _ := json.Marshal(map[string]any{ + "action": "completed", + "workflow_run": map[string]string{ + "name": "CI", + "conclusion": "success", // not a failure + "html_url": "https://github.com/org/repo/actions/runs/1", + }, + }) + req := httptest.NewRequest(http.MethodPost, "/webhook/github", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + + var resp map[string]any + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &resp)) + assert.Equal(t, false, resp["accepted"]) +} + +// TestGitHub_MissingWorkflowRun verifies handling of malformed payloads. +func TestGitHub_MissingWorkflowRun(t *testing.T) { + r := buildTestRouter() + body, _ := json.Marshal(map[string]string{"action": "completed"}) + req := httptest.NewRequest(http.MethodPost, "/webhook/github", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + var resp map[string]any + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &resp)) + assert.Equal(t, false, resp["accepted"]) +} + +// TestGitLab_IgnoresSuccessStatus verifies non-failed pipelines are skipped. 
+func TestGitLab_IgnoresSuccessStatus(t *testing.T) { + r := buildTestRouter() + body, _ := json.Marshal(map[string]string{"object_kind": "pipeline", "status": "success"}) + req := httptest.NewRequest(http.MethodPost, "/webhook/gitlab", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + var resp map[string]any + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &resp)) + assert.Equal(t, false, resp["accepted"]) +} + +// TestWebhook_BadJSON verifies 400 on completely invalid JSON. +func TestWebhook_BadJSON(t *testing.T) { + r := buildTestRouter() + req := httptest.NewRequest(http.MethodPost, "/webhook/simulate", + bytes.NewReader([]byte(`{not valid json}`))) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + assert.Equal(t, http.StatusBadRequest, w.Code) +} diff --git a/backend/internal/middleware/cors.go b/backend/internal/middleware/cors.go new file mode 100644 index 000000000..10e4cbefc --- /dev/null +++ b/backend/internal/middleware/cors.go @@ -0,0 +1,21 @@ +package middleware + +import ( + "github.com/gin-contrib/cors" + "github.com/gin-gonic/gin" +) + +// CORS returns a configured CORS middleware that allows the frontend origin(s). 
+func CORS(origins []string) gin.HandlerFunc { + cfg := cors.Config{ + AllowOrigins: origins, + AllowMethods: []string{"GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"}, + AllowHeaders: []string{ + "Origin", "Content-Type", "Accept", "Authorization", + "Cache-Control", "X-Requested-With", + }, + ExposeHeaders: []string{"Content-Length", "Content-Type"}, + AllowCredentials: true, + } + return cors.New(cfg) +} diff --git a/backend/internal/middleware/logger.go b/backend/internal/middleware/logger.go new file mode 100644 index 000000000..cf43b5573 --- /dev/null +++ b/backend/internal/middleware/logger.go @@ -0,0 +1,45 @@ +package middleware + +import ( + "fmt" + "time" + + "github.com/gin-gonic/gin" +) + +// Logger returns a Gin middleware that prints structured request logs. +func Logger() gin.HandlerFunc { + return func(c *gin.Context) { + start := time.Now() + path := c.Request.URL.Path + + c.Next() + + latency := time.Since(start) + status := c.Writer.Status() + method := c.Request.Method + + color := statusColor(status) + reset := "\033[0m" + + fmt.Printf("[REKALL] %s %s%d%s %-7s %s %s\n", + start.Format("15:04:05"), + color, status, reset, + method, path, + latency, + ) + } +} + +func statusColor(code int) string { + switch { + case code >= 500: + return "\033[31m" // red + case code >= 400: + return "\033[33m" // yellow + case code >= 200: + return "\033[32m" // green + default: + return "\033[0m" + } +} diff --git a/backend/internal/models/models.go b/backend/internal/models/models.go new file mode 100644 index 000000000..fda974ce5 --- /dev/null +++ b/backend/internal/models/models.go @@ -0,0 +1,172 @@ +package models + +import "time" + +// IncidentStatus represents the lifecycle state of an incident. 
+type IncidentStatus string + +const ( + StatusProcessing IncidentStatus = "processing" + StatusAwaitingApproval IncidentStatus = "awaiting_approval" + StatusResolved IncidentStatus = "resolved" + StatusFailed IncidentStatus = "failed" +) + +// FixTier identifies the retrieval tier used to source a fix. +type FixTier string + +const ( + TierHuman FixTier = "T1_human" + TierSynthetic FixTier = "T2_synthetic" + TierLLM FixTier = "T3_llm" +) + +// GovernanceDecisionType controls what action is taken with a fix. +type GovernanceDecisionType string + +const ( + DecisionAutoApply GovernanceDecisionType = "auto_apply" + DecisionCreatePR GovernanceDecisionType = "create_pr" + DecisionBlockAwaitHuman GovernanceDecisionType = "block_await_human" +) + +// Incident is the central record for a CI/CD failure event. +type Incident struct { + ID string `json:"id"` + Source string `json:"source"` + FailureType string `json:"failure_type"` + RawPayload map[string]any `json:"raw_payload"` + Status IncidentStatus `json:"status"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +// DiagnosticBundle holds the context built by DiagnosticAgent. +type DiagnosticBundle struct { + ID string `json:"id"` + IncidentID string `json:"incident_id"` + FailureSignature string `json:"failure_signature"` + LogExcerpt *string `json:"log_excerpt"` + GitDiff *string `json:"git_diff"` + TestReport *string `json:"test_report"` + ContextSummary *string `json:"context_summary"` + CreatedAt time.Time `json:"created_at"` +} + +// FixProposal is produced by FixAgent, indicating how to repair the failure. 
+type FixProposal struct { + ID string `json:"id"` + IncidentID string `json:"incident_id"` + Tier FixTier `json:"tier"` + VaultEntryID *string `json:"vault_entry_id"` + SimilarityScore *float64 `json:"similarity_score"` + FixDescription string `json:"fix_description"` + FixCommands []string `json:"fix_commands"` + FixDiff *string `json:"fix_diff"` + Confidence float64 `json:"confidence"` + Reasoning string `json:"reasoning"` + RLMTrace []byte `json:"rlm_trace,omitempty"` // JSONB depth 0/1 scan trace + CreatedAt time.Time `json:"created_at"` +} + +// GovernanceDecision is produced by GovernanceAgent. +type GovernanceDecision struct { + ID string `json:"id"` + IncidentID string `json:"incident_id"` + RiskScore float64 `json:"risk_score"` + Decision GovernanceDecisionType `json:"decision"` + RiskFactors []string `json:"risk_factors"` + CreatedAt time.Time `json:"created_at"` +} + +// AgentLog is a single step event emitted during pipeline execution. +type AgentLog struct { + ID string `json:"id"` + IncidentID string `json:"incident_id"` + StepName string `json:"step_name"` + Status string `json:"status"` // running | done | error + Detail string `json:"detail"` + CreatedAt time.Time `json:"created_at"` +} + +// VaultEntry mirrors the flat-file vault JSON schema for the UI. +// No chromadb_id — keyed by failure_signature only. +type VaultEntry struct { + ID string `json:"id"` + FailureSignature string `json:"failure_signature"` + FailureType *string `json:"failure_type"` + FixDescription *string `json:"fix_description"` + Source string `json:"source"` // human | synthetic + Confidence float64 `json:"confidence"` + RetrievalCount int `json:"retrieval_count"` + SuccessCount int `json:"success_count"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +// VaultStats aggregates vault summary metrics. 
+type VaultStats struct { + Total int `json:"total"` + HumanCount int `json:"human_count"` + SyntheticCount int `json:"synthetic_count"` + AvgConfidence *float64 `json:"avg_confidence"` +} + +// MetricsSummary provides the dashboard headline figures. +type MetricsSummary struct { + TotalIncidents int `json:"total_incidents"` + ResolvedCount int `json:"resolved_count"` + VaultSize int `json:"vault_size"` + AvgConfidence *float64 `json:"avg_confidence"` +} + +// --- Request / response DTOs --- + +type SimulateRequest struct { + Scenario string `json:"scenario" binding:"required"` +} + +type ApprovalRequest struct { + ReviewedBy string `json:"reviewed_by"` + Notes *string `json:"notes"` +} + +type GitHubWebhookPayload struct { + Action *string `json:"action"` + WorkflowRun *WorkflowRun `json:"workflow_run"` + Repository map[string]any `json:"repository"` +} + +type WorkflowRun struct { + Name string `json:"name"` + Conclusion string `json:"conclusion"` + HTMLURL string `json:"html_url"` +} + +type GitLabWebhookPayload struct { + ObjectKind string `json:"object_kind"` + Status string `json:"status"` +} + +// SandboxResult holds the outcome of the Minikube sandbox validation. 
+type SandboxResult struct { + IncidentID string `json:"incident_id"` + Passed bool `json:"passed"` + TestCount int `json:"test_count"` + FailureCount int `json:"failure_count"` + TestLog string `json:"test_log"` + PREvidence string `json:"pr_evidence"` + Namespace string `json:"namespace"` + DurationSeconds float64 `json:"duration_seconds"` + ValKeyDeployed bool `json:"valkey_deployed"` + DemoMode bool `json:"demo_mode"` +} + +type IncidentDetail struct { + Incident *Incident `json:"incident"` + DiagnosticBundle *DiagnosticBundle `json:"diagnostic_bundle"` + FixProposal *FixProposal `json:"fix_proposal"` + GovernanceDecision *GovernanceDecision `json:"governance_decision"` + SandboxResult *SandboxResult `json:"sandbox_result"` + AgentLogs []AgentLog `json:"agent_logs"` +} diff --git a/backend/internal/sse/broker.go b/backend/internal/sse/broker.go new file mode 100644 index 000000000..eea90bdd6 --- /dev/null +++ b/backend/internal/sse/broker.go @@ -0,0 +1,90 @@ +// Package sse provides a fan-out SSE event broker for streaming agent logs. +// Each incident gets its own channel. Multiple frontend clients can subscribe +// to the same incident by holding separate channels added to the same key. +package sse + +import ( + "encoding/json" + "sync" +) + +// Event is the payload sent over the SSE wire. +type Event struct { + Type string `json:"type"` // agent_log | status | done + Data any `json:"data"` +} + +// Broker manages per-incident subscriber channels. +type Broker struct { + mu sync.RWMutex + clients map[string][]chan Event +} + +// NewBroker constructs a ready-to-use Broker. +func NewBroker() *Broker { + return &Broker{ + clients: make(map[string][]chan Event), + } +} + +// Subscribe returns a channel that will receive events for the given incidentID. +// The channel is buffered; slow consumers silently drop events. 
+func (b *Broker) Subscribe(incidentID string) chan Event {
+	// Buffer of 128 lets Publish stay non-blocking for reasonably slow readers.
+	ch := make(chan Event, 128)
+	b.mu.Lock()
+	b.clients[incidentID] = append(b.clients[incidentID], ch)
+	b.mu.Unlock()
+	return ch
+}
+
+// Unsubscribe removes a channel from the subscriber list and closes it.
+// The close only happens when the channel is actually found in the list, so a
+// second Unsubscribe with the same channel (or one that was never subscribed)
+// no longer panics with "close of closed channel".
+func (b *Broker) Unsubscribe(incidentID string, ch chan Event) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+
+	list := b.clients[incidentID]
+	found := false
+	newList := list[:0] // filter in place, reusing the backing array
+	for _, c := range list {
+		if c != ch {
+			newList = append(newList, c)
+		} else {
+			found = true
+		}
+	}
+	if len(newList) == 0 {
+		delete(b.clients, incidentID)
+	} else {
+		b.clients[incidentID] = newList
+	}
+	if found {
+		close(ch)
+	}
+}
+
+// Publish sends an event to all subscribers of incidentID.
+// Non-blocking: if a subscriber's buffer is full the event is dropped for that client.
+func (b *Broker) Publish(incidentID string, ev Event) {
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+
+	for _, ch := range b.clients[incidentID] {
+		select {
+		case ch <- ev:
+		default:
+			// Slow consumer: drop rather than block the publisher.
+		}
+	}
+}
+
+// PublishDone signals that the pipeline for incidentID is complete.
+// After this call, callers should drain and unsubscribe all channels.
+func (b *Broker) PublishDone(incidentID string) {
+	b.Publish(incidentID, Event{Type: "done", Data: map[string]string{}})
+}
+
+// SubscriberCount returns the number of active subscribers for debugging.
+func (b *Broker) SubscriberCount(incidentID string) int {
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+	return len(b.clients[incidentID])
+}
+
+// Marshal returns the JSON-encoded SSE data line for an event.
+func (e Event) Marshal() ([]byte, error) { + return json.Marshal(e) +} diff --git a/backend/internal/sse/broker_test.go b/backend/internal/sse/broker_test.go new file mode 100644 index 000000000..010af110f --- /dev/null +++ b/backend/internal/sse/broker_test.go @@ -0,0 +1,107 @@ +package sse_test + +import ( + "testing" + "time" + + "github.com/rekall/backend/internal/sse" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBroker_SubscribeReceivesPublishedEvents(t *testing.T) { + b := sse.NewBroker() + const incidentID = "incident-001" + + ch := b.Subscribe(incidentID) + defer b.Unsubscribe(incidentID, ch) + + ev := sse.Event{Type: "agent_log", Data: map[string]string{"step": "monitor"}} + b.Publish(incidentID, ev) + + select { + case got := <-ch: + assert.Equal(t, "agent_log", got.Type) + case <-time.After(100 * time.Millisecond): + t.Fatal("timed out waiting for event") + } +} + +func TestBroker_MultipleSubscribersReceiveSameEvent(t *testing.T) { + b := sse.NewBroker() + const incidentID = "incident-002" + + ch1 := b.Subscribe(incidentID) + ch2 := b.Subscribe(incidentID) + defer b.Unsubscribe(incidentID, ch1) + defer b.Unsubscribe(incidentID, ch2) + + b.Publish(incidentID, sse.Event{Type: "status", Data: "ok"}) + + for _, ch := range []chan sse.Event{ch1, ch2} { + select { + case got := <-ch: + assert.Equal(t, "status", got.Type) + case <-time.After(100 * time.Millisecond): + t.Fatal("subscriber did not receive event") + } + } +} + +func TestBroker_UnsubscribeRemovesChannel(t *testing.T) { + b := sse.NewBroker() + const incidentID = "incident-003" + + ch := b.Subscribe(incidentID) + assert.Equal(t, 1, b.SubscriberCount(incidentID)) + + b.Unsubscribe(incidentID, ch) + assert.Equal(t, 0, b.SubscriberCount(incidentID)) +} + +func TestBroker_DifferentIncidentsAreIsolated(t *testing.T) { + b := sse.NewBroker() + + ch1 := b.Subscribe("inc-A") + ch2 := b.Subscribe("inc-B") + defer b.Unsubscribe("inc-A", ch1) + defer 
b.Unsubscribe("inc-B", ch2) + + b.Publish("inc-A", sse.Event{Type: "agent_log", Data: "A"}) + + select { + case <-ch1: + // correct: ch1 received inc-A's event + case <-time.After(100 * time.Millisecond): + t.Fatal("ch1 did not receive event for inc-A") + } + + select { + case ev := <-ch2: + t.Fatalf("ch2 unexpectedly received event: %v", ev) + case <-time.After(50 * time.Millisecond): + // correct: ch2 received nothing + } +} + +func TestBroker_PublishDoneSentinel(t *testing.T) { + b := sse.NewBroker() + ch := b.Subscribe("inc-done") + defer b.Unsubscribe("inc-done", ch) + + b.PublishDone("inc-done") + + select { + case got := <-ch: + assert.Equal(t, "done", got.Type) + case <-time.After(100 * time.Millisecond): + t.Fatal("did not receive done event") + } +} + +func TestEvent_Marshal(t *testing.T) { + ev := sse.Event{Type: "agent_log", Data: map[string]string{"step": "fix"}} + b, err := ev.Marshal() + require.NoError(t, err) + assert.Contains(t, string(b), "agent_log") +} diff --git a/backend/internal/store/store.go b/backend/internal/store/store.go new file mode 100644 index 000000000..68dc5f698 --- /dev/null +++ b/backend/internal/store/store.go @@ -0,0 +1,340 @@ +// Package store provides an in-memory data store for incidents, agent logs, +// fix proposals, and governance decisions. It replaces the PostgreSQL backend +// entirely — no database required. Data is keyed by incident UUID and held in +// memory for the lifetime of the process. +// +// Vault entries are NOT stored here; they are read directly from the flat-file +// vault on disk by the vault package. +package store + +import ( + "context" + "encoding/json" + "errors" + "os" + "path/filepath" + "sort" + "sync" + "time" + + "github.com/google/uuid" + "github.com/rekall/backend/internal/models" +) + +// ErrNotFound is returned when a requested record does not exist. +var ErrNotFound = errors.New("not found") + +// incidentRecord is the full in-memory record for one incident. 
+type incidentRecord struct { + incident *models.Incident + bundle *models.DiagnosticBundle + fix *models.FixProposal + governance *models.GovernanceDecision + sandbox *models.SandboxResult + logs []models.AgentLog +} + +var ( + mu sync.RWMutex + records = map[string]*incidentRecord{} +) + +// SerializableRecord is used for JSON file persistence. +type SerializableRecord struct { + Incident *models.Incident `json:"incident"` + Bundle *models.DiagnosticBundle `json:"bundle"` + Fix *models.FixProposal `json:"fix"` + Governance *models.GovernanceDecision `json:"governance"` + Sandbox *models.SandboxResult `json:"sandbox"` + Logs []models.AgentLog `json:"logs"` +} + +// Load reads incidents from incidents.json in the vault. +func Load(vaultPath string) error { + bytes, err := os.ReadFile(filepath.Join(vaultPath, "incidents.json")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + var data map[string]SerializableRecord + if err := json.Unmarshal(bytes, &data); err != nil { + return err + } + mu.Lock() + defer mu.Unlock() + for k, v := range data { + records[k] = &incidentRecord{ + incident: v.Incident, + bundle: v.Bundle, + fix: v.Fix, + governance: v.Governance, + sandbox: v.Sandbox, + logs: v.Logs, + } + } + return nil +} + +// Save writes incidents to incidents.json in the vault. +func Save(vaultPath string) error { + mu.RLock() + defer mu.RUnlock() + data := make(map[string]SerializableRecord, len(records)) + for k, v := range records { + data[k] = SerializableRecord{ + Incident: v.incident, + Bundle: v.bundle, + Fix: v.fix, + Governance: v.governance, + Sandbox: v.sandbox, + Logs: v.logs, + } + } + bytes, err := json.MarshalIndent(data, "", " ") + if err != nil { + return err + } + return os.WriteFile(filepath.Join(vaultPath, "incidents.json"), bytes, 0644) +} + +// ── Incidents ────────────────────────────────────────────────────────────────── + +// CreateIncident creates a new incident and stores it in memory. 
+func CreateIncident(_ context.Context, source, failureType string, payload map[string]any) (*models.Incident, error) { + inc := &models.Incident{ + ID: uuid.NewString(), + Source: source, + FailureType: failureType, + RawPayload: payload, + Status: models.StatusProcessing, + CreatedAt: time.Now().UTC(), + UpdatedAt: time.Now().UTC(), + } + mu.Lock() + records[inc.ID] = &incidentRecord{incident: inc, logs: []models.AgentLog{}} + mu.Unlock() + return inc, nil +} + +// GetIncident fetches one incident by ID. +func GetIncident(_ context.Context, id string) (*models.Incident, error) { + mu.RLock() + rec, ok := records[id] + mu.RUnlock() + if !ok { + return nil, nil // return nil, nil (not found) — matches old DB behaviour + } + return rec.incident, nil +} + +// ListIncidents returns incidents sorted newest-first with limit/offset. +func ListIncidents(_ context.Context, limit, offset int) ([]*models.Incident, error) { + mu.RLock() + all := make([]*models.Incident, 0, len(records)) + for _, r := range records { + all = append(all, r.incident) + } + mu.RUnlock() + + sort.Slice(all, func(i, j int) bool { + return all[i].CreatedAt.After(all[j].CreatedAt) + }) + + if offset >= len(all) { + return []*models.Incident{}, nil + } + end := offset + limit + if end > len(all) { + end = len(all) + } + return all[offset:end], nil +} + +// UpdateIncidentStatus sets the status of an existing incident. +func UpdateIncidentStatus(_ context.Context, id string, status models.IncidentStatus) error { + mu.Lock() + defer mu.Unlock() + rec, ok := records[id] + if !ok { + return ErrNotFound + } + rec.incident.Status = status + rec.incident.UpdatedAt = time.Now().UTC() + return nil +} + +// ── Agent Logs ──────────────────────────────────────────────────────────────── + +// AppendAgentLog appends a pipeline step log entry for an incident. 
+func AppendAgentLog(_ context.Context, incidentID, stepName, status, detail string) (*models.AgentLog, error) { + entry := models.AgentLog{ + ID: uuid.NewString(), + IncidentID: incidentID, + StepName: stepName, + Status: status, + Detail: detail, + CreatedAt: time.Now().UTC(), + } + mu.Lock() + rec, ok := records[incidentID] + if !ok { + // Auto-create a minimal record so logs work even during races + rec = &incidentRecord{ + incident: &models.Incident{ID: incidentID, Status: models.StatusProcessing, CreatedAt: time.Now().UTC(), UpdatedAt: time.Now().UTC()}, + logs: []models.AgentLog{}, + } + records[incidentID] = rec + } + rec.logs = append(rec.logs, entry) + mu.Unlock() + return &entry, nil +} + +// GetAgentLogs returns all log entries for an incident. +func GetAgentLogs(_ context.Context, incidentID string) ([]models.AgentLog, error) { + mu.RLock() + rec, ok := records[incidentID] + mu.RUnlock() + if !ok { + return []models.AgentLog{}, nil + } + out := make([]models.AgentLog, len(rec.logs)) + copy(out, rec.logs) + return out, nil +} + +// ── Fix Proposals ───────────────────────────────────────────────────────────── + +// UpsertFixProposal stores (or replaces) the fix proposal for an incident. +func UpsertFixProposal(_ context.Context, fix *models.FixProposal) error { + if fix.ID == "" { + fix.ID = uuid.NewString() + } + fix.CreatedAt = time.Now().UTC() + mu.Lock() + defer mu.Unlock() + rec, ok := records[fix.IncidentID] + if !ok { + return ErrNotFound + } + rec.fix = fix + return nil +} + +// GetLatestFixProposal returns the stored fix proposal for an incident. +func GetLatestFixProposal(_ context.Context, incidentID string) (*models.FixProposal, error) { + mu.RLock() + rec, ok := records[incidentID] + mu.RUnlock() + if !ok { + return nil, nil + } + return rec.fix, nil +} + +// ── Governance Decisions ────────────────────────────────────────────────────── + +// UpsertGovernanceDecision stores the governance decision for an incident. 
+func UpsertGovernanceDecision(_ context.Context, gov *models.GovernanceDecision) error { + if gov.ID == "" { + gov.ID = uuid.NewString() + } + gov.CreatedAt = time.Now().UTC() + mu.Lock() + defer mu.Unlock() + rec, ok := records[gov.IncidentID] + if !ok { + return ErrNotFound + } + rec.governance = gov + return nil +} + +// GetLatestGovernanceDecision returns the governance decision for an incident. +func GetLatestGovernanceDecision(_ context.Context, incidentID string) (*models.GovernanceDecision, error) { + mu.RLock() + rec, ok := records[incidentID] + mu.RUnlock() + if !ok { + return nil, nil + } + return rec.governance, nil +} + +// ── Diagnostic Bundles ──────────────────────────────────────────────────────── + +// UpsertDiagnosticBundle stores the diagnostic bundle for an incident. +func UpsertDiagnosticBundle(_ context.Context, b *models.DiagnosticBundle) error { + if b.ID == "" { + b.ID = uuid.NewString() + } + b.CreatedAt = time.Now().UTC() + mu.Lock() + defer mu.Unlock() + rec, ok := records[b.IncidentID] + if !ok { + return ErrNotFound + } + rec.bundle = b + return nil +} + +// GetDiagnosticBundle returns the diagnostic bundle for an incident. +func GetDiagnosticBundle(_ context.Context, incidentID string) (*models.DiagnosticBundle, error) { + mu.RLock() + rec, ok := records[incidentID] + mu.RUnlock() + if !ok { + return nil, nil + } + return rec.bundle, nil +} + +// ── Sandbox Results ─────────────────────────────────────────────────────────── + +// UpsertSandboxResult stores the Minikube sandbox result for an incident. +func UpsertSandboxResult(_ context.Context, r *models.SandboxResult) error { + mu.Lock() + defer mu.Unlock() + rec, ok := records[r.IncidentID] + if !ok { + return ErrNotFound + } + rec.sandbox = r + return nil +} + +// GetSandboxResult returns the sandbox result for an incident (nil if not run). 
+func GetSandboxResult(_ context.Context, incidentID string) (*models.SandboxResult, error) { + mu.RLock() + rec, ok := records[incidentID] + mu.RUnlock() + if !ok { + return nil, nil + } + return rec.sandbox, nil +} + +// ── Metrics ─────────────────────────────────────────────────────────────────── + +// GetMetricsSummary computes live metrics from in-memory records. +func GetMetricsSummary(_ context.Context, vaultSize int, avgConfidence *float64) (*models.MetricsSummary, error) { + mu.RLock() + total := len(records) + resolved := 0 + for _, r := range records { + if r.incident.Status == models.StatusResolved { + resolved++ + } + } + mu.RUnlock() + + return &models.MetricsSummary{ + TotalIncidents: total, + ResolvedCount: resolved, + VaultSize: vaultSize, + AvgConfidence: avgConfidence, + }, nil +} diff --git a/backend/internal/vault/reader.go b/backend/internal/vault/reader.go new file mode 100644 index 000000000..a1bf573fa --- /dev/null +++ b/backend/internal/vault/reader.go @@ -0,0 +1,222 @@ +// Package vault provides file-based reading of the REKALL flat-file vault. +// It is the Go side of the flat-file vault — read-only for the backend. +// All writes are done by the Python engine (rekall_engine/vault/store.py). +package vault + +// ListEpisodes reads the last `limit` RL episodes from vault/episodes.json. + +import ( + "encoding/json" + "log" + "math" + "os" + "path/filepath" + "sort" + "strconv" + "sync" + "time" + + "github.com/rekall/backend/internal/models" +) + +var ( + vaultPath string + once sync.Once +) + +// Init sets the vault root directory. Call once at startup. +func Init(path string) { + once.Do(func() { vaultPath = path }) +} + +// listEntries reads all *.json files from a scope directory. 
+func listEntries(scope string) ([]*models.VaultEntry, error) { + dir := filepath.Join(vaultPath, scope) + entries, err := os.ReadDir(dir) + if err != nil { + if os.IsNotExist(err) { + return []*models.VaultEntry{}, nil + } + return nil, err + } + + var result []*models.VaultEntry + for _, e := range entries { + if e.IsDir() || filepath.Ext(e.Name()) != ".json" { + continue + } + data, err := os.ReadFile(filepath.Join(dir, e.Name())) + if err != nil { + log.Printf("[vault] read error %s: %v", e.Name(), err) + continue + } + var raw map[string]any + if err := json.Unmarshal(data, &raw); err != nil { + log.Printf("[vault] parse error %s: %v", e.Name(), err) + continue + } + entry := mapToVaultEntry(raw) + if entry != nil { + result = append(result, entry) + } + } + return result, nil +} + +// ListAll returns all vault entries from local (and optionally org) sorted by confidence DESC. +func ListAll(source *string, limit, offset int) ([]*models.VaultEntry, error) { + local, err := listEntries("local") + if err != nil { + return nil, err + } + org, _ := listEntries("org") // org vault is optional + all := append(local, org...) + + // Filter by source if requested + if source != nil && *source != "" { + var filtered []*models.VaultEntry + for _, e := range all { + if e.Source == *source { + filtered = append(filtered, e) + } + } + all = filtered + } + + sort.Slice(all, func(i, j int) bool { + return all[i].Confidence > all[j].Confidence + }) + + if offset >= len(all) { + return []*models.VaultEntry{}, nil + } + end := offset + limit + if end > len(all) { + end = len(all) + } + return all[offset:end], nil +} + +// Stats computes aggregate vault statistics. 
+func Stats() (*models.VaultStats, error) { + entries, err := ListAll(nil, math.MaxInt32, 0) + if err != nil { + return nil, err + } + + humanCount := 0 + synthCount := 0 + totalConf := 0.0 + + for _, e := range entries { + if e.Source == "human" { + humanCount++ + } else { + synthCount++ + } + totalConf += e.Confidence + } + + var avgConf *float64 + if len(entries) > 0 { + v := totalConf / float64(len(entries)) + avgConf = &v + } + + return &models.VaultStats{ + Total: len(entries), + HumanCount: humanCount, + SyntheticCount: synthCount, + AvgConfidence: avgConf, + }, nil +} + +// mapToVaultEntry converts a raw JSON map to a VaultEntry. +func mapToVaultEntry(raw map[string]any) *models.VaultEntry { + id, _ := raw["id"].(string) + if id == "" { + return nil + } + sig, _ := raw["failure_signature"].(string) + ft, _ := raw["failure_type"].(string) + desc, _ := raw["fix_description"].(string) + src, _ := raw["source"].(string) + if src == "" { + src = "human" + } + + conf := toFloat(raw["confidence"]) + retrieval := toInt(raw["retrieval_count"]) + success := toInt(raw["success_count"]) + + createdAt := time.Now() + updatedAt := time.Now() + if s, ok := raw["created_at"].(string); ok { + if t, err := time.Parse(time.RFC3339, s); err == nil { + createdAt = t + } + } + if s, ok := raw["updated_at"].(string); ok { + if t, err := time.Parse(time.RFC3339, s); err == nil { + updatedAt = t + } + } + + return &models.VaultEntry{ + ID: id, + FailureSignature: sig, + FailureType: &ft, + FixDescription: &desc, + Source: src, + Confidence: conf, + RetrievalCount: retrieval, + SuccessCount: success, + CreatedAt: createdAt, + UpdatedAt: updatedAt, + } +} + +func toFloat(v any) float64 { + switch x := v.(type) { + case float64: + return x + case string: + f, _ := strconv.ParseFloat(x, 64) + return f + } + return 0 +} + +func toInt(v any) int { + switch x := v.(type) { + case float64: + return int(x) + case int: + return x + } + return 0 +} + +// ListEpisodes reads the last `limit` RL 
episodes from vault/episodes.json. +// Returns an empty slice if the file doesn't exist yet. +func ListEpisodes(limit int) ([]map[string]any, error) { + path := filepath.Join(vaultPath, "episodes.json") + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return []map[string]any{}, nil + } + return nil, err + } + + var all []map[string]any + if err := json.Unmarshal(data, &all); err != nil { + return nil, err + } + + // Return the last `limit` episodes (most recent) + if limit > 0 && len(all) > limit { + all = all[len(all)-limit:] + } + return all, nil +} diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 000000000..d5388b73b --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,97 @@ +# ───────────────────────────────────────────────────────────────────────────── +# REKALL — Full-stack docker-compose (DB-free) +# +# Services (3 only — no postgres, no chromadb): +# backend — Go (Gin) API server +# engine — Python FastAPI (AI agent + RLM orchestration) +# frontend — Next.js 15 +# +# Shared vault volume: +# engine writes JSON to /app/vault/ +# backend reads JSON from /app/vault/ (flat-file reader) +# ───────────────────────────────────────────────────────────────────────────── + +services: + # ── Python engine service ────────────────────────────────────────────────── + engine: + build: + context: . 
+ dockerfile: engine/Dockerfile + restart: unless-stopped + ports: + - "8002:8002" + environment: + GROQ_API_KEY: ${GROQ_API_KEY} + GROQ_API_KEYS: ${GROQ_API_KEYS:-} + GROQ_API_KEY_2: ${GROQ_API_KEY_2:-} + GROQ_API_KEY_3: ${GROQ_API_KEY_3:-} + GROQ_API_KEY_4: ${GROQ_API_KEY_4:-} + GROQ_API_KEY_5: ${GROQ_API_KEY_5:-} + GO_BACKEND_URL: http://backend:8000 + VAULT_PATH: /app/vault + LOG_LEVEL: INFO + # GitHub integration — required for real CI failure fetch + PR creation + GITHUB_TOKEN: ${GITHUB_TOKEN} + GITHUB_REPO: ${GITHUB_REPO} + GITHUB_LIVE_PR: ${GITHUB_LIVE_PR:-true} + # Optional integrations + INTEGRATIONS_ENABLED: ${INTEGRATIONS_ENABLED} + SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL} + NOTION_TOKEN: ${NOTION_TOKEN} + NOTION_DATABASE_ID: ${NOTION_DATABASE_ID} + # Minikube sandbox (optional — requires minikube + kubectl on PATH) + SANDBOX_ENABLED: ${SANDBOX_ENABLED:-false} + MINIKUBE_PROFILE: ${MINIKUBE_PROFILE:-rekall} + MINIKUBE_CPUS: ${MINIKUBE_CPUS:-4} + MINIKUBE_MEMORY: ${MINIKUBE_MEMORY:-8192} + SANDBOX_TIMEOUT: ${SANDBOX_TIMEOUT:-300} + SANDBOX_VALKEY_IMAGE: ${SANDBOX_VALKEY_IMAGE:-valkey/valkey:7.2-alpine} + volumes: + - ./vault:/app/vault + healthcheck: + test: [ "CMD-SHELL", "python3 -c \"import urllib.request; urllib.request.urlopen('http://localhost:8002/health')\"" ] + interval: 15s + timeout: 5s + retries: 3 + start_period: 20s + + # ── Go backend ──────────────────────────────────────────────────────────── + backend: + build: + context: backend + dockerfile: Dockerfile + restart: unless-stopped + ports: + - "8000:8000" + environment: + ENGINE_URL: http://engine:8002 + VAULT_PATH: /app/vault + CORS_ORIGINS: http://localhost:3000 + GIN_MODE: release + PORT: "8000" + volumes: + - ./vault:/app/vault:ro # read-only: engine owns writes + depends_on: + engine: + condition: service_healthy + healthcheck: + test: [ "CMD", "wget", "-qO-", "http://localhost:8000/health" ] + interval: 10s + timeout: 5s + retries: 3 + start_period: 5s + + # ── Next.js frontend 
────────────────────────────────────────────────────── + frontend: + build: + context: frontend + dockerfile: Dockerfile + restart: unless-stopped + ports: + - "3000:3000" + environment: + NEXT_PUBLIC_API_URL: http://localhost:8000 + NODE_ENV: production + depends_on: + backend: + condition: service_healthy diff --git a/engine/Dockerfile b/engine/Dockerfile new file mode 100644 index 000000000..38f4b329b --- /dev/null +++ b/engine/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.12-slim + +WORKDIR /app +ENV PYTHONPATH=/app + +RUN pip install --no-cache-dir uv + +# Build context is the repo root (set in docker-compose.yml) +COPY engine/requirements.txt . +# Install CPU-only torch first (avoids 2GB+ of NVIDIA CUDA libs) +RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --system --no-cache -r requirements.txt + +# Copy engine service and the rekall_engine module +COPY engine/ /app/engine/ +COPY rekall_engine/ /app/rekall_engine/ + +WORKDIR /app/engine + +EXPOSE 8002 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8002"] diff --git a/engine/main.py b/engine/main.py new file mode 100644 index 000000000..789b53507 --- /dev/null +++ b/engine/main.py @@ -0,0 +1,779 @@ +""" +REKALL Engine Service — Python FastAPI microservice. + +This service wraps `rekall_engine` and exposes two endpoints for the Go backend: + POST /pipeline/run — start async pipeline for an incident + POST /pipeline/learn — submit outcome for LearningAgent + GET /health — liveness probe + +The Go backend calls these endpoints; the engine service runs the AI agent +graph (LangGraph) asynchronously and can notify the Go backend via a callback +URL when agent log events are emitted. 
+""" + +from __future__ import annotations + +import asyncio +import logging +import os +from contextlib import asynccontextmanager +from typing import Any, Dict, Optional + +import httpx +from fastapi import FastAPI, BackgroundTasks, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel +from pydantic_settings import BaseSettings + +# ───────────────────────────────────────────── +# Module-level reusable HTTP client (Fix #1) +# ───────────────────────────────────────────── +_http_client: Optional[httpx.AsyncClient] = None + + + +# ───────────────────────────────────────────── +# Config +# ───────────────────────────────────────────── + +class Settings(BaseSettings): + groq_api_key: str = "" + go_backend_url: str = "http://localhost:8000" # callback target + vault_path: str = "vault" # flat-file vault directory + log_level: str = "INFO" + + class Config: + env_file = "../../.env" + env_file_encoding = "utf-8" + extra = "ignore" + + +settings = Settings() +logging.basicConfig(level=settings.log_level) +log = logging.getLogger("rekall.engine") + + +# ───────────────────────────────────────────── +# App +# ───────────────────────────────────────────── + +@asynccontextmanager +async def lifespan(app: FastAPI): + global _http_client + _http_client = httpx.AsyncClient(timeout=10.0) + log.info("Engine service starting up") + yield + log.info("Engine service shutting down") + await _http_client.aclose() + _http_client = None + + +app = FastAPI( + title="REKALL Engine Service", + description="AI agent pipeline — LangGraph + vault + RL", + version="1.0.0", + lifespan=lifespan, +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + + +# ───────────────────────────────────────────── +# Request / response models +# ───────────────────────────────────────────── + +class PipelineRunRequest(BaseModel): + incident_id: str + payload: Dict[str, Any] + + +class 
PipelineLearnRequest(BaseModel): + incident_id: str + fix_proposal_id: str + result: str # success | failure | rejected + reviewed_by: str = "human" + notes: Optional[str] = None + fix_tier: Optional[str] = None # T1_human | T2_synthetic | T3_llm + vault_entry_id: Optional[str] = None # vault entry that was selected + + +class CreatePRRequest(BaseModel): + incident_id: str + fix_commands: list = [] + fix_description: str = "" + fix_tier: str = "T3_llm" + fix_diff: Optional[str] = None + + +class PipelineResponse(BaseModel): + ok: bool + message: str = "" + + +# ───────────────────────────────────────────── +# Endpoints +# ───────────────────────────────────────────── + +@app.get("/health") +async def health(): + return {"ok": True, "service": "rekall-engine"} + + +@app.post("/pipeline/run", response_model=PipelineResponse) +async def run_pipeline(req: PipelineRunRequest, background_tasks: BackgroundTasks): + """ + Start the agent pipeline for an incident. + Returns immediately; work runs in the background. + """ + background_tasks.add_task(_run_pipeline_async, req.incident_id, req.payload) + return PipelineResponse(ok=True, message="pipeline started") + + +class FetchFromGitHubRequest(BaseModel): + incident_id: str + repo: Optional[str] = None # e.g. "abjt01/sample-ci-sad" — defaults to GITHUB_REPO env + + +@app.post("/pipeline/run-from-github", response_model=PipelineResponse) +async def run_from_github(req: FetchFromGitHubRequest, background_tasks: BackgroundTasks): + """ + Fetch the latest failed GitHub Actions run from the given repo (or GITHUB_REPO env), + extract its logs, and run the full real agent pipeline to diagnose and fix it. 
+ """ + background_tasks.add_task(_fetch_and_run_pipeline, req.incident_id, req.repo) + return PipelineResponse(ok=True, message="fetching github ci failure and running pipeline") + + +@app.post("/pipeline/learn", response_model=PipelineResponse) +async def learn(req: PipelineLearnRequest): + """ + Submit an outcome so LearningAgent can update vault confidence. + """ + try: + await _run_learning( + req.incident_id, req.fix_proposal_id, req.result, + req.reviewed_by, req.notes, req.fix_tier, req.vault_entry_id, + ) + return PipelineResponse(ok=True, message="learning complete") + except NotImplementedError: + # rekall_engine agents are placeholders — acknowledged gracefully + return PipelineResponse(ok=True, message="learning placeholder (engine not yet implemented)") + except Exception as exc: + log.exception("learning failed: %s", exc) + raise HTTPException(status_code=500, detail=str(exc)) + + +@app.post("/pipeline/create-pr", response_model=PipelineResponse) +async def create_pr(req: CreatePRRequest, background_tasks: BackgroundTasks): + """ + Open a real GitHub PR using the approved fix proposal. + Called by the Go backend when a human approves a block_await_human incident. + Runs asynchronously and posts back the PR URL via the engine-callback. + """ + background_tasks.add_task( + _create_pr_async, + req.incident_id, + req.fix_commands, + req.fix_description, + req.fix_tier, + req.fix_diff, + ) + return PipelineResponse(ok=True, message="pr creation started") + + +# ───────────────────────────────────────────── +# Pipeline execution +# ───────────────────────────────────────────── + +async def _fetch_and_run_pipeline(incident_id: str, repo_name: Optional[str]) -> None: + """ + Fetch the latest failed GitHub Actions workflow run from the configured repo, + extract the real failure logs, and run the full agent pipeline against them. + This is the REAL path — no simulated data, no emulation. 
+ """ + github_token = os.getenv("GITHUB_TOKEN", "") + repo_slug = repo_name or os.getenv("GITHUB_REPO", "") + + if not github_token or not repo_slug: + await _post_callback(incident_id, {"type": "agent_log", "data": { + "incident_id": incident_id, "step_name": "monitor", "status": "error", + "detail": "GITHUB_TOKEN or GITHUB_REPO not configured — cannot fetch real CI failures", + }}) + await _post_callback(incident_id, {"type": "status", "data": {"incident_id": incident_id, "status": "failed"}}) + return + + await _post_callback(incident_id, {"type": "agent_log", "data": { + "incident_id": incident_id, "step_name": "monitor", "status": "running", + "detail": f"Connecting to GitHub → {repo_slug}", + }}) + + try: + import zipfile, io + try: + from github import Github # type: ignore + except ImportError: + await _post_callback(incident_id, {"type": "agent_log", "data": { + "incident_id": incident_id, "step_name": "monitor", "status": "error", + "detail": "PyGithub not installed — cannot fetch real CI failures", + }}) + return + + loop = asyncio.get_running_loop() + + def gh_fetch(): + g = Github(github_token) + repo = g.get_repo(repo_slug) + + # Find the most recent failed workflow run (any branch) + runs = repo.get_workflow_runs(status="failure") + run = None + for r in runs: + run = r + break + + if run is None: + return None, None, None, None + + # Download the log zip archive + import urllib.request, urllib.error + logs_url = ( + f"https://api.github.com/repos/{repo_slug}/actions/runs/{run.id}/logs" + ) + req = urllib.request.Request(logs_url, headers={ + "Authorization": f"Bearer {github_token}", + "Accept": "application/vnd.github+json", + }) + log_text = "" + try: + with urllib.request.urlopen(req, timeout=30) as resp: + zdata = resp.read() + zf = zipfile.ZipFile(io.BytesIO(zdata)) + parts = [] + for name in sorted(zf.namelist())[:10]: # first 10 step logs + parts.append(f"=== {name} ===\n{zf.read(name).decode('utf-8', errors='replace')[:3000]}") + log_text = 
"\n\n".join(parts)[:12000] + except Exception as e: + log_text = f"[Could not download logs: {e}]" + + return run, log_text, repo.default_branch, run.head_commit.sha if run.head_commit else "" + + run, log_text, default_branch, commit_sha = await loop.run_in_executor(None, gh_fetch) + + if run is None: + await _post_callback(incident_id, {"type": "agent_log", "data": { + "incident_id": incident_id, "step_name": "monitor", "status": "error", + "detail": f"No failed workflow runs found in {repo_slug}", + }}) + await _post_callback(incident_id, {"type": "status", "data": {"incident_id": incident_id, "status": "failed"}}) + return + + await _post_callback(incident_id, {"type": "agent_log", "data": { + "incident_id": incident_id, "step_name": "monitor", "status": "done", + "detail": f"Found failed run #{run.run_number}: '{run.name}' on {run.head_branch}", + }}) + + # Build a real incident payload with actual GitHub data + payload = { + "source": "github_actions", + "failure_type": "unknown", # DiagnosticAgent will classify from logs + "description": f"GitHub Actions failure: {run.name} (run #{run.run_number})", + "log_excerpt": log_text, + "git_diff": None, + "branch": run.head_branch, + "commit_sha": commit_sha, + "workflow_url": run.html_url, + "repo": repo_slug, + } + + except Exception as exc: + log.exception("GitHub fetch failed: %s", exc) + await _post_callback(incident_id, {"type": "agent_log", "data": { + "incident_id": incident_id, "step_name": "monitor", "status": "error", + "detail": f"GitHub fetch failed: {exc}", + }}) + await _post_callback(incident_id, {"type": "status", "data": {"incident_id": incident_id, "status": "failed"}}) + return + + # Delegate to the real agent pipeline — NOT the emulated fallback + await _run_pipeline_async(incident_id, payload) + + +async def _run_pipeline_async(incident_id: str, payload: Dict[str, Any]) -> None: + """ + Drive the rekall_engine pipeline and relay agent log events back to the + Go backend via its 
/internal/agent-log endpoint. + When rekall_engine agents are not yet implemented this falls back to a + stepped emulation that keeps the dashboard alive. + """ + log.info("Pipeline started for incident %s", incident_id) + + try: + # Use the real engine graph — run_pipeline returns final state dict + # and emits AgentLogEntry objects to a queue as it runs. + from rekall_engine.graph.orchestrator import run_pipeline # type: ignore + import asyncio as _asyncio + from rekall_engine.types import AgentLogEntry # type: ignore + + queue: _asyncio.Queue = _asyncio.Queue() + + # Run pipeline in background, draining the queue concurrently + pipeline_task = _asyncio.create_task( + run_pipeline(payload, incident_id, log_queue=queue) + ) + + # Drain log entries until sentinel (None) received + while True: + entry = await queue.get() + if entry is None: + break + if isinstance(entry, AgentLogEntry): + await _post_callback(incident_id, { + "type": "agent_log", + "data": { + "incident_id": entry.incident_id, + "step_name": entry.step_name, + "status": entry.status, + "detail": entry.detail, + }, + }) + + # Wait for pipeline to finish + final_state = await pipeline_task + + # ── Post sandbox_result callback if available ───────────────────── + # This covers both the sandbox-validated-PR path and the paused path + # (where sandbox failed and human review is still needed). 
+ sandbox = final_state.get("sandbox_result") + if sandbox is not None: + await _post_callback(incident_id, { + "type": "sandbox_result", + "data": { + "incident_id": incident_id, + "passed": bool(getattr(sandbox, "passed", False)), + "test_count": int(getattr(sandbox, "test_count", 0)), + "failure_count": int(getattr(sandbox, "failure_count", 0)), + "test_log": str(getattr(sandbox, "test_log", ""))[:5000], + "pr_evidence": str(getattr(sandbox, "pr_evidence", "")), + "namespace": str(getattr(sandbox, "namespace", "")), + "duration_seconds": float(getattr(sandbox, "duration_seconds", 0.0)), + "valkey_deployed": bool(getattr(sandbox, "valkey_deployed", False)), + "demo_mode": bool(getattr(sandbox, "demo_mode", False)), + }, + }) + + # ── Sandbox-validated PR path ───────────────────────────────────── + # When the sandbox passed, orchestrator set sandbox_validated_pr=True + # and did NOT set paused. We now create the PR with sandbox evidence. + if final_state.get("sandbox_validated_pr"): + fix = final_state.get("fix_proposal") + pr_evidence = str(getattr(sandbox, "pr_evidence", "")) if sandbox else "" + if fix is not None: + import asyncio as _asyncio + _asyncio.create_task(_create_pr_async( + incident_id, + list(getattr(fix, "fix_commands", []) or []), + str(getattr(fix, "fix_description", "") or ""), + str(getattr(fix, "tier", "T3_llm") or "T3_llm"), + getattr(fix, "fix_diff", None), + pr_evidence=pr_evidence, + )) + + # ── If the pipeline paused for human review, push the fix_proposal to + # ── the Go store NOW so Approve → GetLatestFixProposal finds it. 
+ elif final_state.get("paused"): + fix = final_state.get("fix_proposal") + if fix is not None: + import uuid as _uuid + await _post_callback(incident_id, { + "type": "fix_proposal", + "data": { + "id": str(_uuid.uuid4()), + "incident_id": incident_id, + "tier": str(getattr(fix, "tier", "T3_llm")), + "fix_description": str(getattr(fix, "fix_description", "") or ""), + "fix_commands": list(getattr(fix, "fix_commands", []) or []), + "fix_diff": getattr(fix, "fix_diff", None), + "vault_entry_id": getattr(fix, "vault_entry_id", None), + "confidence": float(getattr(fix, "confidence", 0.5) or 0.5), + "reasoning": str(getattr(fix, "reasoning", "") or ""), + }, + }) + + # Determine final status + gov = final_state.get("governance_decision") + if final_state.get("paused"): + final_status = "awaiting_approval" + else: + final_status = "resolved" + + await _post_callback(incident_id, { + "type": "status", + "data": { + "incident_id": incident_id, + "status": final_status, + "governance_decision": { + "risk_score": gov.risk_score if gov else 0.5, + "decision": gov.decision if gov else "block_await_human", + "risk_factors": gov.risk_factors if gov else [], + } if gov else None, + }, + }) + + except Exception as exc: + log.exception("pipeline error: %s", exc) + # Only fall back to emulation if it's clearly a missing implementation + if isinstance(exc, (NotImplementedError, ImportError)): + log.warning("rekall_engine not implemented — running emulated pipeline") + await _emulated_pipeline(incident_id, payload) + else: + # Real error — report it to the dashboard + await _post_callback(incident_id, { + "type": "agent_log", + "data": { + "incident_id": incident_id, + "step_name": "error", + "status": "error", + "detail": f"Pipeline error: {type(exc).__name__}: {exc}", + }, + }) + await _post_callback(incident_id, { + "type": "status", + "data": {"incident_id": incident_id, "status": "failed"}, + }) + + +async def _emulated_pipeline(incident_id: str, payload: Dict[str, Any]) -> None: 
+ """ + Replays a realistic step-by-step timeline to the Go backend callback + when the real engine graph is not yet implemented. + Also performs a real GitHub PR creation if GITHUB_LIVE_PR=true. + """ + steps = [ + ("monitor", "Normalising failure event payload"), + ("diagnostic", "Fetching logs, git diff, and test reports"), + ("fix", "Searching memory vault: T1 → T2 → T3 fallback"), + ("governance", "Computing risk score across 6 dimensions"), + ("publish_guard", "Supply-chain safety gate: checking commands"), + ("learning", "Slack & Notion notifications dispatched"), + ] + + for step_name, detail in steps: + for status in ("running", "done"): + await _post_callback(incident_id, { + "type": "agent_log", + "data": { + "incident_id": incident_id, + "step_name": step_name, + "status": status, + "detail": detail, + }, + }) + if status == "running": + await asyncio.sleep(1.2) + + # ── Live GitHub PR (production demo) ────────────────────────────────────── + # When GITHUB_LIVE_PR=true, open a real PR on GITHUB_REPO using the + # AI-generated fix commands (emulated here). This runs even in emulated + # pipeline mode because the real agent stubs raise NotImplementedError. 
+ github_live_pr = os.getenv("GITHUB_LIVE_PR", "false").lower() == "true" + github_token = os.getenv("GITHUB_TOKEN", "") + github_repo = os.getenv("GITHUB_REPO", "") + + pr_url: Optional[str] = None + if github_live_pr and github_token and github_repo: + try: + try: + from github import Github # type: ignore # PyGithub + except ImportError: + log.warning("[emulated_pipeline] PyGithub not installed — skipping PR") + raise RuntimeError("PyGithub not installed") + g = Github(github_token) + repo = g.get_repo(github_repo) + + branch_name = f"rekall-auto-fix-{incident_id[:8]}" + base_branch = repo.default_branch + base_sha = repo.get_branch(base_branch).commit.sha + + # Create the fix branch + repo.create_git_ref(f"refs/heads/{branch_name}", base_sha) + + # Determine scenario label for the commit message + scenario = payload.get("scenario", payload.get("failure_type", "unknown")) + + # Commit a fix script to the branch + fix_script = ( + f"#!/bin/bash\n" + f"# REKALL Auto-Fix — Incident {incident_id}\n" + f"# Scenario: {scenario}\n" + f"# Generated: by REKALL AI Agent pipeline\n\n" + f"echo 'Applying REKALL recommended fix for: {scenario}'\n" + f"# TODO: replace with actual fix commands from FixAgent\n" + ) + repo.create_file( + path=f".rekall/fix-{incident_id[:8]}.sh", + message=f"fix({incident_id[:8]}): REKALL auto-fix for {scenario}", + content=fix_script.encode(), + branch=branch_name, + ) + + # Open the Pull Request + pr = repo.create_pull( + title=f"[REKALL] Auto-fix: {scenario} — incident {incident_id[:8]}", + body=( + f"## 🤖 REKALL Auto-Fix\n\n" + f"**Incident ID:** `{incident_id}`\n" + f"**Scenario:** `{scenario}`\n" + f"**Pipeline:** Emulated (AI agents returning fix commands)\n\n" + f"### What happened\n" + f"REKALL's AI pipeline detected a `{scenario}` failure, " + f"diagnosed the root cause, retrieved a fix from the memory vault, " + f"and scored governance risk as low enough to proceed.\n\n" + f"### Fix\n" + f"See `.rekall/fix-{incident_id[:8]}.sh` in this 
branch.\n\n" + f"*Auto-generated by REKALL. Please review before merging.*" + ), + head=branch_name, + base=base_branch, + ) + pr_url = pr.html_url + log.info("[emulated_pipeline] PR opened: %s", pr_url) + + await _post_callback(incident_id, { + "type": "agent_log", + "data": { + "incident_id": incident_id, + "step_name": "execute", + "status": "done", + "detail": f"Pull request opened: {pr_url}", + }, + }) + + except Exception as exc: + log.warning("[emulated_pipeline] GitHub PR creation failed: %s", exc) + await _post_callback(incident_id, { + "type": "agent_log", + "data": { + "incident_id": incident_id, + "step_name": "execute", + "status": "error", + "detail": f"PR creation failed: {exc}", + }, + }) + # ────────────────────────────────────────────────────────────────────────── + + await _post_callback(incident_id, { + "type": "status", + "data": {"incident_id": incident_id, "status": "resolved"}, + }) + + +async def _create_pr_async( + incident_id: str, + fix_commands: list, + fix_description: str, + fix_tier: str, + fix_diff: Optional[str], + pr_evidence: str = "", +) -> None: + """ + Create a real GitHub PR for a human-approved fix. + Called by POST /pipeline/create-pr (triggered from Go Approve handler). + Posts execution progress back via the engine-callback so the SSE + stream updates the dashboard in real time. 
+ """ + github_token = os.getenv("GITHUB_TOKEN", "") + github_repo = os.getenv("GITHUB_REPO", "") + github_live = os.getenv("GITHUB_LIVE_PR", "false").lower() == "true" + + await _post_callback(incident_id, { + "type": "agent_log", + "data": { + "incident_id": incident_id, + "step_name": "execute", + "status": "running", + "detail": "Human approved — opening pull request on GitHub", + }, + }) + + if not github_live or not github_token or not github_repo: + # Not configured for live PRs — emit a trace-only event + await _post_callback(incident_id, { + "type": "agent_log", + "data": { + "incident_id": incident_id, + "step_name": "execute", + "status": "done", + "detail": "PR creation skipped (GITHUB_LIVE_PR not enabled)", + }, + }) + return + + try: + try: + from github import Github # type: ignore + except ImportError: + log.error("[create_pr] PyGithub not installed") + await _post_callback(incident_id, {"type": "agent_log", "data": { + "incident_id": incident_id, "step_name": "execute", "status": "error", + "detail": "PyGithub not installed — cannot create PR", + }}) + return + + loop = asyncio.get_running_loop() + + def gh_create(): + g = Github(github_token) + repo = g.get_repo(github_repo) + + branch_name = f"rekall-fix-{incident_id[:8]}" + base_branch = repo.default_branch + base_sha = repo.get_branch(base_branch).commit.sha + + try: + repo.create_git_ref(f"refs/heads/{branch_name}", base_sha) + except Exception as branch_exc: + if "already exists" in str(branch_exc).lower() or "reference already" in str(branch_exc).lower(): + pass + else: + log.warning("[create_pr] branch creation error: %s", branch_exc) + + # Build fix script content + scenario = fix_description or f"incident-{incident_id[:8]}" + cmd_block = "\n".join(fix_commands) if fix_commands else "# No specific fix commands generated" + script = ( + f"#!/bin/bash\n" + f"# REKALL Auto-Fix — Incident {incident_id}\n" + f"# Tier: {fix_tier}\n" + f"# Generated by REKALL AI agent pipeline 
(human-approved)\n\n" + f"echo 'Applying fix: {scenario}'\n\n" + f"{cmd_block}\n" + ) + + # Commit fix script + try: + repo.create_file( + path=f".rekall/fix-{incident_id[:8]}.sh", + message=f"fix({incident_id[:8]}): REKALL auto-fix [{fix_tier}]", + content=script.encode(), + branch=branch_name, + ) + except Exception: + # File may exist already — update it + existing = repo.get_contents(f".rekall/fix-{incident_id[:8]}.sh", ref=branch_name) + repo.update_file( + path=f".rekall/fix-{incident_id[:8]}.sh", + message=f"fix({incident_id[:8]}): update REKALL auto-fix [{fix_tier}]", + content=script.encode(), + sha=existing.sha, + branch=branch_name, + ) + + # Build PR body — include sandbox evidence if available + sandbox_section = ( + f"\n\n{pr_evidence}" + if pr_evidence + else "\n\n*Auto-generated by REKALL. Approved by human reviewer. Please review before merging.*" + ) + approval_note = ( + "*Fix was automatically validated in a Minikube sandbox and auto-approved.*" + if pr_evidence + else "*Auto-generated by REKALL. Approved by human reviewer. 
Please review before merging.*" + ) + + pr_title_prefix = "[REKALL] Sandbox-Validated Fix" if pr_evidence else "[REKALL] Auto-fix" + + pr = repo.create_pull( + title=f"{pr_title_prefix}: {scenario[:70]}", + body=( + f"## 🤖 REKALL Auto-Fix {'(Sandbox Validated)' if pr_evidence else '(Human Approved)'}\n\n" + f"**Incident ID:** `{incident_id}`\n" + f"**Fix tier:** `{fix_tier}`\n" + f"**Description:** {scenario}\n\n" + f"### Fix commands\n```bash\n{cmd_block}\n```\n" + f"{sandbox_section}" + ), + head=branch_name, + base=base_branch, + ) + return pr.html_url + + pr_url = await loop.run_in_executor(None, gh_create) + log.info("[create_pr] PR opened: %s", pr_url) + + await _post_callback(incident_id, { + "type": "agent_log", + "data": { + "incident_id": incident_id, + "step_name": "execute", + "status": "done", + "detail": f"Pull request opened: {pr_url}", + }, + }) + + except Exception as exc: + log.exception("[create_pr] GitHub PR creation failed: %s", exc) + await _post_callback(incident_id, { + "type": "agent_log", + "data": { + "incident_id": incident_id, + "step_name": "execute", + "status": "error", + "detail": f"PR creation failed: {exc}", + }, + }) + + +async def _run_learning( + incident_id: str, + fix_proposal_id: str, + result: str, + reviewed_by: str, + notes: Optional[str], + fix_tier: Optional[str] = None, + vault_entry_id: Optional[str] = None, +) -> None: + """ + Delegate to LearningAgent with properly typed Outcome and FixProposal. 
+ """ + from rekall_engine.agents.learning import LearningAgent # type: ignore + from rekall_engine.types import Outcome, FixProposal # type: ignore + + outcome = Outcome( + incident_id=incident_id, + fix_proposal_id=fix_proposal_id, + result=result, # type: ignore[arg-type] + reviewed_by=reviewed_by, + notes=notes, + ) + fix = FixProposal( + incident_id=incident_id, + tier=fix_tier or "T3_llm", # type: ignore[arg-type] + vault_entry_id=vault_entry_id, + similarity_score=None, + fix_description="", + fix_commands=[], + fix_diff=None, + confidence=0.5, + ) + agent = LearningAgent() + await agent.run({"outcome": outcome, "fix_proposal": fix}) + + +async def _post_callback(incident_id: str, event: Dict[str, Any]) -> None: + """ + POST an event back to the Go backend's internal callback endpoint. + Failures are logged and swallowed — the pipeline continues regardless. + """ + url = f"{settings.go_backend_url}/internal/engine-callback" + client = _http_client + if client is None: + client = httpx.AsyncClient(timeout=5.0) + try: + await client.post(url, json=event) + except Exception as exc: + log.debug("callback failed (ok during dev): %s", exc) diff --git a/engine/pytest.ini b/engine/pytest.ini new file mode 100644 index 000000000..cdce43ffd --- /dev/null +++ b/engine/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +asyncio_mode = auto +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* diff --git a/engine/requirements.txt b/engine/requirements.txt new file mode 100644 index 000000000..f6e1b3a04 --- /dev/null +++ b/engine/requirements.txt @@ -0,0 +1,14 @@ +fastapi>=0.115.0 +uvicorn[standard]>=0.30.0 +httpx>=0.27.0 +pydantic>=2.7.0 +pydantic-settings>=2.3.0 +python-dotenv>=1.0.0 +groq>=0.9.0 +langgraph>=0.2.0 +pytest>=8.0.0 +pytest-asyncio>=0.23.0 +httpx[test]>=0.27.0 +PyGithub>=2.3.0 +kubernetes>=29.0.0 +valkey>=6.0.0 diff --git a/frontend/Dockerfile b/frontend/Dockerfile new file mode 100644 index 000000000..f831b5adf --- /dev/null +++ 
b/frontend/Dockerfile @@ -0,0 +1,19 @@ +FROM node:20-alpine AS deps +WORKDIR /app +COPY package.json ./ +RUN npm install + +FROM node:20-alpine AS builder +WORKDIR /app +COPY --from=deps /app/node_modules ./node_modules +COPY . . +RUN npm run build + +FROM node:20-alpine AS runner +WORKDIR /app +ENV NODE_ENV production +ENV NEXT_TELEMETRY_DISABLED 1 +COPY --from=builder /app/.next/standalone ./ +COPY --from=builder /app/.next/static ./.next/static +EXPOSE 3000 +CMD ["node", "server.js"] diff --git a/frontend/__tests__/components/agent-timeline.test.tsx b/frontend/__tests__/components/agent-timeline.test.tsx new file mode 100644 index 000000000..d66420a71 --- /dev/null +++ b/frontend/__tests__/components/agent-timeline.test.tsx @@ -0,0 +1,58 @@ +import React from "react"; +import { render, screen } from "@testing-library/react"; +import { AgentTimeline } from "@/components/agent-timeline"; +import type { AgentLog } from "@/lib/types"; + +const makeLog = (step: string, status: "running" | "done" | "error", detail = "detail"): AgentLog => ({ + id: `${step}-${status}`, + incident_id: "inc-001", + step_name: step, + status, + detail, + created_at: new Date().toISOString(), +}); + +describe("AgentTimeline", () => { + it("renders all seven pipeline steps", () => { + render(); + expect(screen.getByText(/Monitor/)).toBeInTheDocument(); + expect(screen.getByText(/Diagnostic/)).toBeInTheDocument(); + expect(screen.getByText(/Fix/)).toBeInTheDocument(); + expect(screen.getByText(/Simulation/)).toBeInTheDocument(); + expect(screen.getByText(/Governance/)).toBeInTheDocument(); + expect(screen.getByText(/Publish/)).toBeInTheDocument(); + expect(screen.getByText(/Learning/)).toBeInTheDocument(); + }); + + it("shows done state for completed steps", () => { + const logs = [makeLog("monitor", "done", "Normalised payload")]; + render(); + expect(screen.getByText("Normalised payload")).toBeInTheDocument(); + }); + + it("shows pipeline complete when done is true", () => { + render(); + 
expect(screen.getByText(/Pipeline completed successfully/)).toBeInTheDocument(); + }); + + it("does not show pipeline complete when done is false", () => { + render(); + expect(screen.queryByText(/Pipeline completed successfully/)).not.toBeInTheDocument(); + }); + + it("deduplicates logs — uses latest status per step", () => { + const logs = [ + makeLog("monitor", "running", "Starting"), + makeLog("monitor", "done", "Finished"), + ]; + render(); + expect(screen.getByText("Finished")).toBeInTheDocument(); + expect(screen.queryByText("Starting")).not.toBeInTheDocument(); + }); + + it("renders error detail for errored step", () => { + const logs = [makeLog("diagnostic", "error", "GitHub API unreachable")]; + render(); + expect(screen.getByText("GitHub API unreachable")).toBeInTheDocument(); + }); +}); diff --git a/frontend/__tests__/components/fix-proposal-card.test.tsx b/frontend/__tests__/components/fix-proposal-card.test.tsx new file mode 100644 index 000000000..cee4472f6 --- /dev/null +++ b/frontend/__tests__/components/fix-proposal-card.test.tsx @@ -0,0 +1,74 @@ +import React from "react"; +import { render, screen, fireEvent } from "@testing-library/react"; +import { FixProposalCard } from "@/components/fix-proposal-card"; +import type { FixProposal } from "@/lib/types"; + +const BASE_FIX: FixProposal = { + id: "fp-001", + incident_id: "inc-001", + tier: "T1_human", + vault_entry_id: "vault-abc", + similarity_score: 0.91, + fix_description: "Restore correct Postgres host in database.yml", + fix_commands: ["git checkout -- config/database.yml", "systemctl restart app"], + fix_diff: null, + confidence: 0.92, + reasoning: "Vault match (T1_human): signature 'infra:postgres:econnrefused'", + rlm_trace: [], + created_at: new Date().toISOString(), +}; + +describe("FixProposalCard", () => { + it("renders T1 tier badge", () => { + render(); + expect(screen.getByText(/T1 Human Pattern/)).toBeInTheDocument(); + }); + + it("renders T2 tier badge", () => { + render(); + 
expect(screen.getByText(/T2 Validated Logic/)).toBeInTheDocument(); + }); + + it("renders T3 tier badge", () => { + render(); + expect(screen.getByText(/T3 Synthesized/)).toBeInTheDocument(); + }); + + it("renders fix description", () => { + render(); + expect(screen.getByText("Restore correct Postgres host in database.yml")).toBeInTheDocument(); + }); + + it("renders all fix commands", () => { + render(); + expect(screen.getByText("git checkout -- config/database.yml")).toBeInTheDocument(); + expect(screen.getByText("systemctl restart app")).toBeInTheDocument(); + }); + + it("shows confidence percentage", () => { + render(); + expect(screen.getByText("92%")).toBeInTheDocument(); + }); + + it("shows similarity score when present", () => { + render(); + expect(screen.getByText(/91\.0%/)).toBeInTheDocument(); + }); + + it("hides similarity score when null", () => { + render(); + expect(screen.queryByText(/Similarity/)).not.toBeInTheDocument(); + }); + + it("renders diff section when diff present", () => { + const { container } = render( + + ); + // The diff toggle is an un-labelled button (chevron only) — find it and click + const buttons = container.querySelectorAll("button"); + expect(buttons.length).toBeGreaterThan(0); + // First button in the diff section — click to expand + fireEvent.click(buttons[0]); + expect(screen.getByText(/--- a\/f/)).toBeInTheDocument(); + }); +}); diff --git a/frontend/__tests__/components/incident-card.test.tsx b/frontend/__tests__/components/incident-card.test.tsx new file mode 100644 index 000000000..9379adc44 --- /dev/null +++ b/frontend/__tests__/components/incident-card.test.tsx @@ -0,0 +1,74 @@ +import React from "react"; +import { render, screen } from "@testing-library/react"; +import { IncidentCard } from "@/components/incident-card"; +import type { Incident } from "@/lib/types"; + +// Next/link needs a router — mock it +jest.mock("next/link", () => { + const MockLink = ({ children, href }: { children: React.ReactNode; href: 
string }) => ( + {children} + ); + MockLink.displayName = "MockLink"; + return MockLink; +}); + +const BASE_INCIDENT: Incident = { + id: "550e8400-e29b-41d4-a716-446655440000", + source: "simulator", + failure_type: "infra", + raw_payload: { description: "Postgres connection refused" }, + status: "processing", + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), +}; + +describe("IncidentCard", () => { + it("renders the failure type badge", () => { + render(); + // failure_type="infra" renders label "Infra" in the component + expect(screen.getByText("Infra")).toBeInTheDocument(); + }); + + it("renders the source label", () => { + render(); + expect(screen.getByText("simulator")).toBeInTheDocument(); + }); + + it("renders the description from raw_payload", () => { + render(); + expect(screen.getByText("Postgres connection refused")).toBeInTheDocument(); + }); + + it("renders the status label", () => { + render(); + expect(screen.getByText("Processing")).toBeInTheDocument(); + }); + + it("links to the correct incident detail URL", () => { + render(); + const link = screen.getByRole("link"); + expect(link).toHaveAttribute("href", `/incidents/${BASE_INCIDENT.id}`); + }); + + it("renders resolved status correctly", () => { + render(); + expect(screen.getByText("Resolved")).toBeInTheDocument(); + }); + + it("renders failed status correctly", () => { + render(); + expect(screen.getByText("Failed")).toBeInTheDocument(); + }); + + it("renders awaiting_approval status", () => { + render(); + // awaiting_approval renders "Needs Review" in the IncidentCard component + expect(screen.getByText("Needs Review")).toBeInTheDocument(); + }); + + it("falls back to truncated ID when description missing", () => { + render(); + // ID-{first 8 chars} format + expect(screen.getByText(/ID-550e8400/i)).toBeInTheDocument(); + }); +}); diff --git a/frontend/__tests__/components/risk-gauge.test.tsx b/frontend/__tests__/components/risk-gauge.test.tsx new file mode 
100644 index 000000000..f8e48da07 --- /dev/null +++ b/frontend/__tests__/components/risk-gauge.test.tsx @@ -0,0 +1,65 @@ +import React from "react"; +import { render, screen } from "@testing-library/react"; +import { RiskGauge } from "@/components/risk-gauge"; +import type { GovernanceDecision } from "@/lib/types"; + +const makeDecision = ( + risk_score: number, + decision: GovernanceDecision["decision"], + factors: string[] = [], +): GovernanceDecision => ({ + id: "gov-001", + incident_id: "inc-001", + risk_score, + decision, + risk_factors: factors, + created_at: new Date().toISOString(), +}); + +describe("RiskGauge", () => { + it("shows Low Risk for score < 0.3", () => { + render(); + expect(screen.getByText("Low Risk")).toBeInTheDocument(); + }); + + it("shows Medium Risk for score 0.3–0.7", () => { + render(); + expect(screen.getByText("Medium Risk")).toBeInTheDocument(); + }); + + it("shows High Risk for score >= 0.7", () => { + render(); + expect(screen.getByText("High Risk")).toBeInTheDocument(); + }); + + it("shows percentage on gauge", () => { + render(); + expect(screen.getByText("72%")).toBeInTheDocument(); + }); + + it("renders decision label for auto_apply", () => { + render(); + expect(screen.getByText("Auto-applied")).toBeInTheDocument(); + }); + + it("renders decision label for create_pr", () => { + render(); + expect(screen.getByText("Pull request opened")).toBeInTheDocument(); + }); + + it("renders decision label for block_await_human", () => { + render(); + expect(screen.getByText("Awaiting human review")).toBeInTheDocument(); + }); + + it("renders risk factors", () => { + render(); + expect(screen.getByText("touches_secrets")).toBeInTheDocument(); + expect(screen.getByText("llm_generated")).toBeInTheDocument(); + }); + + it("does not render factors section when empty", () => { + render(); + expect(screen.queryByText("Factors")).not.toBeInTheDocument(); + }); +}); diff --git a/frontend/__tests__/lib/api-client.test.ts 
b/frontend/__tests__/lib/api-client.test.ts new file mode 100644 index 000000000..ef276bc0a --- /dev/null +++ b/frontend/__tests__/lib/api-client.test.ts @@ -0,0 +1,76 @@ +/** + * Tests for the API client — uses MSW (mock service worker) patterns + * but kept lightweight here with jest fetch mocks. + */ + +// Polyfill fetch for Node test environment +global.fetch = jest.fn(); + +const mockFetch = global.fetch as jest.Mock; + +// Reset mock between tests +beforeEach(() => { + mockFetch.mockClear(); +}); + +describe("api client URL construction", () => { + it("simulate posts to correct path", async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ incident_id: "new-inc-123" }), + }); + + const { api } = await import("@/lib/api-client"); + const result = await api.simulate("postgres_refused"); + + expect(mockFetch).toHaveBeenCalledTimes(1); + const [url] = mockFetch.mock.calls[0]; + expect(url).toContain("/webhook/simulate"); + expect(result).toEqual({ incident_id: "new-inc-123" }); + }); + + it("getIncident calls correct path", async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ incident: { id: "abc-123" } }), + }); + + const { api } = await import("@/lib/api-client"); + await api.getIncident("abc-123"); + + const [url] = mockFetch.mock.calls[0]; + expect(url).toContain("/incidents/abc-123"); + }); + + it("streamUrl returns correct SSE URL", async () => { + const { api } = await import("@/lib/api-client"); + const url = api.streamUrl("inc-xyz"); + expect(url).toContain("/stream/inc-xyz"); + }); + + it("throws on non-ok response", async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 404, + json: async () => ({}), + }); + + const { api } = await import("@/lib/api-client"); + await expect(api.getIncident("bad-id")).rejects.toThrow("404"); + }); + + it("approveIncident posts correct body", async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ ok: true }), + 
}); + + const { api } = await import("@/lib/api-client"); + await api.approveIncident("inc-001", "alice", "looks good"); + + const [, init] = mockFetch.mock.calls[0]; + const body = JSON.parse(init.body as string); + expect(body.reviewed_by).toBe("alice"); + expect(body.notes).toBe("looks good"); + }); +}); diff --git a/frontend/app/(app)/dashboard/page.tsx b/frontend/app/(app)/dashboard/page.tsx new file mode 100644 index 000000000..d0e5fe6a6 --- /dev/null +++ b/frontend/app/(app)/dashboard/page.tsx @@ -0,0 +1,295 @@ +"use client"; + +import { useState } from "react"; +import { useRouter } from "next/navigation"; +import { + Loader2, Play, AlertCircle, RefreshCw, ChevronDown, ChevronRight, + Activity, CheckCircle2, TrendingUp, + Server, Cpu, TestTube, Shield, Container, Radio, + Zap, GitBranch, Github, +} from "lucide-react"; +import { useIncidents } from "@/lib/hooks/use-incidents"; +import { IncidentCard } from "@/components/incident-card"; +import { SkeletonCard } from "@/components/ui/skeleton"; +import { StatCard } from "@/components/ui/stat-card"; +import { api } from "@/lib/api-client"; +import { cn } from "@/lib/utils"; + +// Local scenarios — kept for quick offline testing +const SCENARIOS = [ + { id: "postgres_refused", label: "Postgres Refused", type: "infra", icon: Server, color: "text-amber-500" }, + { id: "oom_kill", label: "OOM Kill", type: "oom", icon: Cpu, color: "text-red-500" }, + { id: "test_failure", label: "Test Failure", type: "test", icon: TestTube, color: "text-blue-500" }, + { id: "secret_leak", label: "Secret Leak", type: "security", icon: Shield, color: "text-rose-500" }, + { id: "image_pull_backoff", label: "Image Pull Backoff", type: "deploy", icon: Container, color: "text-sky-500" }, +]; + +export default function DashboardPage() { + const { incidents, loading, error, refetch } = useIncidents(4000); + const [running, setRunning] = useState(null); // "live" | scenario id + const [githubRepo, setGithubRepo] = 
useState("abjt01/sample-ci-sad"); + const [runError, setRunError] = useState(null); + const [showLocal, setShowLocal] = useState(false); + const router = useRouter(); + + async function fetchLive() { + setRunning("live"); + setRunError(null); + try { + const result = await api.fetchLive(githubRepo || undefined); + refetch(); + router.push(`/incidents/${result.incident_id}`); + } catch (e: unknown) { + setRunError(e instanceof Error ? e.message : "Request failed"); + } finally { + setRunning(null); + } + } + + async function simulate(scenario: string) { + setRunning(scenario); + setRunError(null); + try { + const result = await api.simulate(scenario); + refetch(); + router.push(`/incidents/${result.incident_id}`); + } catch (e: unknown) { + setRunError(e instanceof Error ? e.message : "Request failed"); + } finally { + setRunning(null); + } + } + + const activeCount = incidents.filter(i => i.status === "processing" || i.status === "awaiting_approval").length; + const resolvedCount = incidents.filter(i => i.status === "resolved").length; + const failedCount = incidents.filter(i => i.status === "failed").length; + + return ( +
+ {/* ── Header ─────────────────────────────────────────── */} +
+
+
+
+ +
+
+

Dashboard

+

Autonomous CI/CD Repair

+
+
+ +
+
+ +
+ + {/* ── Stats ──────────────────────────────────────────── */} +
+ } + accent="hsl(215 25% 15%)" + /> + 0 ? "Processing..." : "All clear"} + trend={activeCount > 0 ? "up" : "neutral"} + icon={} + accent="hsl(28 100% 50%)" + /> + 0 ? `${Math.round(resolvedCount / incidents.length * 100)}% success` : undefined} + trend="up" + icon={} + accent="hsl(142 76% 36%)" + /> + } + accent="hsl(0 84% 60%)" + /> +
+ + {/* ── CI Monitor ─────────────────────────────────────── */} +
+
+ +
+

CI Monitor

+

Analyze the latest GitHub Actions failure and open an AI-generated fix PR

+
+
+ +
+ + {/* ── Repo Input + Fetch Button ── */} +
+
+ + setGithubRepo(e.target.value)} + placeholder="owner/repo — e.g. abjt01/sample-ci-sad" + className="flex-1 text-sm font-mono bg-transparent text-slate-800 placeholder:text-slate-400 focus:outline-none" + /> +
+ +
+ + {/* Running/error feedback */} + {runError && ( +
+ + {runError} +
+ )} + {running === "live" && ( +
+ + Fetching latest failed run from {githubRepo} — Groq AI is analysing the logs… +
+ )} + + {/* ── Local Scenarios (collapsible) ── */} +
+ + + {showLocal && ( +
+

Injects a pre-built payload for offline testing when GitHub is not configured.

+
+ {SCENARIOS.map((s) => { + const busy = running === s.id; + const Icon = s.icon; + return ( + + ); + })} +
+ {running && running !== "live" && ( +
+ + Running pipeline for local scenario… +
+ )} +
+ )} +
+
+
+ + {/* ── Incidents ──────────────────────────────────────── */} +
+
+

+ Incidents + {incidents.length > 0 && ( + + {incidents.length} total + + )} +

+ {loading && incidents.length > 0 && ( + + )} +
+ + {error && ( +
+ + {error} — is the backend running? +
+ )} + + {loading && !incidents.length ? ( +
+ {Array.from({ length: 4 }).map((_, i) => )} +
+ ) : incidents.length === 0 ? ( +
+
+ +
+

No incidents yet

+

+ Enter a GitHub repo above and click Fetch & Fix Latest Failure +

+
+ ) : ( +
+ {incidents.map((inc, i) => ( +
+ +
+ ))} +
+ )} +
+
+
+ ); +} diff --git a/frontend/app/(app)/incidents/[id]/page.tsx b/frontend/app/(app)/incidents/[id]/page.tsx new file mode 100644 index 000000000..8fea0a726 --- /dev/null +++ b/frontend/app/(app)/incidents/[id]/page.tsx @@ -0,0 +1,236 @@ +"use client"; + +import { use, useEffect, useState, useCallback } from "react"; +import { ArrowLeft, Loader2, AlertCircle, Clock, CheckCircle2, XCircle } from "lucide-react"; +import Link from "next/link"; +import { api } from "@/lib/api-client"; +import { useAgentStream } from "@/lib/hooks/use-agent-stream"; +import { AgentTimeline } from "@/components/agent-timeline"; +import { FixProposalCard } from "@/components/fix-proposal-card"; +import { RiskGauge } from "@/components/risk-gauge"; +import { ApprovalPanel } from "@/components/approval-panel"; +import { SandboxResultCard } from "@/components/sandbox-result-card"; +import { Badge } from "@/components/ui/badge"; +import { SkeletonTimeline } from "@/components/ui/skeleton"; +import { cn, timeAgo } from "@/lib/utils"; +import type { + Incident, DiagnosticBundle, FixProposal, + GovernanceDecision, AgentLog, SandboxResult, +} from "@/lib/types"; + +interface Detail { + incident: Incident; + diagnostic_bundle: DiagnosticBundle | null; + fix_proposal: FixProposal | null; + governance_decision: GovernanceDecision | null; + sandbox_result: SandboxResult | null; + agent_logs: AgentLog[]; +} + +const STATUS_CONFIG = { + processing: { icon: Loader2, variant: "warning" as const, label: "Processing", spin: true }, + awaiting_approval: { icon: Clock, variant: "primary" as const, label: "Needs Review", spin: false }, + resolved: { icon: CheckCircle2, variant: "success" as const, label: "Resolved", spin: false }, + failed: { icon: XCircle, variant: "danger" as const, label: "Failed / Rejected", spin: false }, +}; + +export default function IncidentDetailPage({ + params, +}: { + params: Promise<{ id: string }>; +}) { + const { id } = use(params); + const [data, setData] = useState(null); + 
const [loading, setLoading] = useState(true); + const { logs, done, sandboxResult: liveSandbox } = useAgentStream(id); + + const fetchData = useCallback(async () => { + try { + const result = await api.getIncident(id); + setData(result as unknown as Detail); + } finally { + setLoading(false); + } + }, [id]); + + useEffect(() => { fetchData(); }, [fetchData]); + useEffect(() => { if (done) fetchData(); }, [done, fetchData]); + + if (loading) { + return ( +
+
+
+ {Array.from({ length: 3 }).map((_, i) => ( +
+ +
+ ))} +
+
+ ); + } + + if (!data) { + return ( +
+
+ + Incident not found. +
+
+ ); + } + + const { incident, diagnostic_bundle: bundle, fix_proposal: fix, governance_decision: gov, sandbox_result: storedSandbox } = data; + // Prefer live SSE sandbox_result over stored value (shows in real time) + const sandbox = liveSandbox ?? storedSandbox; + const allLogs = logs.length > 0 ? logs : data.agent_logs; + const needsApproval = incident.status === "awaiting_approval" + || gov?.decision === "block_await_human"; + const s = STATUS_CONFIG[incident.status] ?? STATUS_CONFIG.processing; + const StatusIcon = s.icon; + + return ( +
+ {/* Breadcrumb + header */} +
+
+ + + +
+
+

+ Incident ID-{id.slice(0, 8).toUpperCase()} +

+ + + {s.label} + +
+

+ {incident.source} · {incident.failure_type} · {timeAgo(incident.created_at)} +

+
+
+
+ + {/* 3-column grid */} +
+ {/* Col 1: Agent timeline */} +
+
+

+ Agent Pipeline +

+
+
+ +
+
+ + {/* Col 2: Context + fix */} +
+ {/* Log excerpt */} + {bundle?.log_excerpt && ( +
+
+

+ Log Excerpt +

+
+ + + +
+
+
+                {bundle.log_excerpt}
+              
+
+ )} + + {/* Git diff */} + {bundle?.git_diff && ( +
+
+

+ Git Diff +

+
+
+                {bundle.git_diff}
+              
+
+ )} + + {/* Context summary */} + {bundle?.context_summary && ( +
+

+ Context Summary +

+

+ {bundle.context_summary} +

+
+ )} + + {/* Fix proposal */} + {fix ? ( + + ) : ( +
+ +

+ {incident.status === "processing" + ? "Searching memory vault…" + : "No fix proposal generated"} +

+
+ )} + + {/* Sandbox validation result */} + {sandbox && } +
+ + {/* Col 3: Governance + approval */} +
+ {gov && } + + {needsApproval && ( + + )} + + {incident.status === "resolved" && !needsApproval && ( +
+
+ +
+

Resolved

+

+ Outcome recorded & stakeholders notified. +

+
+ )} + + {incident.status === "failed" && ( +
+
+ +
+

Failed / Rejected

+

+ Incident logged & reported. +

+
+ )} +
+
+
+ ); +} diff --git a/frontend/app/(app)/layout.tsx b/frontend/app/(app)/layout.tsx new file mode 100644 index 000000000..1576d1d86 --- /dev/null +++ b/frontend/app/(app)/layout.tsx @@ -0,0 +1,12 @@ +import { Sidebar } from "@/components/sidebar"; + +export default function AppLayout({ children }: { children: React.ReactNode }) { + return ( +
+ +
+ {children} +
+
+ ); +} diff --git a/frontend/app/(app)/vault/page.tsx b/frontend/app/(app)/vault/page.tsx new file mode 100644 index 000000000..70c5e2cff --- /dev/null +++ b/frontend/app/(app)/vault/page.tsx @@ -0,0 +1,97 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { Loader2, Database, Shield, Cpu, TrendingUp } from "lucide-react"; +import { api } from "@/lib/api-client"; +import { VaultExplorer } from "@/components/vault-explorer"; +import { StatCard } from "@/components/ui/stat-card"; +import { SkeletonStats } from "@/components/ui/skeleton"; +import type { VaultEntry } from "@/lib/types"; + +interface VaultStats { + total: number; + human_count: number; + synthetic_count: number; + avg_confidence: number | null; +} + +export default function VaultPage() { + const [entries, setEntries] = useState([]); + const [stats, setStats] = useState(null); + const [loading, setLoading] = useState(true); + + useEffect(() => { + Promise.all([api.listVault(), api.vaultStats()]) + .then(([vault, st]) => { + setEntries((vault as { entries: VaultEntry[] }).entries); + setStats(st as unknown as VaultStats); + }) + .finally(() => setLoading(false)); + }, []); + + return ( +
+ {/* ── Header ─────────────────────────────────────────── */} +
+
+
+
+ +
+
+

Memory Vault

+

+ Human-approved Fixes · Institutional Memory +

+
+
+
+
+ +
+ {/* Stats */} + {loading ? ( + + ) : stats ? ( +
+ } + /> + } + /> + } + /> + } + /> +
+ ) : null} + + {/* Explorer */} + {loading ? ( +
+ +

Loading Memory Vault...

+
+ ) : ( + + )} +
+
+ ); +} diff --git a/frontend/app/globals.css b/frontend/app/globals.css new file mode 100644 index 000000000..a8f60f470 --- /dev/null +++ b/frontend/app/globals.css @@ -0,0 +1,237 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; + +/* ───────────────────────────────────────────────────────────── + REKALL Design System + Dark = default command-room aesthetic + Light = clean slate mode +───────────────────────────────────────────────────────────── */ + +@layer base { + :root { + /* ── REKALL Bright Theme ─────────────────────────── */ + --background: 0 0% 100%; + --background-subtle: 210 20% 98%; + --foreground: 222 47% 11%; + + --card: 0 0% 100%; + --card-hover: 0 0% 99%; + --card-foreground: 222 47% 11%; + + --border: 214 32% 91%; + --border-strong: 214 32% 80%; + --input: 214 32% 95%; + --ring: 25 100% 50%; /* REKALL Orange */ + + --primary: 25 100% 50%; + --primary-hover: 25 100% 42%; + --primary-foreground: 0 0% 100%; + + --secondary: 210 20% 96%; + --secondary-foreground: 222 47% 20%; + + --muted: 210 40% 96%; + --muted-foreground: 215 16% 47%; + + --accent: 25 100% 50%; + --accent-subtle: 25 100% 97%; + + --popover: 0 0% 100%; + --popover-foreground: 222 47% 11%; + + --destructive: 0 84% 60%; + --success: 142 76% 36%; + --warning: 38 92% 50%; + --info: 217 91% 60%; + + /* Status */ + --status-processing: 38 92% 50%; + --status-awaiting: 262 83% 58%; + --status-resolved: 142 76% 36%; + --status-failed: 0 84% 60%; + + /* Sidebar */ + --sidebar-width: 240px; + --sidebar-bg: 210 20% 98%; + --sidebar-border: 214 32% 91%; + + --radius: 0.75rem; + } +} + +@layer base { + * { + @apply border-border; + box-sizing: border-box; + } + + html { + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; + font-feature-settings: "cv11", "ss01"; + } + + body { + @apply bg-background text-foreground; + font-family: "Inter", "SF Pro Display", system-ui, -apple-system, sans-serif; + font-size: 0.875rem; + line-height: 1.6; + } + + code, pre, 
kbd { + font-family: "JetBrains Mono", "Fira Code", "Cascadia Code", "SF Mono", monospace; + } +} + +/* ── Scrollbar ─────────────────────────────────────────────── */ +::-webkit-scrollbar { width: 4px; height: 4px; } +::-webkit-scrollbar-track { background: transparent; } +::-webkit-scrollbar-thumb { + background: hsl(var(--border-strong)); + border-radius: 99px; +} +::-webkit-scrollbar-thumb:hover { + background: hsl(var(--muted-foreground) / 0.5); +} + +/* ── Animations ────────────────────────────────────────────── */ +@keyframes step-pulse { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.4; } +} +@keyframes shimmer { + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } +} +@keyframes fade-up { + from { opacity: 0; transform: translateY(8px); } + to { opacity: 1; transform: translateY(0); } +} +@keyframes fade-in { + from { opacity: 0; } + to { opacity: 1; } +} +@keyframes slide-in-left { + from { opacity: 0; transform: translateX(-12px); } + to { opacity: 1; transform: translateX(0); } +} +@keyframes scale-in { + from { opacity: 0; transform: scale(0.96); } + to { opacity: 1; transform: scale(1); } +} +@keyframes progress-fill { + from { width: 0%; } + to { width: var(--target-width, 100%); } +} + +.step-running { animation: step-pulse 1.4s ease-in-out infinite; } +.fade-up { animation: fade-up 0.35s ease-out both; } +.fade-in { animation: fade-in 0.25s ease-out both; } +.slide-in-left { animation: slide-in-left 0.3s ease-out both; } +.scale-in { animation: scale-in 0.25s ease-out both; } + +/* ── Skeleton shimmer ──────────────────────────────────────── */ +.skeleton { + background: linear-gradient( + 90deg, + hsl(var(--muted)) 25%, + hsl(var(--muted-foreground) / 0.08) 50%, + hsl(var(--muted)) 75% + ); + background-size: 200% 100%; + animation: shimmer 1.8s infinite; + border-radius: var(--radius); +} + +/* ── Sidebar nav active indicator ─────────────────────────── */ +.nav-active { + position: relative; +} +.nav-active::before { + 
content: ""; + position: absolute; + left: 0; + top: 50%; + transform: translateY(-50%); + height: 60%; + width: 3px; + background: hsl(var(--primary)); + border-radius: 0 99px 99px 0; +} + +/* ── Gradient text ─────────────────────────────────────────── */ +.gradient-text { + background: linear-gradient(135deg, hsl(25 100% 50%), hsl(45 100% 50%)); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} +.gradient-text-warm { + background: linear-gradient(135deg, hsl(20 100% 50%), hsl(0 100% 60%)); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +/* ── Card hover lift ───────────────────────────────────────── */ +.card-lift { + transition: transform 200ms cubic-bezier(0.4, 0, 0.2, 1), border-color 200ms ease, box-shadow 200ms ease; +} +.card-lift:hover { + transform: translateY(-4px); + border-color: hsl(var(--primary) / 0.5); + box-shadow: 0 12px 24px -10px hsl(var(--primary) / 0.15), 0 4px 12px -4px hsl(222 47% 11% / 0.05); +} + +/* ── Glass panel ───────────────────────────────────────────── */ +.glass { + backdrop-filter: blur(12px) saturate(180%); + background: hsl(var(--card) / 0.7); + border: 1px solid hsl(var(--border)); +} + +/* ── Status dot ────────────────────────────────────────────── */ +.status-dot { + width: 6px; + height: 6px; + border-radius: 50%; + display: inline-block; + flex-shrink: 0; +} +.status-dot.live { + background: hsl(var(--success)); + box-shadow: 0 0 0 3px hsl(var(--success) / 0.2); + animation: step-pulse 2s ease-in-out infinite; +} + +/* ── Monospace terminal block ──────────────────────────────── */ +.terminal { + background: hsl(224 30% 5%); + border: 1px solid hsl(var(--border)); + border-radius: var(--radius); + font-family: "JetBrains Mono", "Fira Code", monospace; + font-size: 12px; + line-height: 1.7; + padding: 1rem; + overflow-x: auto; +} + +/* ── Diff colors ───────────────────────────────────────────── */ +.diff-add { 
color: hsl(142 69% 50%); } +.diff-del { color: hsl(0 72% 55%); } +.diff-meta { color: hsl(var(--muted-foreground)); } + +/* ── Focus ring override ───────────────────────────────────── */ +.focus-ring { + @apply outline-none ring-2 ring-primary ring-offset-2 ring-offset-background; +} +*:focus-visible { + outline: 2px solid hsl(var(--ring)); + outline-offset: 2px; +} + +/* ── Chip / inline tag ─────────────────────────────────────── */ +.chip { + @apply inline-flex items-center gap-1 px-2 py-0.5 rounded-md text-xs font-medium; +} diff --git a/frontend/app/layout.tsx b/frontend/app/layout.tsx new file mode 100644 index 000000000..faf68bb53 --- /dev/null +++ b/frontend/app/layout.tsx @@ -0,0 +1,18 @@ +import type { Metadata } from "next"; +import "./globals.css"; +import { ThemeProvider } from "@/components/theme-provider"; + +export const metadata: Metadata = { + title: { default: "REKALL", template: "%s — REKALL" }, + description: "Memory-driven agentic CI/CD repair — detects failures, retrieves fixes from a learning vault, and applies repairs with human-in-the-loop governance.", +}; + +export default function RootLayout({ children }: { children: React.ReactNode }) { + return ( + + + {children} + + + ); +} diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx new file mode 100644 index 000000000..506f7bfbc --- /dev/null +++ b/frontend/app/page.tsx @@ -0,0 +1,373 @@ +"use client"; + +import Link from "next/link"; +import { useEffect, useState } from "react"; +import { + ArrowRight, Zap, Brain, GitBranch, Shield, + ChevronRight, Terminal, Check, Activity, + Database, BarChart3, Cpu, Bell, MessageSquare, +} from "lucide-react"; +import { cn } from "@/lib/utils"; + +// ── Animated terminal lines ──────────────────────────────────────────────── + +const TERMINAL_LINES = [ + { delay: 0, color: "text-muted-foreground", text: "$ rekall watch --env production" }, + { delay: 600, color: "text-yellow-500", text: "[monitor] failure detected — github_actions / api_test" }, 
+ { delay: 1200, color: "text-orange-500", text: "[diagnostic] fetching 4,312 lines of build logs..." }, + { delay: 1800, color: "text-orange-500", text: "[diagnostic] recursive scan: jest_assertion_error at L492" }, + { delay: 2400, color: "text-orange-600", text: "[fix] vault retrieval: T1 miss. T2 hit!" }, + { delay: 3000, color: "text-orange-600", text: "[fix] RLM reasoning verified. confidence: 0.94" }, + { delay: 3600, color: "text-orange-700", text: "[governance] risk_score: 0.12 → decision: auto_apply" }, + { delay: 4200, color: "text-emerald-500", text: "[execute] fix applied. monitoring health..." }, + { delay: 4800, color: "text-blue-500", text: "[reporting] Slack & Notion notified. outcome: success" }, + { delay: 5200, color: "text-muted-foreground", text: "─────────────────────────────────────────────────────" }, + { delay: 5600, color: "text-emerald-600", text: "✓ incident resolved in 14.8s" }, +]; + +function AnimatedTerminal() { + const [visibleCount, setVisibleCount] = useState(0); + const [cursor, setCursor] = useState(true); + + useEffect(() => { + const timers = TERMINAL_LINES.map((line, i) => + setTimeout(() => setVisibleCount(i + 1), line.delay) + ); + const cursorTimer = setInterval(() => setCursor((c) => !c), 500); + return () => { + timers.forEach(clearTimeout); + clearInterval(cursorTimer); + }; + }, []); + + return ( +
+ {/* window chrome */} +
+ + + + rekall — real_time +
+
+ {TERMINAL_LINES.slice(0, visibleCount).map((line, i) => ( +

+ {line.text} +

+ ))} + {visibleCount < TERMINAL_LINES.length && ( + + )} +
+
+ ); +} + +// ── How it works steps ───────────────────────────────────────────────────── + +const STEPS = [ + { + number: "01", + icon: Activity, + title: "Failure detection", + description: + "REKALL listens to GitHub, GitLab, and custom CI webhooks. The moment a workflow fails, it intercepts the logs and triggers a diagnostic event.", + color: "text-orange-500", + bg: "bg-orange-500/10", + border: "border-orange-500/20", + }, + { + number: "02", + icon: Cpu, + title: "RLM Recursive Diagnosis", + description: + "Unlike basic LLM tools, Recursive Language Models (RLM) scan massive logs in multiple passes to find the real root cause, regardless of log length.", + color: "text-orange-600", + bg: "bg-orange-600/10", + border: "border-orange-600/20", + }, + { + number: "03", + icon: Database, + title: "Institutional Memory Vault", + description: + "Battle-tested fixes are retrieved from a tiered JSON vault. REKALL prioritizes human-approved patterns (T1) before attempting synthesis.", + color: "text-orange-700", + bg: "bg-orange-700/10", + border: "border-orange-700/20", + }, + { + number: "04", + icon: Shield, + title: "Governance Safety-Gate", + description: + "Every fix is scored across nine risk dimensions. 
High-risk actions are blocked for human review, while safe fixes are auto-applied.", + color: "text-orange-800", + bg: "bg-orange-800/10", + border: "border-orange-800/20", + }, + { + number: "05", + icon: MessageSquare, + title: "Integrations & Reporting", + description: + "Upon resolution, REKALL logs all reasoning to Notion and notifies stakeholders via Slack, ensuring full auditability of every automated repair.", + color: "text-blue-600", + bg: "bg-blue-600/10", + border: "border-blue-600/20", + }, +]; + +// ── Feature grid ─────────────────────────────────────────────────────────── + +const FEATURES = [ + { + icon: Brain, + title: "Memory-Driven", + description: "REKALL grows an institutional memory of your pipeline failures, ensuring the same bug never requires manual effort twice.", + accent: "text-orange-500", + }, + { + icon: Cpu, + title: "RLM Architecture", + description: "Recursive analysis handles 10M+ token contexts by shifting from traditional attention windows to a programmable environment.", + accent: "text-orange-600", + }, + { + icon: Shield, + title: "Gated Governance", + description: "Multi-layered risk scoring ensures automation only happens when confidence is absolute. Built for safety-critical infra.", + accent: "text-orange-700", + }, + { + icon: Bell, + title: "Slack Notifications", + description: "Rich Block Kit notifications keep your team informed of every diagnosis, risk score, and automated intervention.", + accent: "text-orange-500", + }, + { + icon: Database, + title: "Notion Auditing", + description: "Automatic post-mortem generation and logging. Every incident is recorded in your project workspace for team visibility.", + accent: "text-blue-500", + }, + { + icon: GitBranch, + title: "Automated PRs", + description: "If a fix needs review, REKALL opens a complete Pull Request with code changes, diagnostic traces, and test results.", + accent: "text-orange-600", + }, +]; + +export default function LandingPage() { + return ( +
+ + {/* ── Nav ─────────────────────────────────────────────────────── */} +
+
+
+
+ +
+ REKALL +
+ +
+ + Open Dashboard + + +
+
+
+ + {/* ── Hero ────────────────────────────────────────────────────── */} +
+ {/* Decorative elements */} +
+ +
+
+ + {/* Left — Copy */} +
+
+ + State-of-the-Art CI/CD Diagnosis +
+ +

+ Your pipeline broke. +
+ REKALL remembers the fix. +

+ +

+ The world's first memory-driven agentic repair system. + Using Recursive Language Models (RLM) to diagnose complex + CI/CD failures and apply battle-tested fixes in seconds. +

+ +
+ + Start Now + + + + The RLM Architecture + +
+ + {/* Badges */} +
+
+ + LLM-Agnostic +
+
+ + Memory-Driven +
+
+ + Governance-Gated +
+
+
+ + {/* Right — Animated Terminal */} +
+ +
+
+
+
+ + {/* ── Stats ───────────────────────────────────────────────────── */} +
+
+
+ {[ + { val: "14s", lab: "avg. recovery time" }, + { val: "10M+", lab: "RLM token context" }, + { val: "24/7", lab: "continuous watch" }, + { val: "0.0s", lab: "human effort required" }, + ].map((s) => ( +
+

{s.val}

+

{s.lab}

+
+ ))} +
+
+
+ + {/* ── How it works ────────────────────────────────────────────── */} +
+
+

+ Five Intelligent Agents. +

+

+ Building institutional memory requires more than a simple prompt. + REKALL orchestrates a specialised agent team through a state-machine architecture. +

+
+ +
+ {STEPS.map((step, i) => ( +
+
+ +
+ +
+
+ + {step.number} + +

{step.title}

+
+

+ {step.description} +

+
+
+ ))} +
+
+ + {/* ── Capabilities ────────────────────────────────────────────── */} +
+ {/* Glow */} +
+ +
+
+

+ Architecture Built For Scale. +

+

+ Every layer of the REKALL stack is designed to ensure fixes are applied safely and recorded permanently. +

+
+ +
+ {FEATURES.map((f) => ( +
+ +

{f.title}

+

+ {f.description} +

+
+ ))} +
+
+
+ + {/* ── Footer ──────────────────────────────────────────────────── */} +
+
+
+ + REKALL +
+

+ Memory-Driven Agentic CI/CD · 2026 +

+
+ Dashboard + Vault +
+
+
+
+ ); +} diff --git a/frontend/components/agent-timeline.tsx b/frontend/components/agent-timeline.tsx new file mode 100644 index 000000000..9576f12b8 --- /dev/null +++ b/frontend/components/agent-timeline.tsx @@ -0,0 +1,109 @@ +"use client"; + +import { + CheckCircle2, Loader2, XCircle, + Radio, Microscope, Wrench, Scale, Rocket, Brain, Shield, Container, Play, +} from "lucide-react"; +import { cn } from "@/lib/utils"; +import type { AgentLog } from "@/lib/types"; + +const STEPS = [ + { key: "monitor", label: "Monitor", sub: "Detect & normalise event", icon: Radio }, + { key: "diagnostic", label: "Diagnostic", sub: "Fetch logs, diff, tests", icon: Microscope }, + { key: "fix", label: "Fix", sub: "Search vault T1 → T2 → T3", icon: Wrench }, + { key: "simulation", label: "Simulation", sub: "Counterfactual dry-run", icon: Rocket }, + { key: "governance", label: "Governance", sub: "Score risk, decide action", icon: Scale }, + { key: "publish_guard",label: "Publish", sub: "Supply chain gate", icon: Shield }, + { key: "sandbox", label: "Sandbox", sub: "Minikube validation", icon: Container }, + { key: "execute", label: "Execute", sub: "Apply fix / open PR", icon: Play }, + { key: "learning", label: "Learning", sub: "Update vault confidence", icon: Brain }, +]; + +interface Props { + logs: AgentLog[]; + done: boolean; +} + +export function AgentTimeline({ logs, done }: Props) { + const statusMap = new Map(); + for (const log of logs) { + statusMap.set(log.step_name, { status: log.status, detail: log.detail }); + } + + return ( +
+ {STEPS.map(({ key, label, sub, icon: StepIcon }, idx) => { + const entry = statusMap.get(key); + const status = entry?.status ?? "pending"; + const detail = entry?.detail; + const isLast = idx === STEPS.length - 1; + + const isDone = status === "done"; + const isRunning = status === "running"; + const isError = status === "error"; + + return ( +
+ {/* Track column */} +
+ {/* Step node */} +
+ {isDone ? : + isRunning ? : + isError ? : + } +
+ {/* Connector */} + {!isLast && ( +
+ )} +
+ + {/* Content */} +
+
+ + {label} + + {isRunning && ( + RUNNING + )} + {isDone && ( + DONE + )} +
+ {(status === "pending" || !detail) && ( +

{sub}

+ )} + {detail && status !== "pending" && ( +

{detail}

+ )} +
+
+ ); + })} + + {done && ( +
+ + Pipeline completed successfully +
+ )} +
+ ); +} diff --git a/frontend/components/approval-panel.tsx b/frontend/components/approval-panel.tsx new file mode 100644 index 000000000..cb4089314 --- /dev/null +++ b/frontend/components/approval-panel.tsx @@ -0,0 +1,106 @@ +"use client"; + +import { useState } from "react"; +import { CheckCircle2, XCircle, Loader2, AlertOctagon } from "lucide-react"; +import { api } from "@/lib/api-client"; + +interface Props { + incidentId: string; + onResolved: () => void; +} + +export function ApprovalPanel({ incidentId, onResolved }: Props) { + const [notes, setNotes] = useState(""); + const [loading, setLoading] = useState<"approve" | "reject" | null>(null); + const [done, setDone] = useState<"approved" | "rejected" | null>(null); + + async function handle(action: "approve" | "reject") { + setLoading(action); + try { + if (action === "approve") { + await api.approveIncident(incidentId, "human", notes || undefined); + } else { + await api.rejectIncident(incidentId, "human", notes || undefined); + } + setDone(action === "approve" ? "approved" : "rejected"); + onResolved(); + } finally { + setLoading(null); + } + } + + if (done) { + const isApproved = done === "approved"; + return ( +
+ {isApproved + ? + : + } +

+ Fix {done} +

+

+ LearningAgent is updating vault confidence… +

+
+ ); + } + + return ( +
+
+ +

Human Approval Required

+
+
+

+ This fix requires your review. Approving will apply the fix and update vault confidence. + Rejecting will decay the confidence of this fix pathway. +

+ +