diff --git a/.agents/skills/deep-research/README.md b/.agents/skills/deep-research/README.md
new file mode 100644
index 000000000..353fdaa6f
--- /dev/null
+++ b/.agents/skills/deep-research/README.md
@@ -0,0 +1,119 @@
+# Deep Research Skill for Claude Code
+
+Enterprise-grade research engine for Claude Code. Produces citation-backed reports with source credibility scoring, multi-provider search, and automated validation.
+
+## Installation
+
+```bash
+# Clone into Claude Code skills directory
+git clone https://github.com/199-biotechnologies/claude-deep-research-skill.git ~/.claude/skills/deep-research
+```
+
+No additional dependencies required for basic usage.
+
+### Optional: search-cli (multi-provider search)
+
+For aggregated search across Brave, Serper, Exa, Jina, and Firecrawl:
+
+```bash
+brew tap 199-biotechnologies/tap && brew install search-cli
+search config set keys.brave YOUR_KEY  # configure at least one provider
+```
+
+## Usage
+
+```
+deep research on the current state of quantum computing
+```
+
+```
+deep research in ultradeep mode: compare PostgreSQL vs Supabase for our stack
+```
+
+## Research Modes
+
+| Mode | Phases | Duration | Best For |
+|------|--------|----------|----------|
+| Quick | 3 | 2-5 min | Initial exploration |
+| Standard | 6 | 5-10 min | Most research questions |
+| Deep | 8 | 10-20 min | Complex topics, critical decisions |
+| UltraDeep | 8+ | 20-45 min | Comprehensive reports, maximum rigor |
+
+## Pipeline
+
+Scope → Plan → **Retrieve** (parallel search + agents) → Triangulate → Outline Refinement → Synthesize → Critique (with loop-back) → Refine → Package
+
+Key features:
+- **Step 0**: Retrieves current date before searches (prevents stale training-data year assumptions)
+- **Parallel retrieval**: 5-10 concurrent searches + 2-3 focused sub-agents returning structured evidence objects
+- **First Finish Search**: Adaptive quality thresholds by mode
+- **Critique loop-back**: Phase 6 can return to Phase 3 with delta-queries if critical gaps found
+- **Multi-persona red teaming**: Skeptical Practitioner, Adversarial Reviewer, Implementation Engineer (Deep/UltraDeep)
+- **Disk-persisted citations**: `sources.jsonl` survives context compaction and continuation agents
+
+## Output
+
+Reports saved to `~/Documents/[Topic]_Research_[Date]/`:
+- Markdown (primary source of truth)
+- HTML (McKinsey-style, auto-opened in browser)
+- PDF (professional print via WeasyPrint)
+
+Reports >18K words auto-continue via recursive agent spawning with context preservation.
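+
+A finished run's folder typically looks like this (file names are illustrative; the report template governs the actual contents):
+
+```
+~/Documents/[Topic]_Research_[Date]/
+├── report.md        # primary source of truth
+├── report.html      # McKinsey-style, auto-opened
+├── report.pdf       # WeasyPrint print version
+└── sources.jsonl    # disk-persisted citation registry
+```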
+ +## Quality Standards + +- 10+ sources, 3+ per major claim +- Executive summary 200-400 words +- Findings 600-2,000 words each, prose-first (>=80%) +- Full bibliography with URLs, no placeholders +- Automated validation: `validate_report.py` (9 checks) + `verify_citations.py` (DOI/URL/hallucination detection) +- Validation loop: validate → fix → retry (max 3 cycles) + +## Search Tools + +| Tool | Priority | Setup | +|------|----------|-------| +| search-cli | **Primary** — all searches go here first | `brew install search-cli` + API keys | +| WebSearch | Fallback — if search-cli fails or rate-limited | None (built-in) | +| Exa MCP | Optional — semantic/neural search alongside search-cli | MCP config | + +## Architecture + +``` +deep-research/ +├── SKILL.md # Skill entry point (lean, ~100 lines) +├── reference/ +│ ├── methodology.md # 8-phase pipeline details +│ ├── report-assembly.md # Progressive generation strategy +│ ├── quality-gates.md # Validation standards +│ ├── html-generation.md # McKinsey HTML conversion +│ ├── continuation.md # Auto-continuation protocol +│ └── weasyprint_guidelines.md # PDF generation +├── templates/ +│ ├── report_template.md # Report structure template +│ └── mckinsey_report_template.html # HTML report template +├── scripts/ +│ ├── validate_report.py # 9-check structure validator +│ ├── verify_citations.py # DOI/URL/hallucination checker +│ ├── source_evaluator.py # Source credibility scoring +│ ├── citation_manager.py # Citation tracking +│ ├── md_to_html.py # Markdown to HTML converter +│ ├── verify_html.py # HTML verification +│ └── research_engine.py # Core orchestration engine +└── tests/ + └── fixtures/ # Test report fixtures +``` + +## Version History + +| Version | Date | Changes | +|---------|------|---------| +| 2.3.1 | 2026-03-19 | Template/validator harmonization, structured evidence, critique loop-back, multi-persona red teaming | +| 2.3 | 2026-03-19 | Contract harmonization, search-cli integration, dynamic year detection, disk-persisted citations, validation loops | +| 2.2 | 2025-11-05 | Auto-continuation system for unlimited length | +| 2.1 | 2025-11-05 | Progressive file assembly | +| 1.0 | 2025-11-04 | Initial release | + +## License + +MIT - modify as needed for your workflow. diff --git a/.agents/skills/deep-research/SKILL.md b/.agents/skills/deep-research/SKILL.md new file mode 100644 index 000000000..6f76d4256 --- /dev/null +++ b/.agents/skills/deep-research/SKILL.md @@ -0,0 +1,108 @@ +--- +name: deep-research +description: Use when the user needs multi-source research with citation tracking, evidence persistence, and structured report generation. Triggers on "deep research", "comprehensive analysis", "research report", "compare X vs Y", "analyze trends", or "state of the art". Not for simple lookups, debugging, or questions answerable with 1-2 searches. +--- + +# Deep Research + +## Core Purpose + +Deliver citation-tracked research reports through a structured pipeline with evidence persistence, source identity management, claim-level verification, and progressive context management. + +**Autonomy Principle:** Operate independently. Infer assumptions from context. Only stop for critical errors or incomprehensible queries. Surface high-materiality assumptions explicitly in the Introduction and Methodology rather than silently defaulting. + +--- + +## Decision Tree + +``` +Request Analysis ++-- Simple lookup? --> STOP: Use WebSearch ++-- Debugging? --> STOP: Use standard tools ++-- Complex analysis needed? 
--> CONTINUE + +Mode Selection ++-- Initial exploration --> quick (3 phases, 2-5 min) ++-- Standard research --> standard (6 phases, 5-10 min) [DEFAULT] ++-- Critical decision --> deep (8 phases, 10-20 min) ++-- Comprehensive review --> ultradeep (8+ phases, 20-45 min) +``` + +**Default assumptions:** Technical query = technical audience. Comparison = balanced perspective. Trend = recent 1-2 years. + +--- + +## Workflow Overview + +| Phase | Name | Quick | Std | Deep | Ultra | +|-------|------|-------|-----|------|-------| +| 1 | SCOPE | Y | Y | Y | Y | +| 2 | PLAN | - | Y | Y | Y | +| 3 | RETRIEVE | Y | Y | Y | Y | +| 4 | TRIANGULATE | - | Y | Y | Y | +| 4.5 | OUTLINE REFINEMENT | - | Y | Y | Y | +| 5 | SYNTHESIZE | - | Y | Y | Y | +| 6 | CRITIQUE | - | - | Y | Y | +| 7 | REFINE | - | - | Y | Y | +| 8 | PACKAGE | Y | Y | Y | Y | + +**Note:** Phases 3-5 operate as an evidence loop per section (retrieve → evidence store → refine outline → draft → verify claims → delta-retrieve if needed), not as strict sequential gates. + +--- + +## Execution + +**On invocation, load relevant reference files:** + +1. **Phase 1-7:** Load [methodology.md](./reference/methodology.md) for detailed phase instructions +2. **Phase 8 (Report):** Load [report-assembly.md](./reference/report-assembly.md) for progressive generation +3. **HTML/PDF output:** Load [html-generation.md](./reference/html-generation.md) +4. **Quality checks:** Load [quality-gates.md](./reference/quality-gates.md) +5. **Long reports (>18K words):** Load [continuation.md](./reference/continuation.md) + +**Templates:** +- Report structure: [report_template.md](./templates/report_template.md) +- HTML styling: [mckinsey_report_template.html](./templates/mckinsey_report_template.html) + +**Scripts:** +- `python scripts/validate_report.py --report [path]` +- `python scripts/verify_citations.py --report [path]` +- `python scripts/md_to_html.py [markdown_path]` + +--- + +## Output Contract + +**Required sections:** +- Executive Summary (200-400 words) +- Introduction (scope, methodology, assumptions) +- Main Analysis (4-8 findings, 600-2,000 words each, cited) +- Synthesis & Insights (patterns, implications) +- Limitations & Caveats +- Recommendations +- Bibliography (COMPLETE - every citation, no placeholders) +- Methodology Appendix + +**Output files (all to `~/Documents/[Topic]_Research_[YYYYMMDD]/`):** +- Markdown (primary source of truth) +- `sources.jsonl` — stable source registry with canonical IDs +- `evidence.jsonl` — append-only evidence store with quotes and locators +- `claims.jsonl` — atomic claim ledger with support status +- `run_manifest.json` — query, mode, assumptions, provider config +- HTML (McKinsey style, auto-opened) +- PDF (professional print, auto-opened) + +**Quality standards:** +- 10+ sources, 3+ per major claim (cluster-independent, not just count) +- All factual claims cited immediately [N] with evidence backing in `evidence.jsonl` +- Claim-support verification mandatory: no unsupported factual claims pass delivery +- No placeholders, no fabricated citations +- Prose-first (>=80%), bullets sparingly + +--- + +## When to Use / NOT Use + +**Use:** Comprehensive analysis, technology comparisons, state-of-the-art reviews, multi-perspective investigation, market analysis. + +**Do NOT use:** Simple lookups, debugging, 1-2 search answers, quick time-sensitive queries. 
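+
+**Illustrative artifact records** (one JSON object per line; the field names here are illustrative assumptions based on the contract above, not the authoritative schemas, which the scripts own):
+
+```json
+{"source_id": "S14", "url": "https://example.org/study", "title": "Example Study", "credibility": 82}
+{"evidence_id": "E31", "source_id": "S14", "quote": "Latency fell 40% after migration.", "locator": "sec. 3.2"}
+{"claim_id": "C07", "text": "Migration cut latency by roughly 40%", "evidence_ids": ["E31"], "support_status": "supported"}
+```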
diff --git a/.agents/skills/deep-research/reference/continuation.md b/.agents/skills/deep-research/reference/continuation.md new file mode 100644 index 000000000..1a640ee0b --- /dev/null +++ b/.agents/skills/deep-research/reference/continuation.md @@ -0,0 +1,167 @@ +# Auto-Continuation Protocol + +## When to Use + +Trigger auto-continuation when report exceeds 18,000 words in single run. + +--- + +## Strategy Overview + +1. Generate sections 1-10 (stay under 18K words) +2. Save continuation state file with context preservation +3. Spawn continuation agent via Task tool +4. Continuation agent: Reads state -> Generates next batch -> Spawns next if needed +5. Chain continues recursively until complete + +--- + +## Continuation State File + +**Location:** `~/.claude/research_output/continuation_state_[report_id].json` + +```json +{ + "version": "3.0.0", + "report_id": "[unique_id]", + "file_path": "[absolute_path_to_report.md]", + "mode": "[quick|standard|deep|ultradeep]", + + "progress": { + "sections_completed": ["list of section IDs"], + "total_planned_sections": 15, + "word_count_so_far": 12000, + "continuation_count": 1 + }, + + "artifacts": { + "sources_path": "[folder]/sources.jsonl", + "evidence_path": "[folder]/evidence.jsonl", + "claims_path": "[folder]/claims.jsonl", + "run_manifest_path": "[folder]/run_manifest.json" + }, + + "research_context": { + "research_question": "[original question]", + "key_themes": ["theme1", "theme2"], + "main_findings_summary": [ + "Finding 1: [100-word summary]", + "Finding 2: [100-word summary]" + ], + "narrative_arc": "middle" + }, + + "quality_metrics": { + "avg_words_per_finding": 1500, + "citation_density": 5.2, + "prose_vs_bullets_ratio": "85% prose", + "writing_style": "technical-precise-data-driven" + }, + + "next_sections": [ + {"id": 11, "type": "finding", "title": "Finding X", "target_words": 1500}, + {"id": 12, "type": "synthesis", "title": "Synthesis", "target_words": 1000} + ] +} +``` + +--- + +## Spawning Continuation Agent + +Use Task tool: + +``` +Task( + subagent_type="general-purpose", + description="Continue deep-research report generation", + prompt=""" +CONTINUATION TASK: Continue existing deep-research report. + +CRITICAL INSTRUCTIONS: +1. Read continuation state: ~/.claude/research_output/continuation_state_[report_id].json +2. Read existing report: [file_path from state] +3. Read LAST 3 completed sections for flow/style +4. Load research context: themes, narrative arc, writing style +5. Load source registry from state.artifacts.sources_path — use stable source_ids, assign display numbers via citation_manager.py +6. Maintain quality metrics (avg words, citation density, prose ratio) + +YOUR TASK: +Generate next batch (stay under 18,000 words): +[List next_sections from state] + +Use Write/Edit to append to: [file_path] + +QUALITY GATES: +- Words per section: Within +/-20% of avg_words_per_finding +- Citation density: Match +/-0.5 per 1K words +- Prose ratio: Maintain >=80% +- Theme alignment: Section ties to key_themes + +After generating: +- If more sections remain: Update state, spawn next agent +- If final sections: Generate bibliography, verify report, cleanup state +""" +) +``` + +--- + +## Continuation Agent Quality Protocol + +### Context Loading (CRITICAL) + +1. Read continuation_state.json -> Load ALL context +2. Read existing report file -> Review last 3 sections +3. 
Extract patterns:
+   - Sentence structure complexity
+   - Technical terminology used
+   - Citation placement patterns
+   - Paragraph transition style
+
+### Pre-Generation Checklist
+
+- [ ] Loaded research context (themes, question, narrative arc)
+- [ ] Reviewed previous sections for flow
+- [ ] Loaded source registry from artifacts (stable source_ids, not citation numbers)
+- [ ] Loaded quality targets (words, density, style)
+- [ ] Understand narrative position (beginning/middle/end)
+
+### Per-Section Generation
+
+1. Generate section content
+2. Quality checks:
+   - Word count within +/-20%
+   - Citation density matches
+   - Prose ratio >=80%
+   - Theme connection verified
+   - Style consistent
+3. If ANY fails: Regenerate
+4. If passes: Write to file, update state
+
+### Handoff Decision
+
+Calculate: Current words + remaining sections x avg_words_per_section
+- If total < 18K: Generate all + finish
+- If total > 18K: Generate partial, update state, spawn next agent
+
+### Final Agent Responsibilities
+
+- Generate final content sections
+- Generate COMPLETE bibliography from the source registry (state.artifacts.sources_path)
+- Read entire assembled report
+- Run validation: `python scripts/validate_report.py --report [path]`
+- Delete continuation_state.json (cleanup)
+- Report complete to user
+
+---
+
+## User Communication
+
+After spawning continuation:
+```
+Report Generation: Part 1 Complete (N sections, X words)
+Auto-continuing via spawned agent...
+  Next batch: [section list]
+  Progress: [X%] complete
+```
diff --git a/.agents/skills/deep-research/reference/html-generation.md b/.agents/skills/deep-research/reference/html-generation.md
new file mode 100644
index 000000000..4c81f4285
--- /dev/null
+++ b/.agents/skills/deep-research/reference/html-generation.md
@@ -0,0 +1,103 @@
+# HTML Generation: McKinsey Style Report
+
+## Design Principles
+
+- Sharp corners (NO border-radius)
+- Muted corporate colors (navy #003d5c, gray #f8f9fa)
+- Ultra-compact layout
+- Info-first structure
+- 14px base font, compact spacing
+- No decorative gradients or colors
+- NO EMOJIS in final HTML
+
+---
+
+## Generation Steps
+
+### Step 1: Read McKinsey Template
+Load template from: `./templates/mckinsey_report_template.html`
+
+### Step 2: Extract Key Metrics
+Extract 3-4 key quantitative findings for dashboard display at top.
+
+### Step 3: Convert MD to HTML
+
+Use Python script:
+```bash
+cd ~/.claude/skills/deep-research
+python scripts/md_to_html.py [markdown_report_path]
+```
+
+**Script outputs two parts:**
+- **Part A ({{CONTENT}}):** All sections except Bibliography
+- **Part B ({{BIBLIOGRAPHY}}):** Bibliography section only
+
+**Script handles all conversion:**
+- Headers: `##` -> `<h2>`
+- Headers: `###` -> `<h3>`
+- Lists: Markdown bullets -> `<ul>`/`<li>`
diff --git a/.agents/skills/deep-research/scripts/md_to_html.py b/.agents/skills/deep-research/scripts/md_to_html.py
--- /dev/null
+++ b/.agents/skills/deep-research/scripts/md_to_html.py
+def _wrap_paragraphs(html: str) -> str:
+    """Wrap loose text lines in <p> tags; HTML block lines pass through"""
+    result = []
+    in_paragraph = False
+    for line in html.split('\n'):
+        stripped = line.strip()
+        # Already-converted HTML block line - close any open paragraph first
+        if '<h2>' in stripped or '<h3>' in stripped:
+            if in_paragraph:
+                result.append('</p>')
+                in_paragraph = False
+            result.append(line)
+            continue
+
+        # Regular text line - wrap in paragraph
+        if not in_paragraph:
+            result.append('<p>' + line)
+            in_paragraph = True
+        else:
+            result.append(line)
+
+    if in_paragraph:
+        result.append('</p>')
+
+    return '\n'.join(result)
+
+
+def _close_sections(html: str) -> str:
+    """Close all open section divs"""
+    # Count open and closed divs
+    open_divs = html.count('<div')
+    closed_divs = html.count('</div>')
+
+    # Add closing divs for sections
+    # Each section should be closed before the next section starts
+    lines = html.split('\n')
+    result = []
+    section_open = False
+
+    for i, line in enumerate(lines):
+        if '<div class="section"' in line:
+            if section_open:
+                result.append('</div>')  # Close previous section
+            section_open = True
+        result.append(line)
+
+    # Close final section if still open
+    if section_open:
+        result.append('</div>')
+
+    return '\n'.join(result)
+
+
+def main():
+    """Test the converter with a sample markdown file"""
+    import sys
+
+    if len(sys.argv) < 2:
+        print("Usage: python md_to_html.py <markdown_file>")
+        sys.exit(1)
+
+    md_file = Path(sys.argv[1])
+    if not md_file.exists():
+        print(f"Error: File {md_file} not found")
+        sys.exit(1)
+
+    markdown_text = md_file.read_text()
+    content_html, bib_html = convert_markdown_to_html(markdown_text)
+
+    print("=== CONTENT HTML ===")
+    print(content_html[:1000])
+    print("\n=== BIBLIOGRAPHY HTML ===")
+    print(bib_html[:500])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.agents/skills/deep-research/scripts/research_engine.py b/.agents/skills/deep-research/scripts/research_engine.py
new file mode 100755
index 000000000..f0d0cde39
--- /dev/null
+++ b/.agents/skills/deep-research/scripts/research_engine.py
@@ -0,0 +1,584 @@
+#!/usr/bin/env python3
+"""
+Deep Research Engine — STATE SCAFFOLD (not a runtime orchestrator)
+
+This file provides phase instruction templates and research state persistence.
+It does NOT drive Claude Code — Claude is the orchestrator; this file provides
+data structures and CLI utilities for state management.
+
+For the actual research workflow, see reference/methodology.md.
+For the evidence substrate, see scripts/citation_manager.py and scripts/evidence_store.py.
+"""
+
+import argparse
+import json
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, asdict
+from enum import Enum
+
+
+class ResearchPhase(Enum):
+    """Research pipeline phases"""
+    SCOPE = "scope"
+    PLAN = "plan"
+    RETRIEVE = "retrieve"
+    TRIANGULATE = "triangulate"
+    SYNTHESIZE = "synthesize"
+    CRITIQUE = "critique"
+    REFINE = "refine"
+    PACKAGE = "package"
+
+
+class ResearchMode(Enum):
+    """Research depth modes"""
+    QUICK = "quick"            # 3 phases: scope, retrieve, package
+    STANDARD = "standard"      # 6 phases: skip refine and critique
+    DEEP = "deep"              # Full 8 phases
+    ULTRADEEP = "ultradeep"    # 8 phases + extended iterations
+
+
+@dataclass
+class Source:
+    """Represents a research source"""
+    url: str
+    title: str
+    snippet: str
+    retrieved_at: str
+    credibility_score: float = 0.0
+    source_type: str = "web"  # web, academic, documentation, code
+    verification_status: str = "unverified"  # unverified, verified, conflicted
+
+    def to_citation(self, index: int) -> str:
+        """Generate citation string"""
+        return f"[{index}] {self.title} - {self.url} (Retrieved: {self.retrieved_at})"
+
+
+@dataclass
+class ResearchState:
+    """Maintains research state across phases"""
+    query: str
+    mode: ResearchMode
+    phase: ResearchPhase
+    scope: Dict[str, Any]
+    plan: Dict[str, Any]
+    sources: List[Source]
+    findings: List[Dict[str, Any]]
+    synthesis: Dict[str, Any]
+    critique: Dict[str, Any]
+    report: str
+    metadata: Dict[str, Any]
+
+    def save(self, filepath: Path):
+        """Save research state to file with retry logic"""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                with open(filepath, 'w') as f:
+                    json.dump(self._serialize(), f, indent=2)
+                return  # Success
+            except (IOError, OSError) as e:
+                if attempt == max_retries - 1:
+                    # Final attempt failed
+                    raise IOError(f"Failed to save state after {max_retries} attempts: {e}")
+                # Back off briefly before retrying (0.5s, 1s, 1.5s)
+                wait_time = (attempt + 1) * 0.5
+                time.sleep(wait_time)
+
+    def _serialize(self) -> dict:
+        """Convert to serializable dict"""
+        return {
+            'query': self.query,
+            'mode': self.mode.value,
+            
'phase': self.phase.value, + 'scope': self.scope, + 'plan': self.plan, + 'sources': [asdict(s) for s in self.sources], + 'findings': self.findings, + 'synthesis': self.synthesis, + 'critique': self.critique, + 'report': self.report, + 'metadata': self.metadata + } + + @classmethod + def load(cls, filepath: Path) -> 'ResearchState': + """Load research state from file""" + with open(filepath, 'r') as f: + data = json.load(f) + + return cls( + query=data['query'], + mode=ResearchMode(data['mode']), + phase=ResearchPhase(data['phase']), + scope=data['scope'], + plan=data['plan'], + sources=[Source(**s) for s in data['sources']], + findings=data['findings'], + synthesis=data['synthesis'], + critique=data['critique'], + report=data['report'], + metadata=data['metadata'] + ) + + +class ResearchEngine: + """Main research orchestration engine""" + + def __init__(self, mode: ResearchMode = ResearchMode.STANDARD): + self.mode = mode + self.state: Optional[ResearchState] = None + self.output_dir = Path.home() / ".claude" / "research_output" + self.output_dir.mkdir(parents=True, exist_ok=True) + + def initialize_research(self, query: str) -> ResearchState: + """Initialize new research session""" + self.state = ResearchState( + query=query, + mode=self.mode, + phase=ResearchPhase.SCOPE, + scope={}, + plan={}, + sources=[], + findings=[], + synthesis={}, + critique={}, + report="", + metadata={ + 'started_at': datetime.now().isoformat(), + 'version': '1.0' + } + ) + return self.state + + def get_phase_instructions(self, phase: ResearchPhase) -> str: + """Get instructions for current phase""" + instructions = { + ResearchPhase.SCOPE: """ +# Phase 1: SCOPE + +Your task: Define research boundaries and success criteria + +## Execute: +1. Decompose the question into 3-5 core components +2. Identify 2-4 key stakeholder perspectives +3. Define what's IN scope and what's OUT of scope +4. List 3-5 success criteria for this research +5. Document 3-5 assumptions that need validation + +## Output Format: +```json +{ + "core_components": ["component1", "component2", ...], + "stakeholder_perspectives": ["perspective1", "perspective2", ...], + "in_scope": ["item1", "item2", ...], + "out_of_scope": ["item1", "item2", ...], + "success_criteria": ["criteria1", "criteria2", ...], + "assumptions": ["assumption1", "assumption2", ...] +} +``` + +Use extended reasoning to explore multiple framings before finalizing scope. +""", + ResearchPhase.PLAN: """ +# Phase 2: PLAN + +Your task: Create intelligent research roadmap + +## Execute: +1. Identify 5-10 primary sources to investigate +2. List 5-10 secondary/backup sources +3. Map knowledge dependencies (what must be understood first) +4. Create 10-15 search query variations +5. Plan triangulation approach (how to verify claims) +6. Define 3-5 quality gates + +## Output Format: +```json +{ + "primary_sources": ["source_type1", "source_type2", ...], + "secondary_sources": ["source_type1", "source_type2", ...], + "knowledge_dependencies": {"concept1": ["prerequisite1", "prerequisite2"], ...}, + "search_queries": ["query1", "query2", ...], + "triangulation_strategy": "description of verification approach", + "quality_gates": ["gate1", "gate2", ...] +} +``` + +Use Graph-of-Thoughts: branch into 3-4 potential research paths, evaluate, then converge on optimal strategy. +""", + ResearchPhase.RETRIEVE: """ +# Phase 3: RETRIEVE + +Your task: Systematically collect information from multiple sources + +## Execute: +1. Use WebSearch with iterative query refinement (minimum 10 searches) +2. 
Use WebFetch to deep-dive into 5-10 most promising sources +3. Extract key passages with metadata +4. Track information gaps +5. Follow 2-3 promising tangents +6. Ensure source diversity (different domains, perspectives) + +## Tools to Use: +- WebSearch: For current information and broad coverage +- WebFetch: For detailed extraction from specific URLs +- Grep/Read: For local documentation if relevant +- Task: Spawn 2-3 parallel retrieval agents for efficiency + +## Output: +Store all sources with metadata. Each source should include: +- URL/location +- Title +- Key excerpts +- Relevance score +- Source type +- Retrieved timestamp + +Aim for 15-30 distinct sources minimum. +""", + ResearchPhase.TRIANGULATE: """ +# Phase 4: TRIANGULATE + +Your task: Validate information across multiple independent sources + +## Execute: +1. List all major claims from retrieved information +2. For each claim, find 3+ independent confirmatory sources +3. Flag any contradictions or uncertainties +4. Assess source credibility (domain expertise, recency, bias) +5. Document consensus areas vs. debate areas +6. Mark verification status for each claim + +## Quality Standards: +- Core claims MUST have 3+ independent sources +- Flag any single-source claims as "unverified" +- Note information recency +- Identify potential biases + +## Output Format: +```json +{ + "verified_claims": [ + { + "claim": "statement", + "sources": ["source1", "source2", "source3"], + "confidence": "high|medium|low" + } + ], + "unverified_claims": [...], + "contradictions": [ + { + "topic": "what's contradicted", + "viewpoint1": {"claim": "...", "sources": [...]}, + "viewpoint2": {"claim": "...", "sources": [...]} + } + ] +} +``` +""", + ResearchPhase.SYNTHESIZE: """ +# Phase 5: SYNTHESIZE + +Your task: Connect insights and generate novel understanding + +## Execute: +1. Identify 5-10 key patterns across sources +2. Map relationships between concepts +3. Generate 3-5 insights that go beyond source material +4. Create conceptual frameworks or mental models +5. Build argument structures +6. Develop evidence hierarchies + +## Use Extended Reasoning: +- Explore non-obvious connections +- Consider second-order implications +- Think about what sources might be missing +- Generate novel hypotheses + +## Output Format: +```json +{ + "patterns": ["pattern1", "pattern2", ...], + "concept_relationships": {"concept1": ["related_to1", "related_to2"], ...}, + "novel_insights": ["insight1", "insight2", ...], + "frameworks": ["framework_description1", ...], + "key_arguments": [ + { + "argument": "main claim", + "supporting_evidence": ["evidence1", "evidence2"], + "strength": "strong|moderate|weak" + } + ] +} +``` +""", + ResearchPhase.CRITIQUE: """ +# Phase 6: CRITIQUE + +Your task: Rigorously evaluate research quality + +## Execute Red Team Analysis: +1. Check logical consistency +2. Verify citation completeness +3. Identify gaps or weaknesses +4. Assess balance and objectivity +5. Test alternative interpretations +6. Challenge assumptions + +## Red Team Questions: +- What's missing from this research? +- What could be wrong? +- What alternative explanations exist? +- What biases might be present? +- What counterfactuals should be considered? +- What would a skeptic say? 
+ +## Output Format: +```json +{ + "strengths": ["strength1", "strength2", ...], + "weaknesses": ["weakness1", "weakness2", ...], + "gaps": ["gap1", "gap2", ...], + "biases": ["bias1", "bias2", ...], + "improvements_needed": [ + { + "issue": "description", + "recommendation": "how to fix", + "priority": "high|medium|low" + } + ] +} +``` +""", + ResearchPhase.REFINE: """ +# Phase 7: REFINE + +Your task: Address gaps and strengthen weak areas + +## Execute: +1. Conduct additional research for identified gaps +2. Strengthen weak arguments with more evidence +3. Add missing perspectives +4. Resolve contradictions where possible +5. Enhance clarity and structure +6. Verify all revised content + +## Focus On: +- High priority improvements from critique +- Missing stakeholder perspectives +- Weak evidence chains +- Unclear explanations + +## Output: +Updated findings, sources, and synthesis with improvements documented. +""", + ResearchPhase.PACKAGE: """ +# Phase 8: PACKAGE + +Your task: Deliver professional, actionable research report + +## Generate Complete Report: + +```markdown +# Research Report: [Topic] + +## Executive Summary +[3-5 key findings bullets] +[Primary recommendation] +[Confidence level: High/Medium/Low] + +## Introduction +### Research Question +[Original question] + +### Scope & Methodology +[What was investigated and how] + +### Key Assumptions +[Important assumptions made] + +## Main Analysis + +### Finding 1: [Title] +[Detailed explanation with evidence] +[Citations: [1], [2], [3]] + +### Finding 2: [Title] +[Detailed explanation with evidence] +[Citations: [4], [5], [6]] + +[Continue for all findings...] + +## Synthesis & Insights +[Patterns and connections] +[Novel insights] +[Implications] + +## Limitations & Caveats +[Known gaps] +[Assumptions] +[Areas of uncertainty] + +## Recommendations +[Action items] +[Next steps] +[Further research needs] + +## Bibliography +[1] Source 1 full citation +[2] Source 2 full citation +... + +## Appendix: Methodology +[Research process] +[Sources consulted] +[Verification approach] +``` + +Save report to file with timestamp. +""" + } + + return instructions.get(phase, "No instructions available for this phase") + + def execute_phase(self, phase: ResearchPhase) -> Dict[str, Any]: + """Execute a research phase""" + print(f"\n{'='*80}") + print(f"PHASE {phase.value.upper()}: Starting...") + print(f"{'='*80}\n") + + instructions = self.get_phase_instructions(phase) + print(instructions) + + # In real usage, Claude will execute these instructions + # This returns a structured result that Claude should populate + result = { + 'phase': phase.value, + 'status': 'instructions_displayed', + 'timestamp': datetime.now().isoformat() + } + + return result + + def run_pipeline(self, query: str) -> str: + """Run complete research pipeline""" + print(f"\n{'#'*80}") + print(f"# DEEP RESEARCH ENGINE") + print(f"# Query: {query}") + print(f"# Mode: {self.mode.value}") + print(f"{'#'*80}\n") + + # Initialize research + self.initialize_research(query) + + # Determine phases based on mode + phases = self._get_phases_for_mode() + + # Execute each phase + for phase in phases: + self.state.phase = phase + result = self.execute_phase(phase) + + # Save state after each phase + state_file = self.output_dir / f"research_state_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + self.state.save(state_file) + print(f"\n✓ Phase {phase.value} complete. 
State saved to: {state_file}\n")
+
+        # Generate report path
+        report_file = self.output_dir / f"research_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
+
+        print(f"\n{'='*80}")
+        print(f"RESEARCH PIPELINE COMPLETE")
+        print(f"Report will be saved to: {report_file}")
+        print(f"{'='*80}\n")
+
+        return str(report_file)
+
+    def _get_phases_for_mode(self) -> List[ResearchPhase]:
+        """Get phases based on research mode"""
+        if self.mode == ResearchMode.QUICK:
+            return [
+                ResearchPhase.SCOPE,
+                ResearchPhase.RETRIEVE,
+                ResearchPhase.PACKAGE
+            ]
+        elif self.mode == ResearchMode.STANDARD:
+            return [
+                ResearchPhase.SCOPE,
+                ResearchPhase.PLAN,
+                ResearchPhase.RETRIEVE,
+                ResearchPhase.TRIANGULATE,
+                ResearchPhase.SYNTHESIZE,
+                ResearchPhase.PACKAGE
+            ]
+        elif self.mode == ResearchMode.DEEP:
+            return list(ResearchPhase)
+        elif self.mode == ResearchMode.ULTRADEEP:
+            # In ultradeep, we might iterate some phases
+            return list(ResearchPhase)
+
+        return list(ResearchPhase)
+
+
+def main():
+    """CLI entry point"""
+    parser = argparse.ArgumentParser(
+        description="Deep Research Engine for Claude Code",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python research_engine.py --query "state of quantum computing 2025" --mode deep
+  python research_engine.py --query "PostgreSQL vs Supabase comparison" --mode standard
+  python research_engine.py -q "longevity biotech funding trends" -m ultradeep
+        """
+    )
+
+    parser.add_argument(
+        '--query', '-q',
+        type=str,
+        required=False,
+        help='Research question or topic (optional when using --resume)'
+    )
+
+    parser.add_argument(
+        '--mode', '-m',
+        type=str,
+        choices=['quick', 'standard', 'deep', 'ultradeep'],
+        default='standard',
+        help='Research depth mode (default: standard)'
+    )
+
+    parser.add_argument(
+        '--resume',
+        type=str,
+        help='Resume from saved state file'
+    )
+
+    args = parser.parse_args()
+
+    if not args.query and not args.resume:
+        parser.error('--query is required unless --resume is given')
+
+    # Initialize engine
+    mode = ResearchMode(args.mode)
+    engine = ResearchEngine(mode=mode)
+
+    query = args.query
+    if args.resume:
+        # Load previous state
+        state_file = Path(args.resume)
+        if not state_file.exists():
+            print(f"Error: State file not found: {state_file}", file=sys.stderr)
+            sys.exit(1)
+        engine.state = ResearchState.load(state_file)
+        print(f"Resumed research from: {state_file}")
+        # NOTE: run_pipeline() builds a fresh state; a resumed state
+        # currently only supplies the original query when none is given.
+        if not query:
+            query = engine.state.query
+
+    # Run pipeline
+    report_path = engine.run_pipeline(query)
+
+    print(f"\nResearch complete!
Report path: {report_path}") + print(f"\nNow Claude should execute each phase using the displayed instructions.") + + +if __name__ == '__main__': + main() diff --git a/.agents/skills/deep-research/scripts/source_evaluator.py b/.agents/skills/deep-research/scripts/source_evaluator.py new file mode 100755 index 000000000..d1f7533f8 --- /dev/null +++ b/.agents/skills/deep-research/scripts/source_evaluator.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 +""" +Source Credibility Evaluator +Assesses source quality, credibility, and potential biases +""" + +from dataclasses import dataclass +from typing import List, Dict, Optional +from urllib.parse import urlparse +from datetime import datetime, timedelta +import re + + +@dataclass +class CredibilityScore: + """Represents source credibility assessment""" + overall_score: float # 0-100 + domain_authority: float # 0-100 + recency: float # 0-100 + expertise: float # 0-100 + bias_score: float # 0-100 (higher = more neutral) + factors: Dict[str, str] + recommendation: str # "high_trust", "moderate_trust", "low_trust", "verify" + + +class SourceEvaluator: + """Evaluates source credibility and quality""" + + # Domain reputation tiers + HIGH_AUTHORITY_DOMAINS = { + # Academic & Research + 'arxiv.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org', + 'thelancet.com', 'springer.com', 'sciencedirect.com', 'plos.org', + 'ieee.org', 'acm.org', 'pubmed.ncbi.nlm.nih.gov', + + # Government & International Organizations + 'nih.gov', 'cdc.gov', 'who.int', 'fda.gov', 'nasa.gov', + 'gov.uk', 'europa.eu', 'un.org', + + # Established Tech Documentation + 'docs.python.org', 'developer.mozilla.org', 'docs.microsoft.com', + 'cloud.google.com', 'aws.amazon.com', 'kubernetes.io', + + # Reputable News (Fact-check verified) + 'reuters.com', 'apnews.com', 'bbc.com', 'economist.com', + 'nature.com/news', 'scientificamerican.com' + } + + MODERATE_AUTHORITY_DOMAINS = { + # Tech News & Analysis + 'techcrunch.com', 'theverge.com', 'arstechnica.com', 'wired.com', + 'zdnet.com', 'cnet.com', + + # Industry Publications + 'forbes.com', 'bloomberg.com', 'wsj.com', 'ft.com', + + # Educational + 'wikipedia.org', 'britannica.com', 'khanacademy.org', + + # Tech Blogs (established) + 'medium.com', 'dev.to', 'stackoverflow.com', 'github.com' + } + + LOW_AUTHORITY_INDICATORS = [ + 'blogspot.com', 'wordpress.com', 'wix.com', 'substack.com' + ] + + def __init__(self): + pass + + def evaluate_source( + self, + url: str, + title: str, + content: Optional[str] = None, + publication_date: Optional[str] = None, + author: Optional[str] = None + ) -> CredibilityScore: + """Evaluate source credibility""" + + domain = self._extract_domain(url) + + # Calculate component scores + domain_score = self._evaluate_domain_authority(domain) + recency_score = self._evaluate_recency(publication_date) + expertise_score = self._evaluate_expertise(domain, title, author) + bias_score = self._evaluate_bias(domain, title, content) + + # Calculate overall score (weighted average) + overall = ( + domain_score * 0.35 + + recency_score * 0.20 + + expertise_score * 0.25 + + bias_score * 0.20 + ) + + # Determine factors + factors = self._identify_factors( + domain, domain_score, recency_score, expertise_score, bias_score + ) + + # Generate recommendation + recommendation = self._generate_recommendation(overall) + + return CredibilityScore( + overall_score=round(overall, 2), + domain_authority=round(domain_score, 2), + recency=round(recency_score, 2), + expertise=round(expertise_score, 2), + bias_score=round(bias_score, 
2),
+            factors=factors,
+            recommendation=recommendation
+        )
+
+    def _extract_domain(self, url: str) -> str:
+        """Extract domain from URL"""
+        parsed = urlparse(url)
+        domain = parsed.netloc.lower()
+        # Remove a leading www. prefix (only as a prefix, not mid-domain)
+        if domain.startswith('www.'):
+            domain = domain[4:]
+        return domain
+
+    def _evaluate_domain_authority(self, domain: str) -> float:
+        """Evaluate domain authority (0-100)"""
+        if domain in self.HIGH_AUTHORITY_DOMAINS:
+            return 90.0
+        elif domain in self.MODERATE_AUTHORITY_DOMAINS:
+            return 70.0
+        elif any(indicator in domain for indicator in self.LOW_AUTHORITY_INDICATORS):
+            return 40.0
+        else:
+            # Unknown domain - moderate skepticism
+            return 55.0
+
+    def _evaluate_recency(self, publication_date: Optional[str]) -> float:
+        """Evaluate information recency (0-100)"""
+        if not publication_date:
+            return 50.0  # Unknown date
+
+        try:
+            pub_date = datetime.fromisoformat(publication_date.replace('Z', '+00:00'))
+            # Use an aware "now" when the parsed date carries a timezone,
+            # otherwise naive-vs-aware subtraction raises TypeError
+            age = datetime.now(tz=pub_date.tzinfo) - pub_date
+
+            # Recency scoring
+            if age < timedelta(days=90):  # < 3 months
+                return 100.0
+            elif age < timedelta(days=365):  # < 1 year
+                return 85.0
+            elif age < timedelta(days=730):  # < 2 years
+                return 70.0
+            elif age < timedelta(days=1825):  # < 5 years
+                return 50.0
+            else:
+                return 30.0
+
+        except Exception:
+            return 50.0
+
+    def _evaluate_expertise(
+        self,
+        domain: str,
+        title: str,
+        author: Optional[str]
+    ) -> float:
+        """Evaluate source expertise (0-100)"""
+        score = 50.0
+
+        # Academic/research domains get high expertise
+        if any(d in domain for d in ['arxiv', 'nature', 'science', 'ieee', 'acm']):
+            score += 30
+
+        # Government/official sources
+        if '.gov' in domain or 'who.int' in domain:
+            score += 25
+
+        # Technical documentation
+        if 'docs.' in domain or 'documentation' in title.lower():
+            score += 20
+
+        # Author credentials (if available); loop variable renamed so it
+        # doesn't shadow the title parameter
+        if author:
+            if any(cred in author.lower() for cred in ['dr.', 'phd', 'professor']):
+                score += 15
+
+        return min(score, 100.0)
+
+    def _evaluate_bias(
+        self,
+        domain: str,
+        title: str,
+        content: Optional[str]
+    ) -> float:
+        """Evaluate potential bias (0-100, higher = more neutral)"""
+        score = 70.0  # Start neutral
+
+        # Check for sensationalism in title
+        sensational_indicators = [
+            '!', 'shocking', 'unbelievable', 'you won\'t believe',
+            'secret', 'they don\'t want you to know'
+        ]
+        title_lower = title.lower()
+        if any(indicator in title_lower for indicator in sensational_indicators):
+            score -= 20
+
+        # Academic sources are typically less biased
+        if any(d in domain for d in ['arxiv', 'nature', 'science', 'ieee']):
+            score += 20
+
+        # Check for balance in content (if available)
+        if content:
+            # Look for balanced language
+            balanced_indicators = ['however', 'although', 'on the other hand', 'critics argue']
+            if any(indicator in content.lower() for indicator in balanced_indicators):
+                score += 10
+
+        return min(max(score, 0), 100.0)
+
+    def _identify_factors(
+        self,
+        domain: str,
+        domain_score: float,
+        recency_score: float,
+        expertise_score: float,
+        bias_score: float
+    ) -> Dict[str, str]:
+        """Identify key credibility factors"""
+        factors = {}
+
+        if domain_score >= 85:
+            factors['domain'] = "High authority domain"
+        elif domain_score <= 45:
+            factors['domain'] = "Low authority domain - verify claims"
+
+        if recency_score >= 85:
+            factors['recency'] = "Recent information"
+        elif recency_score <= 40:
+            factors['recency'] = "Outdated information - verify currency"
+
+        if expertise_score >= 80:
+            factors['expertise'] = "Expert source"
+        elif expertise_score <= 45:
+            factors['expertise'] =
"Limited expertise indicators" + + if bias_score >= 80: + factors['bias'] = "Balanced perspective" + elif bias_score <= 50: + factors['bias'] = "Potential bias detected" + + return factors + + def _generate_recommendation(self, overall_score: float) -> str: + """Generate trust recommendation""" + if overall_score >= 80: + return "high_trust" + elif overall_score >= 60: + return "moderate_trust" + elif overall_score >= 40: + return "low_trust" + else: + return "verify" + + +# Example usage +if __name__ == '__main__': + evaluator = SourceEvaluator() + + # Test sources + test_sources = [ + { + 'url': 'https://www.nature.com/articles/s41586-2025-12345', + 'title': 'Breakthrough in Quantum Computing', + 'publication_date': '2025-10-15' + }, + { + 'url': 'https://someblog.wordpress.com/shocking-discovery', + 'title': 'SHOCKING! You Won\'t Believe This Discovery!', + 'publication_date': '2020-01-01' + }, + { + 'url': 'https://docs.python.org/3/library/asyncio.html', + 'title': 'asyncio — Asynchronous I/O', + 'publication_date': '2025-11-01' + } + ] + + for source in test_sources: + score = evaluator.evaluate_source(**source) + print(f"\nSource: {source['title']}") + print(f"URL: {source['url']}") + print(f"Overall Score: {score.overall_score}/100") + print(f"Recommendation: {score.recommendation}") + print(f"Factors: {score.factors}") diff --git a/.agents/skills/deep-research/scripts/validate_report.py b/.agents/skills/deep-research/scripts/validate_report.py new file mode 100755 index 000000000..b654226fe --- /dev/null +++ b/.agents/skills/deep-research/scripts/validate_report.py @@ -0,0 +1,354 @@ +#!/usr/bin/env python3 +""" +Report Validation Script +Ensures research reports meet quality standards before delivery +""" + +import argparse +import re +import sys +from pathlib import Path +from typing import List, Tuple, Dict + + +class ReportValidator: + """Validates research report quality""" + + def __init__(self, report_path: Path): + self.report_path = report_path + self.content = self._read_report() + self.errors: List[str] = [] + self.warnings: List[str] = [] + + def _read_report(self) -> str: + """Read report file""" + try: + with open(self.report_path, 'r', encoding='utf-8') as f: + return f.read() + except Exception as e: + print(f"❌ ERROR: Cannot read report: {e}") + sys.exit(1) + + def validate(self) -> bool: + """Run all validation checks""" + print(f"\n{'='*60}") + print(f"VALIDATING REPORT: {self.report_path.name}") + print(f"{'='*60}\n") + + checks = [ + ("Executive Summary", self._check_executive_summary), + ("Required Sections", self._check_required_sections), + ("Citations", self._check_citations), + ("Bibliography", self._check_bibliography), + ("Placeholder Text", self._check_placeholders), + ("Content Truncation", self._check_content_truncation), + ("Word Count", self._check_word_count), + ("Source Count", self._check_source_count), + ("Broken Links", self._check_broken_references), + ] + + for check_name, check_func in checks: + print(f"⏳ Checking: {check_name}...", end=" ") + passed = check_func() + if passed: + print("✅ PASS") + else: + print("❌ FAIL") + + self._print_summary() + + return len(self.errors) == 0 + + def _check_executive_summary(self) -> bool: + """Check executive summary exists and is 200-400 words""" + pattern = r'## Executive Summary(.*?)(?=##|\Z)' + match = re.search(pattern, self.content, re.DOTALL | re.IGNORECASE) + + if not match: + self.errors.append("Missing 'Executive Summary' section") + return False + + summary = match.group(1).strip() + 
word_count = len(summary.split()) + + if word_count > 400: + self.warnings.append(f"Executive summary too long: {word_count} words (should be ≤400)") + + if word_count < 50: + self.warnings.append(f"Executive summary too short: {word_count} words (should be ≥50)") + + return True + + def _check_required_sections(self) -> bool: + """Check all required sections are present""" + required = [ + "Executive Summary", + "Introduction", + "Main Analysis", + "Synthesis", + "Limitations", + "Recommendations", + "Bibliography", + "Methodology" + ] + + # Recommended sections (warnings if missing, not errors) + recommended = [ + "Counterevidence Register", + "Claims-Evidence Table" + ] + + missing = [] + for section in required: + if not re.search(rf'##.*{section}', self.content, re.IGNORECASE): + missing.append(section) + + if missing: + self.errors.append(f"Missing sections: {', '.join(missing)}") + return False + + # Check recommended sections (warnings only) + missing_recommended = [] + for section in recommended: + if not re.search(rf'##.*{section}', self.content, re.IGNORECASE): + missing_recommended.append(section) + + if missing_recommended: + self.warnings.append(f"Missing recommended sections (for academic rigor): {', '.join(missing_recommended)}") + + return True + + def _check_citations(self) -> bool: + """Check citation format and presence""" + # Find all citation references [1], [2], etc. + citations = re.findall(r'\[(\d+)\]', self.content) + + if not citations: + self.errors.append("No citations found in report") + return False + + unique_citations = set(citations) + + if len(unique_citations) < 10: + self.warnings.append(f"Only {len(unique_citations)} unique sources cited (recommended: ≥10)") + + # Check for consecutive citation numbers + citation_nums = sorted([int(c) for c in unique_citations]) + if citation_nums: + max_citation = max(citation_nums) + expected = set(range(1, max_citation + 1)) + missing = expected - set(citation_nums) + + if missing: + self.warnings.append(f"Non-consecutive citation numbers, missing: {sorted(missing)}") + + return True + + def _check_bibliography(self) -> bool: + """Check bibliography exists, matches citations, and has no truncation placeholders""" + pattern = r'## Bibliography(.*?)(?=##|\Z)' + match = re.search(pattern, self.content, re.DOTALL | re.IGNORECASE) + + if not match: + self.errors.append("Missing 'Bibliography' section") + return False + + bib_section = match.group(1) + + # CRITICAL: Check for truncation placeholders (2025 CiteGuard enhancement) + truncation_patterns = [ + (r'\[\d+-\d+\]', 'Citation range (e.g., [8-75])'), + (r'Additional.*citations', 'Phrase "Additional citations"'), + (r'would be included', 'Phrase "would be included"'), + (r'\[\.\.\.continue', 'Pattern "[...continue"'), + (r'\[Continue with', 'Pattern "[Continue with"'), + (r'etc\.(?!\w)', 'Standalone "etc."'), + (r'and so on', 'Phrase "and so on"'), + ] + + for pattern_re, description in truncation_patterns: + if re.search(pattern_re, bib_section, re.IGNORECASE): + self.errors.append(f"⚠️ CRITICAL: Bibliography contains truncation placeholder: {description}") + self.errors.append(f" This makes the report UNUSABLE - complete bibliography required") + return False + + # Count bibliography entries [1], [2], etc. 
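+        # A valid entry starts its line with the citation number in brackets,
+        # e.g. (illustrative): [12] Doe, J. (2024). "Example Title". Venue. https://example.org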
+ bib_entries = re.findall(r'^\[(\d+)\]', bib_section, re.MULTILINE) + + if not bib_entries: + self.errors.append("Bibliography has no entries") + return False + + # Check citation number continuity (no gaps) + bib_nums = sorted([int(n) for n in bib_entries]) + if bib_nums: + expected = list(range(1, bib_nums[-1] + 1)) + actual = bib_nums + missing = [n for n in expected if n not in actual] + if missing: + self.errors.append(f"Bibliography has gaps in numbering: missing {missing}") + return False + + # Find citations in text + text_citations = set(re.findall(r'\[(\d+)\]', self.content)) + bib_citations = set(bib_entries) + + # Check all citations have bibliography entries + missing_in_bib = text_citations - bib_citations + if missing_in_bib: + self.errors.append(f"Citations missing from bibliography: {sorted(missing_in_bib)}") + return False + + # Check for unused bibliography entries + unused = bib_citations - text_citations + if unused: + self.warnings.append(f"Unused bibliography entries: {sorted(unused)}") + + return True + + def _check_placeholders(self) -> bool: + """Check for placeholder text that shouldn't be in final report""" + placeholders = [ + 'TBD', 'TODO', 'FIXME', 'XXX', + '[citation needed]', '[needs citation]', + '[placeholder]', '[TODO]', '[TBD]' + ] + + found_placeholders = [] + for placeholder in placeholders: + if placeholder in self.content: + found_placeholders.append(placeholder) + + if found_placeholders: + self.errors.append(f"Found placeholder text: {', '.join(found_placeholders)}") + return False + + return True + + def _check_content_truncation(self) -> bool: + """Check for content truncation patterns (2025 Progressive Assembly enhancement)""" + truncation_patterns = [ + (r'Content continues', 'Phrase "Content continues"'), + (r'Due to length', 'Phrase "Due to length"'), + (r'would continue', 'Phrase "would continue"'), + (r'\[Sections \d+-\d+', 'Pattern "[Sections X-Y"'), + (r'Additional sections', 'Phrase "Additional sections"'), + (r'comprehensive.*word document that continues', 'Pattern "comprehensive...document that continues"'), + ] + + for pattern_re, description in truncation_patterns: + if re.search(pattern_re, self.content, re.IGNORECASE): + self.errors.append(f"⚠️ CRITICAL: Content truncation detected: {description}") + self.errors.append(f" Report is INCOMPLETE and UNUSABLE - regenerate with progressive assembly") + return False + + return True + + def _check_word_count(self) -> bool: + """Check overall report length""" + word_count = len(self.content.split()) + + if word_count < 500: + self.warnings.append(f"Report is very short: {word_count} words (consider expanding)") + # No upper limit warning - progressive assembly supports unlimited lengths + + return True + + def _check_source_count(self) -> bool: + """Check minimum source count""" + pattern = r'## Bibliography(.*?)(?=##|\Z)' + match = re.search(pattern, self.content, re.DOTALL | re.IGNORECASE) + + if not match: + return True # Already caught in bibliography check + + bib_section = match.group(1) + bib_entries = re.findall(r'^\[(\d+)\]', bib_section, re.MULTILINE) + + source_count = len(set(bib_entries)) + + if source_count < 10: + self.warnings.append(f"Only {source_count} sources (recommended: ≥10)") + + return True + + def _check_broken_references(self) -> bool: + """Check for broken internal references""" + # Find all markdown links [text](./path) + internal_links = re.findall(r'\[.*?\]\((\.\/.*?)\)', self.content) + + broken = [] + for link in internal_links: + # Remove anchor if 
present + link_path = link.split('#')[0] + full_path = self.report_path.parent / link_path + + if not full_path.exists(): + broken.append(link) + + if broken: + self.errors.append(f"Broken internal links: {', '.join(broken)}") + return False + + return True + + def _print_summary(self): + """Print validation summary""" + print(f"\n{'='*60}") + print(f"VALIDATION SUMMARY") + print(f"{'='*60}\n") + + if self.errors: + print(f"❌ ERRORS ({len(self.errors)}):") + for error in self.errors: + print(f" • {error}") + print() + + if self.warnings: + print(f"⚠️ WARNINGS ({len(self.warnings)}):") + for warning in self.warnings: + print(f" • {warning}") + print() + + if not self.errors and not self.warnings: + print("✅ ALL CHECKS PASSED - Report meets quality standards!\n") + elif not self.errors: + print("✅ VALIDATION PASSED (with warnings)\n") + else: + print("❌ VALIDATION FAILED - Please fix errors before delivery\n") + + +def main(): + parser = argparse.ArgumentParser( + description="Validate research report quality", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python validate_report.py --report report.md + python validate_report.py -r ~/.claude/research_output/research_report_20251104_153045.md + """ + ) + + parser.add_argument( + '--report', '-r', + type=str, + required=True, + help='Path to research report markdown file' + ) + + args = parser.parse_args() + + report_path = Path(args.report) + + if not report_path.exists(): + print(f"❌ ERROR: Report file not found: {report_path}") + sys.exit(1) + + validator = ReportValidator(report_path) + passed = validator.validate() + + sys.exit(0 if passed else 1) + + +if __name__ == '__main__': + main() diff --git a/.agents/skills/deep-research/scripts/verify_citations.py b/.agents/skills/deep-research/scripts/verify_citations.py new file mode 100755 index 000000000..fb633bbc4 --- /dev/null +++ b/.agents/skills/deep-research/scripts/verify_citations.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python3 +""" +Citation Verification Script + +Catches fabricated citations by checking: +1. DOI resolution (via doi.org) +2. Basic metadata matching (title similarity, year match) +3. URL accessibility verification +4. Hallucination pattern detection (generic titles, suspicious patterns) +5. Flags suspicious entries for manual review + +Usage: + python verify_citations.py --report [path] + python verify_citations.py --report [path] --strict # Fail on any unverified + +Does NOT require API keys - uses free DOI resolver and heuristics. 
+""" + +import sys +import argparse +import re +from pathlib import Path +from typing import List, Dict, Tuple +from urllib import request, error +from urllib.parse import quote +import json +import time +from datetime import datetime + +class CitationVerifier: + """Verify citations in research report""" + + def __init__(self, report_path: Path, strict_mode: bool = False): + self.report_path = report_path + self.strict_mode = strict_mode + self.content = self._read_report() + self.suspicious = [] + self.verified = [] + self.errors = [] + + # Hallucination detection patterns (2025 CiteGuard enhancement) + self.suspicious_patterns = [ + # Generic academic-sounding but fake patterns + (r'^(A |An |The )?(Study|Analysis|Review|Survey|Investigation) (of|on|into)', + "Generic academic title pattern"), + (r'^(Recent|Current|Modern|Contemporary) (Advances|Developments|Trends) in', + "Generic 'advances' title pattern"), + # Too perfect, templated titles + (r'^[A-Z][a-z]+ [A-Z][a-z]+: A (Comprehensive|Complete|Systematic) (Review|Analysis|Guide)$', + "Too perfect, templated structure"), + ] + + def _read_report(self) -> str: + """Read report file""" + try: + with open(self.report_path, 'r', encoding='utf-8') as f: + return f.read() + except Exception as e: + print(f"L ERROR: Cannot read report: {e}") + sys.exit(1) + + def extract_bibliography(self) -> List[Dict]: + """Extract bibliography entries from report""" + pattern = r'## Bibliography(.*?)(?=##|\Z)' + match = re.search(pattern, self.content, re.DOTALL | re.IGNORECASE) + + if not match: + self.errors.append("No Bibliography section found") + return [] + + bib_section = match.group(1) + + # Parse entries: [N] Author (Year). "Title". Venue. URL + entries = [] + lines = bib_section.strip().split('\n') + + current_entry = None + for line in lines: + line = line.strip() + if not line: + continue + + # Check if starts with citation number [N] + match_num = re.match(r'^\[(\d+)\]\s+(.+)$', line) + if match_num: + if current_entry: + entries.append(current_entry) + + num = match_num.group(1) + rest = match_num.group(2) + + # Try to parse: Author (Year). "Title". Venue. URL + year_match = re.search(r'\((\d{4})\)', rest) + title_match = re.search(r'"([^"]+)"', rest) + doi_match = re.search(r'doi\.org/(10\.\S+)', rest) + url_match = re.search(r'https?://[^\s\)]+', rest) + + current_entry = { + 'num': num, + 'raw': rest, + 'year': year_match.group(1) if year_match else None, + 'title': title_match.group(1) if title_match else None, + 'doi': doi_match.group(1) if doi_match else None, + 'url': url_match.group(0) if url_match else None + } + elif current_entry: + # Multi-line entry, append to raw + current_entry['raw'] += ' ' + line + + if current_entry: + entries.append(current_entry) + + return entries + + def verify_doi(self, doi: str) -> Tuple[bool, Dict]: + """ + Verify DOI exists and get metadata. 
+ Returns (success, metadata_dict) + """ + if not doi: + return False, {} + + try: + # Use content negotiation to get JSON metadata + url = f"https://doi.org/{quote(doi)}" + req = request.Request(url) + req.add_header('Accept', 'application/vnd.citationstyles.csl+json') + + with request.urlopen(req, timeout=10) as response: + data = json.loads(response.read().decode('utf-8')) + + return True, { + 'title': data.get('title', ''), + 'year': data.get('issued', {}).get('date-parts', [[None]])[0][0], + 'authors': [ + f"{a.get('family', '')} {a.get('given', '')}" + for a in data.get('author', []) + ], + 'venue': data.get('container-title', '') + } + except error.HTTPError as e: + if e.code == 404: + return False, {'error': 'DOI not found (404)'} + return False, {'error': f'HTTP {e.code}'} + except Exception as e: + return False, {'error': str(e)} + + def verify_url(self, url: str) -> Tuple[bool, str]: + """ + Verify URL is accessible (2025 CiteGuard enhancement). + Returns (accessible, status_message) + """ + if not url: + return False, "No URL" + + try: + # HEAD request to check accessibility without downloading + req = request.Request(url, method='HEAD') + req.add_header('User-Agent', 'Mozilla/5.0 (Research Citation Verifier)') + + with request.urlopen(req, timeout=10) as response: + if response.status == 200: + return True, "URL accessible" + else: + return False, f"HTTP {response.status}" + except error.HTTPError as e: + return False, f"HTTP {e.code}" + except error.URLError as e: + return False, f"URL error: {e.reason}" + except Exception as e: + return False, f"Connection error: {str(e)[:50]}" + + def detect_hallucination_patterns(self, entry: Dict) -> List[str]: + """ + Detect common LLM hallucination patterns in citations (2025 CiteGuard). + Returns list of detected issues. + """ + issues = [] + title = entry.get('title', '') + + if not title: + return issues + + # Check against suspicious patterns + for pattern, description in self.suspicious_patterns: + if re.match(pattern, title, re.IGNORECASE): + issues.append(f"Suspicious title pattern: {description}") + + # Check for overly generic titles + generic_words = ['overview', 'introduction', 'guide', 'handbook', 'manual'] + if any(word in title.lower() for word in generic_words) and len(title.split()) < 5: + issues.append("Very generic short title") + + # Check for placeholder-like titles + if any(x in title.lower() for x in ['tbd', 'todo', 'placeholder', 'example']): + issues.append("Placeholder text in title") + + # Check for inconsistent metadata + if entry.get('year'): + year = int(entry['year']) + current_year = datetime.now().year + # Very recent without DOI or URL is suspicious + if year >= current_year - 1 and not entry.get('doi') and not entry.get('url'): + issues.append(f"Recent year ({year}) with no verification method") + # Future year is definitely wrong + if year > current_year: + issues.append(f"Future year: {year} (current: {current_year})") + # Very old with modern phrasing is suspicious + if year < 2000 and any(word in title.lower() for word in ['ai', 'llm', 'gpt', 'transformer']): + issues.append(f"Anachronistic: pre-2000 ({year}) citation mentioning modern AI terms") + + return issues + + def check_title_similarity(self, title1: str, title2: str) -> float: + """ + Simple title similarity check (word overlap). 
+        Returns score 0.0-1.0
+        """
+        if not title1 or not title2:
+            return 0.0
+
+        # Normalize: lowercase, remove punctuation, split
+        def normalize(s):
+            s = s.lower()
+            s = re.sub(r'[^\w\s]', ' ', s)
+            return set(s.split())
+
+        words1 = normalize(title1)
+        words2 = normalize(title2)
+
+        if not words1 or not words2:
+            return 0.0
+
+        overlap = len(words1 & words2)
+        total = len(words1 | words2)
+
+        return overlap / total if total > 0 else 0.0
+
+    def verify_entry(self, entry: Dict) -> Dict:
+        """Verify a single bibliography entry (Enhanced 2025 with CiteGuard)"""
+        result = {
+            'num': entry['num'],
+            'status': 'unknown',
+            'issues': [],
+            'metadata': {},
+            'verification_methods': []
+        }
+
+        # STEP 1: Run hallucination detection (CiteGuard 2025)
+        hallucination_issues = self.detect_hallucination_patterns(entry)
+        if hallucination_issues:
+            result['issues'].extend(hallucination_issues)
+            result['status'] = 'suspicious'
+
+        # STEP 2: Has DOI?
+        if entry['doi']:
+            print(f"  [{entry['num']}] Checking DOI {entry['doi']}...", end=' ')
+            success, metadata = self.verify_doi(entry['doi'])
+
+            if success:
+                result['metadata'] = metadata
+                result['status'] = 'verified'
+                print("✓")
+
+                # Check title similarity if we have both
+                if entry['title'] and metadata.get('title'):
+                    similarity = self.check_title_similarity(
+                        entry['title'],
+                        metadata['title']
+                    )
+
+                    if similarity < 0.5:
+                        result['issues'].append(
+                            f"Title mismatch (similarity: {similarity:.1%})"
+                        )
+                        result['status'] = 'suspicious'
+
+                # Check year match
+                if entry['year'] and metadata.get('year'):
+                    if int(entry['year']) != int(metadata['year']):
+                        result['issues'].append(
+                            f"Year mismatch: report says {entry['year']}, DOI says {metadata['year']}"
+                        )
+                        result['status'] = 'suspicious'
+
+            else:
+                print(f"✗ {metadata.get('error', 'Failed')}")
+                result['status'] = 'unverified'
+                result['issues'].append(f"DOI resolution failed: {metadata.get('error', 'unknown')}")
+
+        # STEP 3: Check URL accessibility (if no DOI or DOI failed)
+        if entry['url'] and result['status'] != 'verified':
+            url_ok, url_status = self.verify_url(entry['url'])
+            if url_ok:
+                result['verification_methods'].append('URL')
+                # Upgrade status if URL verifies
+                if result['status'] in ['unknown', 'no_doi', 'unverified']:
+                    result['status'] = 'url_verified'
+                print(f"  [{entry['num']}] URL accessible ✓")
+            else:
+                result['issues'].append(f"URL check failed: {url_status}")
+
+        # STEP 4: Final fallback - no verification method
+        if not entry['doi'] and not entry['url']:
+            if 'No DOI provided' not in ' '.join(result['issues']):
+                result['issues'].append("No DOI or URL - cannot verify")
+            result['status'] = 'suspicious'
+
+        return result
+
+    def verify_all(self):
+        """Verify all bibliography entries"""
+        print(f"\n{'='*60}")
+        print(f"CITATION VERIFICATION: {self.report_path.name}")
+        print(f"{'='*60}\n")
+
+        entries = self.extract_bibliography()
+
+        if not entries:
+            print("No bibliography entries found\n")
+            return False
+
+        print(f"Found {len(entries)} citations\n")
+
+        results = []
+        for entry in entries:
+            result = self.verify_entry(entry)
+            results.append(result)
+
+            # Rate limiting
+            time.sleep(0.5)
+
+        # Summarize
+        print(f"\n{'='*60}")
+        print(f"VERIFICATION SUMMARY")
+        print(f"{'='*60}\n")
+
+        verified = [r for r in results if r['status'] == 'verified']
+        url_verified = [r for r in results if r['status'] == 'url_verified']
+        suspicious = [r for r in results if r['status'] == 'suspicious']
+        unverified = [r for r in results if r['status'] in ['unverified', 'no_doi',
'unknown']] + + print(f'DOI Verified: {len(verified)}/{len(results)}') + print(f'URL Verified: {len(url_verified)}/{len(results)}') + print(f'Suspicious: {len(suspicious)}/{len(results)}') + print(f'Unverified: {len(unverified)}/{len(results)}') + print() + + if suspicious: + print('SUSPICIOUS CITATIONS (Manual Review Needed):') + for r in suspicious: + print(f"\n [{r['num']}]") + for issue in r['issues']: + print(f" - {issue}") + print() + + if unverified and len(unverified) > 0: + print('UNVERIFIED CITATIONS (Could not check):') + for r in unverified: + print(f" [{r['num']}] {r['issues'][0] if r['issues'] else 'Unknown'}") + print() + + # Decision (Enhanced 2025 - includes URL-verified as acceptable) + total_verified = len(verified) + len(url_verified) + + if suspicious: + print('WARNING: Suspicious citations detected') + if self.strict_mode: + print(' STRICT MODE: Failing due to suspicious citations') + return False + else: + print(' (Continuing in non-strict mode)') + + if self.strict_mode and unverified: + print('STRICT MODE: Unverified citations found') + return False + + if total_verified / len(results) < 0.5: + print('WARNING: Less than 50% citations verified') + return True # Pass with warning + else: + print('CITATION VERIFICATION PASSED') + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Verify citations in research report", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python verify_citations.py --report report.md + +Note: Requires internet connection to check DOIs. +Uses free DOI resolver - no API key needed. + """ + ) + + parser.add_argument( + '--report', '-r', + type=str, + required=True, + help='Path to research report markdown file' + ) + + parser.add_argument( + '--strict', + action='store_true', + help='Strict mode: fail on any unverified or suspicious citations' + ) + + args = parser.parse_args() + report_path = Path(args.report) + + if not report_path.exists(): + print(f"ERROR: Report file not found: {report_path}") + sys.exit(1) + + verifier = CitationVerifier(report_path, strict_mode=args.strict) + passed = verifier.verify_all() + + sys.exit(0 if passed else 1) + + +if __name__ == '__main__': + main() diff --git a/.agents/skills/deep-research/scripts/verify_claim_support.py b/.agents/skills/deep-research/scripts/verify_claim_support.py new file mode 100644 index 000000000..403d3fa7c --- /dev/null +++ b/.agents/skills/deep-research/scripts/verify_claim_support.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +""" +Claim-Support Verification — checks whether evidence supports claims. + +CLI subcommands: + verify Check all claims against evidence, update support_status + report Generate a support verification summary + +Version 1 is deterministic and cheap: entity, number, date, and +lexical-overlap checks over stored evidence. No LLM calls. + +Only factual claims hard-fail on unsupported status. +Synthesis/recommendation need traceability but softer thresholds. 
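+
+Usage (run directory path is illustrative):
+    python verify_claim_support.py verify --dir runs/2026-03-19_topic --strict
+    python verify_claim_support.py report --dir runs/2026-03-19_topic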
+""" + +import argparse +import json +import os +import re +import sys +from collections import Counter +from datetime import datetime, timezone + + +# --------------------------------------------------------------------------- +# JSONL helpers +# --------------------------------------------------------------------------- + +def read_jsonl(path: str) -> list[dict]: + rows = [] + if not os.path.exists(path): + return rows + with open(path) as f: + for line in f: + line = line.strip() + if line: + rows.append(json.loads(line)) + return rows + + +def write_jsonl(path: str, rows: list[dict]) -> None: + with open(path, 'w') as f: + for row in rows: + f.write(json.dumps(row, ensure_ascii=False) + '\n') + + +# --------------------------------------------------------------------------- +# Support verification logic +# --------------------------------------------------------------------------- + +# Extract numbers (integers and decimals) +NUMBER_RE = re.compile(r'\b\d+(?:\.\d+)?(?:%|x|X)?\b') + +# Extract year-like numbers +YEAR_RE = re.compile(r'\b(19|20)\d{2}\b') + +# Extract capitalized entities (naive NER) +ENTITY_RE = re.compile(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b') + +# Common stop entities to ignore +STOP_ENTITIES = frozenset([ + 'The', 'This', 'That', 'These', 'However', 'Furthermore', + 'Moreover', 'Additionally', 'Therefore', 'Nevertheless', +]) + + +def extract_tokens(text: str) -> set[str]: + """Extract significant lowercase tokens (>3 chars).""" + words = re.findall(r'\b[a-z]{4,}\b', text.lower()) + return set(words) + + +def extract_numbers(text: str) -> set[str]: + """Extract numeric values.""" + return set(NUMBER_RE.findall(text)) + + +def extract_years(text: str) -> set[str]: + """Extract year mentions.""" + return set(YEAR_RE.findall(text)) + + +def extract_entities(text: str) -> set[str]: + """Extract capitalized entity mentions.""" + ents = set(ENTITY_RE.findall(text)) + return ents - STOP_ENTITIES + + +def compute_support_score(claim_text: str, evidence_quotes: list[str]) -> tuple[str, float, str]: + """ + Compute support status for a claim given its linked evidence quotes. + + Returns (status, score, notes). + Score range: 0.0 (no overlap) to 1.0 (strong support). 
+ """ + if not evidence_quotes: + return ('unsupported', 0.0, 'no evidence linked') + + claim_tokens = extract_tokens(claim_text) + claim_numbers = extract_numbers(claim_text) + claim_years = extract_years(claim_text) + claim_entities = extract_entities(claim_text) + + best_score = 0.0 + best_notes = [] + + for quote in evidence_quotes: + ev_tokens = extract_tokens(quote) + ev_numbers = extract_numbers(quote) + ev_years = extract_years(quote) + ev_entities = extract_entities(quote) + + # Token overlap (Jaccard-like) + if claim_tokens: + token_overlap = len(claim_tokens & ev_tokens) / len(claim_tokens) + else: + token_overlap = 0.0 + + # Number match + if claim_numbers: + number_match = len(claim_numbers & ev_numbers) / len(claim_numbers) + else: + number_match = 1.0 # No numbers to check + + # Year match + if claim_years: + year_match = len(claim_years & ev_years) / len(claim_years) + else: + year_match = 1.0 + + # Entity match + if claim_entities: + entity_match = len(claim_entities & ev_entities) / len(claim_entities) + else: + entity_match = 1.0 + + # Weighted composite + score = ( + 0.4 * token_overlap + + 0.25 * number_match + + 0.15 * year_match + + 0.2 * entity_match + ) + + if score > best_score: + best_score = score + best_notes = [] + if token_overlap < 0.3: + best_notes.append('low lexical overlap') + if claim_numbers and number_match < 0.5: + best_notes.append('number mismatch') + if claim_years and year_match < 1.0: + best_notes.append('year mismatch') + if claim_entities and entity_match < 0.3: + best_notes.append('entity mismatch') + + # Threshold decision + if best_score >= 0.6: + status = 'supported' + elif best_score >= 0.35: + status = 'partial' + else: + status = 'needs_review' + + notes = '; '.join(best_notes) if best_notes else 'adequate overlap' + return (status, round(best_score, 3), notes) + + +# --------------------------------------------------------------------------- +# Subcommands +# --------------------------------------------------------------------------- + +def cmd_verify(args: argparse.Namespace) -> None: + """Verify all claims against evidence, update claims.jsonl.""" + claims_path = os.path.join(args.dir, 'claims.jsonl') + evidence_path = os.path.join(args.dir, 'evidence.jsonl') + sources_path = os.path.join(args.dir, 'sources.jsonl') + + claims = read_jsonl(claims_path) + evidence = read_jsonl(evidence_path) + sources = read_jsonl(sources_path) + + # Build evidence index by source_id + ev_by_source: dict[str, list[str]] = {} + ev_by_id: dict[str, dict] = {} + for ev in evidence: + sid = ev.get('source_id', '') + eid = ev.get('evidence_id', '') + ev_by_source.setdefault(sid, []).append(ev.get('quote', '')) + ev_by_id[eid] = ev + + # Deduplicate claims + seen = set() + unique_claims = [] + for c in claims: + cid = c.get('claim_id') + if cid not in seen: + seen.add(cid) + unique_claims.append(c) + + verified = 0 + updated_claims = [] + + for claim in unique_claims: + claim_type = claim.get('claim_type', 'factual') + + # Gather evidence for this claim + cited_ids = claim.get('cited_source_ids', []) + evidence_ids = claim.get('evidence_ids', []) + + # Collect evidence quotes from linked evidence_ids + quotes = [] + for eid in evidence_ids: + if eid in ev_by_id: + quotes.append(ev_by_id[eid].get('quote', '')) + + # Also gather from cited sources + for sid in cited_ids: + if sid in ev_by_source: + quotes.extend(ev_by_source[sid]) + + if not quotes and not cited_ids and not evidence_ids: + # No links at all + if claim_type == 'speculation': + 
claim['support_status'] = 'supported' # Speculation doesn't need evidence + else: + claim['support_status'] = 'unsupported' + elif not quotes: + # Has cited sources but no evidence captured yet + claim['support_status'] = 'needs_review' + else: + status, score, notes = compute_support_score(claim['text'], quotes) + claim['support_status'] = status + claim['_support_score'] = score + claim['_support_notes'] = notes + + verified += 1 + updated_claims.append(claim) + + # Rewrite claims.jsonl with updated statuses + write_jsonl(claims_path, updated_claims) + + # Compute summary + status_counts = Counter(c.get('support_status') for c in updated_claims) + factual_unsupported = sum( + 1 for c in updated_claims + if c.get('claim_type') == 'factual' and c.get('support_status') == 'unsupported' + ) + total_factual = sum(1 for c in updated_claims if c.get('claim_type') == 'factual') + + # Strict mode: fail if any factual claim is unsupported + passed = True + if args.strict and factual_unsupported > 0: + passed = False + + print(json.dumps({ + 'status': 'pass' if passed else 'fail', + 'verified': verified, + 'support_status_counts': dict(status_counts), + 'factual_unsupported': factual_unsupported, + 'total_factual': total_factual, + 'unsupported_rate': round(factual_unsupported / max(total_factual, 1), 3), + }, indent=2)) + + if not passed: + sys.exit(1) + + +def cmd_report(args: argparse.Namespace) -> None: + """Generate human-readable support verification report.""" + claims_path = os.path.join(args.dir, 'claims.jsonl') + claims = read_jsonl(claims_path) + + # Deduplicate + seen = set() + unique = [] + for c in claims: + cid = c.get('claim_id') + if cid not in seen: + seen.add(cid) + unique.append(c) + + lines = ['# Claim Support Verification Report', ''] + + # Summary + status_counts = Counter(c.get('support_status') for c in unique) + type_counts = Counter(c.get('claim_type') for c in unique) + lines.append(f'**Total claims:** {len(unique)}') + lines.append(f'**By type:** {dict(type_counts)}') + lines.append(f'**By status:** {dict(status_counts)}') + lines.append('') + + # Unsupported factual claims (the failures) + unsupported_factual = [ + c for c in unique + if c.get('claim_type') == 'factual' and c.get('support_status') in ('unsupported', 'needs_review') + ] + if unsupported_factual: + lines.append('## Unsupported/Review-needed Factual Claims') + lines.append('') + for c in unsupported_factual: + lines.append(f'- [{c["support_status"]}] `{c["section_id"]}`: {c["text"][:100]}...') + if c.get('_support_notes'): + lines.append(f' Notes: {c["_support_notes"]}') + lines.append('') + + # All clear + if not unsupported_factual: + lines.append('## All factual claims have adequate support.') + lines.append('') + + print('\n'.join(lines)) + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser( + prog='verify_claim_support', + description='Claim-support verification for deep-research v3.0', + ) + sub = parser.add_subparsers(dest='command', required=True) + + # verify + p_ver = sub.add_parser('verify', help='Verify claims against evidence') + p_ver.add_argument('--dir', required=True, help='Run directory') + p_ver.add_argument('--strict', action='store_true', help='Exit 1 if any factual claim unsupported') + + # report + p_rep = sub.add_parser('report', help='Generate verification report') + p_rep.add_argument('--dir', 
+                     required=True, help='Run directory')
+
+    args = parser.parse_args()
+    dispatch = {
+        'verify': cmd_verify,
+        'report': cmd_report,
+    }
+    dispatch[args.command](args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/.agents/skills/deep-research/scripts/verify_html.py b/.agents/skills/deep-research/scripts/verify_html.py
new file mode 100755
index 000000000..5a6c46ad3
--- /dev/null
+++ b/.agents/skills/deep-research/scripts/verify_html.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""
+HTML Report Verification Script
+Validates that HTML reports are properly generated with all sections from MD
+"""
+
+import argparse
+import re
+from pathlib import Path
+from typing import List, Tuple
+
+
+class HTMLVerifier:
+    """Verify HTML research reports"""
+
+    def __init__(self, html_path: Path, md_path: Path):
+        self.html_path = html_path
+        self.md_path = md_path
+        self.errors = []
+        self.warnings = []
+
+    def verify(self) -> bool:
+        """
+        Run all verification checks
+
+        Returns:
+            True if all checks pass, False otherwise
+        """
+        print(f"\n{'='*60}")
+        print(f"HTML REPORT VERIFICATION")
+        print(f"{'='*60}\n")
+
+        print(f"HTML File: {self.html_path}")
+        print(f"MD File: {self.md_path}\n")
+
+        # Read files
+        try:
+            html_content = self.html_path.read_text()
+            md_content = self.md_path.read_text()
+        except Exception as e:
+            self.errors.append(f"Failed to read files: {e}")
+            return False
+
+        # Run checks
+        self._check_sections(html_content, md_content)
+        self._check_no_placeholders(html_content)
+        self._check_no_emojis(html_content)
+        self._check_structure(html_content)
+        self._check_citations(html_content, md_content)
+        self._check_bibliography(html_content, md_content)
+
+        # Report results
+        self._print_results()
+
+        return len(self.errors) == 0
+
+    def _check_sections(self, html: str, md: str):
+        """Verify all markdown sections are present in HTML"""
+        # Extract section headings from markdown
+        md_sections = re.findall(r'^## (.+)$', md, re.MULTILINE)
+
+        # Extract section titles from HTML (pattern reconstructed; assumes the
+        # converter wraps headings in <div class="section-title"> elements)
+        html_sections = re.findall(r'<div class="section-title">(.+?)</div>', html)
+
+        # Check if we have placeholder sections like <div class="section-title">#</div>
+        placeholder_sections = re.findall(r'<div class="section-title">\s*#\s*</div>', html)
+
+        if placeholder_sections:
+            self.errors.append(
+                f"Found {len(placeholder_sections)} placeholder sections (empty '#' divs) - content not converted properly"
+            )
+
+        # Compare section counts
+        if len(md_sections) > len(html_sections) + 1:  # +1 for bibliography which is separate
+            self.errors.append(
+                f"Section count mismatch: MD has {len(md_sections)} sections, HTML has only {len(html_sections)} + bibliography"
+            )
+            missing = set(md_sections) - set(html_sections)
+            if missing:
+                self.errors.append(f"Missing sections in HTML: {missing}")
+
+        # Verify Executive Summary is present
+        if "Executive Summary" in md and "Executive Summary" not in html:
+            self.errors.append("Executive Summary missing from HTML")
+
+    def _check_no_placeholders(self, html: str):
+        """Check for common placeholders that shouldn't be in final report"""
+        placeholders = [
+            '{{TITLE}}', '{{DATE}}', '{{CONTENT}}', '{{BIBLIOGRAPHY}}',
+            '{{METRICS_DASHBOARD}}', '{{SOURCE_COUNT}}', 'TODO', 'TBD',
+            'PLACEHOLDER', 'FIXME'
+        ]
+
+        found = []
+        for placeholder in placeholders:
+            if placeholder in html:
+                found.append(placeholder)
+
+        if found:
+            self.errors.append(f"Found unreplaced placeholders: {', '.join(found)}")
+
+    def _check_no_emojis(self, html: str):
+        """Verify no emojis are present in HTML"""
+        # Common emoji patterns
+        emoji_pattern = re.compile(
+            "["
+            "\U0001F600-\U0001F64F"  # emoticons
+            "\U0001F300-\U0001F5FF"  # symbols & pictographs
+            "\U0001F680-\U0001F6FF"  # transport & map symbols
+            "\U0001F1E0-\U0001F1FF"  # flags
+            "\U00002702-\U000027B0"
+            "\U000024C2-\U0001F251"
+            "]+",
+            flags=re.UNICODE
+        )
+
+        emojis = emoji_pattern.findall(html)
+        if emojis:
+            unique_emojis = set(emojis)
+            self.errors.append(f"Found {len(emojis)} emojis in HTML (should be none): {unique_emojis}")
+
+    def _check_structure(self, html: str):
+        """Verify HTML has proper structure"""
+        required_elements = [
+            ('<title>', 'title tag'),
+            ('class="header"', 'header section'),
+            ('class="content"', 'content section'),
+            ('class="bibliography"', 'bibliography section'),
+        ]
+
+        for element, name in required_elements:
+            if element not in html:
+                self.errors.append(f"Missing {name} in HTML")
+
+        # Check for unclosed tags (basic check)
+        open_divs = html.count('<div')
+        close_divs = html.count('</div>')
+
+        if abs(open_divs - close_divs) > 2:  # Allow small discrepancy
+            self.warnings.append(
+                f"Possible unclosed divs: {open_divs} opening tags, {close_divs} closing tags"
+            )
+
+    def _check_citations(self, html: str, md: str):
+        """Verify citations are present"""
+        # Extract citations from markdown
+        md_citations = set(re.findall(r'\[(\d+)\]', md))
+
+        # Extract citations from HTML (excluding bibliography)
+        html_content = html.split('class="bibliography"')[0] if 'class="bibliography"' in html else html
+        html_citations = set(re.findall(r'\[(\d+)\]', html_content))
+
+        if len(md_citations) > 0 and len(html_citations) == 0:
+            self.errors.append("No citations found in HTML content (but present in MD)")
+
+        if len(md_citations) > len(html_citations) * 1.5:  # Allow some variation
+            self.warnings.append(
+                f"Fewer citations in HTML ({len(html_citations)}) than MD ({len(md_citations)})"
+            )
+
+    def _check_bibliography(self, html: str, md: str):
+        """Verify bibliography is present and formatted"""
+        if '## Bibliography' in md:
+            if 'class="bibliography"' not in html:
+                self.errors.append("Bibliography section missing from HTML")
+            elif 'class="bib-entry"' not in html:
+                self.warnings.append("Bibliography present but entries not properly formatted")
+
+    def _print_results(self):
"""Print verification results""" + print(f"\n{'-'*60}") + print("VERIFICATION RESULTS") + print(f"{'-'*60}\n") + + if self.errors: + print(f"❌ ERRORS ({len(self.errors)}):") + for i, error in enumerate(self.errors, 1): + print(f" {i}. {error}") + print() + + if self.warnings: + print(f"⚠️ WARNINGS ({len(self.warnings)}):") + for i, warning in enumerate(self.warnings, 1): + print(f" {i}. {warning}") + print() + + if not self.errors and not self.warnings: + print("✅ All checks passed! HTML report is valid.") + print() + + print(f"{'-'*60}\n") + + +def main(): + """Main entry point""" + parser = argparse.ArgumentParser(description='Verify HTML research report') + parser.add_argument('--html', type=Path, required=True, help='Path to HTML report') + parser.add_argument('--md', type=Path, required=True, help='Path to markdown report') + + args = parser.parse_args() + + if not args.html.exists(): + print(f"Error: HTML file not found: {args.html}") + return 1 + + if not args.md.exists(): + print(f"Error: Markdown file not found: {args.md}") + return 1 + + verifier = HTMLVerifier(args.html, args.md) + success = verifier.verify() + + return 0 if success else 1 + + +if __name__ == "__main__": + exit(main()) diff --git a/.agents/skills/deep-research/templates/mckinsey_report_template.html b/.agents/skills/deep-research/templates/mckinsey_report_template.html new file mode 100644 index 000000000..7f578e079 --- /dev/null +++ b/.agents/skills/deep-research/templates/mckinsey_report_template.html @@ -0,0 +1,443 @@ + + + + + + {{TITLE}} - Deep Research Report + + + +
+
+

{{TITLE}}

+
+ {{DATE}} + + {{SOURCE_COUNT}} Sources +
+
+ + {{METRICS_DASHBOARD}} + +
+ {{CONTENT}} + +
+
Bibliography
+ {{BIBLIOGRAPHY}} +
+ +
+
+ + diff --git a/.agents/skills/deep-research/templates/report_template.md b/.agents/skills/deep-research/templates/report_template.md new file mode 100644 index 000000000..9ef97710b --- /dev/null +++ b/.agents/skills/deep-research/templates/report_template.md @@ -0,0 +1,414 @@ +# Research Report: [Topic] + + + + + + + + + + + + + + + + + + + + + + + + + + + + +## Executive Summary + +[Write 3-5 bullet points, 200-400 words total] +- **Key Finding 1:** [Major discovery with specific data/metrics] +- **Key Finding 2:** [Important insight with evidence] +- **Key Finding 3:** [Critical conclusion with implications] +- [Additional findings as needed] + +**Primary Recommendation:** [One clear sentence stating the main recommendation] + +**Confidence Level:** [High/Medium/Low with brief justification] + +--- + +## Introduction + +### Research Question +[State the original question clearly and completely] + +[Add 1-2 sentences providing context for why this question matters] + +### Scope & Methodology +[2-3 paragraphs explaining:] +- What specific aspects were investigated +- What was included vs excluded from scope +- What research methods were used (web search, academic sources, industry reports, etc.) +- How many sources were consulted +- Time period covered + +### Key Assumptions +[List 3-5 important assumptions made during research] +- Assumption 1: [Description and why it matters] +- Assumption 2: [Description and why it matters] +- [Continue...] + +--- + +## Main Analysis + + + + + + + + +### Finding 1: [Descriptive Title That Captures the Key Point] + +[Opening paragraph: State the finding clearly and why it matters] + +[Body paragraphs: +- Present detailed evidence +- Include specific data, statistics, dates, numbers +- Explain mechanisms, causes, or relationships +- Discuss implications +- Address nuances or exceptions +] + +**Key Evidence:** +- Data point 1 from Source A [1] +- Data point 2 from Source B [2] +- Conflicting view from Source C [3] and how it was resolved + +**Implications:** +[1-2 paragraphs on what this finding means for the user's decision/understanding] + +**Sources:** [1], [2], [3], [4] + +--- + +### Finding 2: [Descriptive Title] + +[Follow same detailed structure as Finding 1] +[Minimum 300 words per finding] +[Include multiple paragraphs with evidence] + +**Sources:** [5], [6], [7], [8] + +--- + +### Finding 3: [Descriptive Title] + +[Continue with same detail level] + +**Sources:** [9], [10], [11] + +--- + +### Finding 4: [Descriptive Title] + +[And so on... Include 4-8 major findings minimum] + +**Sources:** [12], [13], [14] + +--- + +[Continue with additional findings as needed] + +--- + +## Synthesis & Insights + + + + +### Patterns Identified + +[2-3 paragraphs identifying key patterns across findings] + +**Pattern 1: [Name]** +[Explain the pattern in detail, cite which findings support it] + +**Pattern 2: [Name]** +[Continue...] + +### Novel Insights + +[2-3 paragraphs of insights that go BEYOND what sources explicitly stated] + +**Insight 1: [Name]** +[What you discovered by connecting information across sources] +[Why this matters even though no single source said it explicitly] + +**Insight 2: [Name]** +[Continue...] 
+ +### Implications + +[2-3 paragraphs on what all this means] + +**For [User Context]:** +[Specific implications for the user's situation/decision] + +**Broader Implications:** +[Wider significance of these findings] + +**Second-Order Effects:** +[What might happen as consequences of these findings] + +--- + +## Limitations & Caveats + + + +### Counterevidence Register + + + +[2-3 paragraphs explaining contradictory evidence found during research] + +**Contradictory Finding 1:** [Description] +- Source: [Citation] +- Why it contradicts: [Explanation] +- How resolved/interpreted: [Your analysis] +- Impact on conclusions: [Minimal/Moderate/Significant] + +**Contradictory Finding 2:** [Continue...] + +### Known Gaps + +[2-3 paragraphs explaining:] +- What information was not available +- What questions remain unanswered +- What would strengthen this research + +**Gap 1:** [Description] +- Why it's missing +- How it affects conclusions +- How to address it in future research + +**Gap 2:** [Continue...] + +### Assumptions + +[Revisit key assumptions from intro, now with more detail on their validity] + +**Assumption 1:** [Restate] +- Evidence supporting it: [...] +- Evidence challenging it: [...] +- Overall validity: [...] + +### Areas of Uncertainty + +[2-3 paragraphs on:] +- Where sources disagree +- Where evidence is thin +- Where extrapolation was necessary +- What could change conclusions + +**Uncertainty 1:** [Topic] +[Detailed explanation of what's uncertain and why] + +**Uncertainty 2:** [Continue...] + +--- + +## Recommendations + + + +### Immediate Actions + +[3-5 specific actions the user should take NOW] + +1. **[Action Title]** + - What: [Specific action] + - Why: [Rationale based on findings] + - How: [Implementation steps] + - Timeline: [When to do this] + +2. **[Continue with similar detail...]** + +### Next Steps + +[3-5 actions for the near-term future (1-3 months)] + +1. **[Step Title]** + - [Similar detailed structure] + +### Further Research Needs + +[3-5 areas where additional research would be valuable] + +1. **[Research Topic]** + - What to investigate: [Specific question] + - Why it matters: [Connection to current findings] + - Suggested approach: [How to research it] + +--- + +## Bibliography + + + + + + + + + + +[1] Author Name or Organization ([YEAR]). "Full Title of Article or Paper". Publication Name or Website. https://full-url.com (Retrieved: [CURRENT_DATE]) + +[2] Second Author ([YEAR]). "Second Article Title". Journal Name, Volume(Issue), pages. 
https://doi-or-url.com (Retrieved: [CURRENT_DATE]) + + + + + +--- + +## Appendix: Methodology + +### Research Process + +[2-3 paragraphs describing the research process in detail] + +**Phase Execution:** +- Phase 1 (SCOPE): [What was done] +- Phase 2 (PLAN): [What was done] +- Phase 3 (RETRIEVE): [What was done] +- [Continue for all phases executed] + +### Sources Consulted + +**Total Sources:** [Number] + +**Source Types:** +- Academic journals: [Number] +- Industry reports: [Number] +- News articles: [Number] +- Government/regulatory: [Number] +- Documentation: [Number] +- [Other categories] + +**Geographic Coverage:** +[If relevant, note geographic distribution of sources] + +**Temporal Coverage:** +[Date range of sources, recency distribution] + +### Verification Approach + +[2-3 paragraphs explaining:] + +**Triangulation:** +- How claims were verified across multiple sources +- Minimum sources required per major claim: 3 +- How contradictions were handled + +**Credibility Assessment:** +- How source quality was evaluated +- Scoring system used (0-100) +- Average credibility score: [Number]/100 +- Distribution: [High/medium/low source counts] + +**Quality Control:** +- Validation checks performed +- Issues found and corrected +- Final quality metrics + +### Claims-Evidence Table + + + +| Claim ID | Major Claim | Evidence Type | Supporting Sources | Confidence | +|----------|-------------|---------------|-------------------|------------| +| C1 | [First major claim from findings] | [Primary data / Meta-analysis / Expert opinion] | [1], [2], [3] | High / Medium / Low | +| C2 | [Second major claim] | [Evidence type] | [4], [5], [6] | High / Medium / Low | +| C3 | [Third major claim] | [Evidence type] | [7], [8] | High / Medium / Low | +| ... | [Continue for all major claims] | ... | ... | ... | + +**Confidence Levels:** +- **High**: 3+ independent sources, consistent findings, strong methodology +- **Medium**: 2 sources OR single high-quality source with minor contradictions +- **Low**: Single source OR significant contradictions in evidence + +--- + +## Report Metadata + +**Research Mode:** [Quick/Standard/Deep/UltraDeep] +**Total Sources:** [Number] +**Word Count:** [Approximate count] +**Research Duration:** [Time taken] +**Generated:** [Date and time] +**Validation Status:** [Passed with X warnings / Passed without warnings] + +--- + + + + + diff --git a/.agents/skills/deep-research/tests/fixtures/invalid_report.md b/.agents/skills/deep-research/tests/fixtures/invalid_report.md new file mode 100644 index 000000000..3a80d809a --- /dev/null +++ b/.agents/skills/deep-research/tests/fixtures/invalid_report.md @@ -0,0 +1,27 @@ +# Research Report: Bad Report + +## Executive Summary + +This is too short. + +**Primary Recommendation:** TBD + +**Confidence Level:** High + +--- + +## Introduction + +Missing methodology section. + +--- + +## Main Analysis + +No citations here [99]. + +--- + +## Limitations & Caveats + +Some limitations TODO. diff --git a/.agents/skills/deep-research/tests/fixtures/valid_report.md b/.agents/skills/deep-research/tests/fixtures/valid_report.md new file mode 100644 index 000000000..07cfb1174 --- /dev/null +++ b/.agents/skills/deep-research/tests/fixtures/valid_report.md @@ -0,0 +1,114 @@ +# Research Report: Test Topic + +## Executive Summary + +This is a test report with exactly the right length for validation. It contains multiple findings backed by citations. The report covers comprehensive research on the test topic. Overall confidence level is high. 
+ +**Primary Recommendation:** Proceed with implementation + +**Confidence Level:** High + +--- + +## Introduction + +### Research Question +What is the current state of test research? + +### Scope & Methodology +This research covered academic sources, industry publications, and recent developments in the field using a systematic 8-phase approach. + +### Key Assumptions +We assume test data is representative of real-world conditions. + +--- + +## Main Analysis + +### Finding 1: Current State + +The field has seen significant advancement in recent years [1], [2]. Multiple studies confirm this trend [3]. + +**Sources:** [1], [2], [3] + +### Finding 2: Key Challenges + +Several challenges remain, including scalability [4] and adoption barriers [5], [6]. + +**Sources:** [4], [5], [6] + +### Finding 3: Future Outlook + +The outlook is positive with emerging solutions [7], [8], [9], [10]. + +**Sources:** [7], [8], [9], [10] + +--- + +## Synthesis & Insights + +### Patterns Identified +Clear trend toward increased adoption and sophistication in implementations. + +### Novel Insights +The combination of recent developments suggests accelerated progress in the next 2-3 years. + +### Implications +Organizations should prepare for rapid change and invest in capability building. + +--- + +## Limitations & Caveats + +### Known Gaps +Limited data available for certain niche applications. + +### Assumptions +Assumes current trajectory continues without major disruptions. + +### Areas of Uncertainty +Long-term impact remains to be fully understood. + +--- + +## Recommendations + +### Immediate Actions +Begin pilot implementation to gain early experience. + +### Next Steps +Monitor developments and adjust strategy quarterly. + +### Further Research +Deep dive into specific implementation case studies. + +--- + +## Bibliography + +[1] Smith, J. (2025). "Test Research Advances". Journal of Testing. https://example.com/paper1 +[2] Johnson, K. (2025). "Current State Analysis". Research Quarterly. https://example.com/paper2 +[3] Williams, M. (2024). "Comprehensive Review". Academic Press. https://example.com/paper3 +[4] Brown, A. (2025). "Scalability Challenges". Tech Review. https://example.com/paper4 +[5] Davis, R. (2024). "Adoption Barriers". Industry Report. https://example.com/paper5 +[6] Miller, S. (2025). "Implementation Issues". Trade Journal. https://example.com/paper6 +[7] Wilson, T. (2025). "Future Trends". Forecasting Quarterly. https://example.com/paper7 +[8] Moore, L. (2025). "Emerging Solutions". Innovation Today. https://example.com/paper8 +[9] Taylor, P. (2024). "Next Generation Approaches". Tech Horizons. https://example.com/paper9 +[10] Anderson, C. (2025). "Market Outlook". Strategy Brief. https://example.com/paper10 + +--- + +## Appendix: Methodology + +### Research Process +Conducted 8-phase research pipeline with systematic source evaluation and triangulation. + +### Sources Consulted +10 peer-reviewed sources spanning 2024-2025. + +### Verification Approach +All major claims verified across minimum 3 independent sources. + +### Quality Control +Automated validation plus manual review for accuracy and completeness. 
diff --git a/.agents/skills/deep-research/tests/test_citation_manager.py b/.agents/skills/deep-research/tests/test_citation_manager.py new file mode 100644 index 000000000..a98ebdcae --- /dev/null +++ b/.agents/skills/deep-research/tests/test_citation_manager.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +"""Smoke tests for citation_manager.py CLI.""" + +import json +import os +import subprocess +import sys +import tempfile +import unittest + +SCRIPT = os.path.join(os.path.dirname(__file__), '..', 'scripts', 'citation_manager.py') + + +def run_cm(*args: str) -> dict: + """Run citation_manager.py with args, return parsed JSON from stdout.""" + result = subprocess.run( + [sys.executable, SCRIPT, *args], + capture_output=True, text=True, + ) + if result.returncode != 0: + raise RuntimeError(f'Exit {result.returncode}: {result.stderr}') + return json.loads(result.stdout) if result.stdout.strip().startswith(('{', '[')) else result.stdout + + +class TestInitRun(unittest.TestCase): + def test_creates_manifest_and_artifacts(self): + with tempfile.TemporaryDirectory() as d: + out = run_cm('init-run', '--out-dir', d, '--query', 'test question', '--mode', 'deep') + self.assertEqual(out['status'], 'ok') + + # Manifest exists and has correct fields + manifest = json.load(open(os.path.join(d, 'run_manifest.json'))) + self.assertEqual(manifest['version'], '3.0.0') + self.assertEqual(manifest['query'], 'test question') + self.assertEqual(manifest['mode'], 'deep') + self.assertIsNotNone(manifest['started_at']) + self.assertIsNone(manifest['finished_at']) + self.assertEqual(manifest['artifact_paths']['sources'], 'sources.jsonl') + + # Empty JSONL files exist + for name in ('sources.jsonl', 'evidence.jsonl', 'claims.jsonl'): + path = os.path.join(d, name) + self.assertTrue(os.path.exists(path), f'{name} missing') + self.assertEqual(os.path.getsize(path), 0) + + +class TestRegisterSource(unittest.TestCase): + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + run_cm('init-run', '--out-dir', self.tmpdir, '--query', 'test') + + def tearDown(self): + import shutil + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_register_and_dedup(self): + src = json.dumps({ + 'raw_url': 'https://arxiv.org/abs/2305.14251', + 'title': 'FActScore', + 'source_type': 'academic', + 'year': '2023', + }) + out1 = run_cm('register-source', '--json', src, '--dir', self.tmpdir) + self.assertEqual(out1['status'], 'registered') + self.assertEqual(len(out1['source_id']), 16) + self.assertTrue(out1['canonical_locator'].startswith('arxiv:')) + + # Same URL -> duplicate + out2 = run_cm('register-source', '--json', src, '--dir', self.tmpdir) + self.assertEqual(out2['status'], 'duplicate') + self.assertEqual(out2['source_id'], out1['source_id']) + + def test_doi_canonicalization(self): + src = json.dumps({ + 'raw_url': 'https://doi.org/10.1038/s41586-023-06745-9', + 'title': 'Some Nature paper', + }) + out = run_cm('register-source', '--json', src, '--dir', self.tmpdir) + self.assertTrue(out['canonical_locator'].startswith('doi:10.1038/')) + + def test_url_normalization(self): + src1 = json.dumps({ + 'raw_url': 'https://Example.Com/article?utm_source=google&id=42', + 'title': 'Test', + }) + src2 = json.dumps({ + 'raw_url': 'https://example.com/article?id=42&utm_medium=email', + 'title': 'Test duplicate', + }) + out1 = run_cm('register-source', '--json', src1, '--dir', self.tmpdir) + out2 = run_cm('register-source', '--json', src2, '--dir', self.tmpdir) + # Both should resolve to same canonical locator -> same source_id + 
self.assertEqual(out1['source_id'], out2['source_id']) + self.assertEqual(out2['status'], 'duplicate') + + +class TestAssignDisplayNumbers(unittest.TestCase): + def test_assigns_in_order(self): + with tempfile.TemporaryDirectory() as d: + run_cm('init-run', '--out-dir', d, '--query', 'test') + + for i, url in enumerate(['https://a.com/1', 'https://b.com/2', 'https://c.com/3']): + run_cm('register-source', '--json', json.dumps({ + 'raw_url': url, 'title': f'Source {i+1}', + }), '--dir', d) + + mapping = run_cm('assign-display-numbers', '--dir', d) + self.assertEqual(len(mapping), 3) + # Values should be 1, 2, 3 + self.assertEqual(sorted(mapping.values()), [1, 2, 3]) + + +class TestExportBibliography(unittest.TestCase): + def test_markdown_export(self): + with tempfile.TemporaryDirectory() as d: + run_cm('init-run', '--out-dir', d, '--query', 'test') + run_cm('register-source', '--json', json.dumps({ + 'raw_url': 'https://arxiv.org/abs/2305.14251', + 'title': 'FActScore', + 'authors': ['Min, S.', 'Krishna, K.'], + 'year': '2023', + 'source_type': 'academic', + }), '--dir', d) + + out = run_cm('export-bibliography', '--dir', d, '--style', 'markdown') + self.assertIn('[1]', out) + self.assertIn('FActScore', out) + self.assertIn('Min, S. & Krishna, K.', out) + + def test_json_export(self): + with tempfile.TemporaryDirectory() as d: + run_cm('init-run', '--out-dir', d, '--query', 'test') + run_cm('register-source', '--json', json.dumps({ + 'raw_url': 'https://example.com/paper', + 'title': 'Test Paper', + }), '--dir', d) + + out = run_cm('export-bibliography', '--dir', d, '--style', 'json') + self.assertEqual(len(out), 1) + self.assertEqual(out[0]['display_number'], 1) + self.assertEqual(out[0]['title'], 'Test Paper') + + +class TestCanonicalization(unittest.TestCase): + """Unit tests for canonicalize_locator without running the CLI.""" + + @classmethod + def setUpClass(cls): + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'scripts')) + from citation_manager import canonicalize_locator, compute_source_id + cls.canonicalize = staticmethod(canonicalize_locator) + cls.compute_id = staticmethod(compute_source_id) + + def test_doi_from_url(self): + canonicalize_locator = self.canonicalize + self.assertEqual( + canonicalize_locator('https://doi.org/10.1038/s41586-023-06745-9'), + 'doi:10.1038/s41586-023-06745-9', + ) + self.assertEqual( + canonicalize_locator('https://dx.doi.org/10.1234/test.'), + 'doi:10.1234/test', + ) + + def test_arxiv_from_url(self): + canonicalize_locator = self.canonicalize + self.assertEqual( + canonicalize_locator('https://arxiv.org/abs/2305.14251v2'), + 'arxiv:2305.14251v2', + ) + self.assertEqual( + canonicalize_locator('arxiv:2401.15884'), + 'arxiv:2401.15884', + ) + + def test_url_strips_tracking(self): + canonicalize_locator = self.canonicalize + result = canonicalize_locator('https://Example.Com/page?utm_source=x&key=val') + self.assertNotIn('utm_source', result) + self.assertIn('key=val', result) + self.assertTrue(result.startswith('https://example.com')) + + def test_url_strips_fragment(self): + canonicalize_locator = self.canonicalize + result = canonicalize_locator('https://example.com/page#section') + self.assertNotIn('#section', result) + + def test_url_strips_trailing_slash(self): + canonicalize_locator = self.canonicalize + result = canonicalize_locator('https://example.com/page/') + self.assertFalse(result.endswith('/')) + + +if __name__ == '__main__': + unittest.main() diff --git a/.agents/skills/deep-research/tests/test_evidence_store.py 
b/.agents/skills/deep-research/tests/test_evidence_store.py new file mode 100644 index 000000000..acee64344 --- /dev/null +++ b/.agents/skills/deep-research/tests/test_evidence_store.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +"""Smoke tests for evidence_store.py CLI.""" + +import json +import os +import shutil +import subprocess +import sys +import tempfile +import unittest + +SCRIPT = os.path.join(os.path.dirname(__file__), '..', 'scripts', 'evidence_store.py') + + +def run_es(*args: str) -> dict | list: + """Run evidence_store.py with args, return parsed JSON from stdout.""" + result = subprocess.run( + [sys.executable, SCRIPT, *args], + capture_output=True, text=True, + ) + if result.returncode != 0: + raise RuntimeError(f'Exit {result.returncode}: {result.stderr}') + return json.loads(result.stdout) + + +class TestInit(unittest.TestCase): + def test_creates_empty_file(self): + with tempfile.TemporaryDirectory() as d: + out = run_es('init', '--dir', d) + self.assertEqual(out['status'], 'ok') + path = os.path.join(d, 'evidence.jsonl') + self.assertTrue(os.path.exists(path)) + self.assertEqual(os.path.getsize(path), 0) + + +class TestAddEvidence(unittest.TestCase): + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + run_es('init', '--dir', self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_add_and_dedup(self): + ev = json.dumps({ + 'source_id': 'abcdef0123456789', + 'quote': 'FActScore decomposes generation into atomic facts.', + 'evidence_type': 'direct_quote', + 'locator': 'page 3', + 'retrieval_query': 'factuality evaluation methods', + }) + out1 = run_es('add', '--json', ev, '--dir', self.tmpdir) + self.assertEqual(out1['status'], 'added') + self.assertEqual(len(out1['evidence_id']), 16) + + # Same quote -> duplicate + out2 = run_es('add', '--json', ev, '--dir', self.tmpdir) + self.assertEqual(out2['status'], 'duplicate') + self.assertEqual(out2['evidence_id'], out1['evidence_id']) + + def test_whitespace_normalization(self): + ev1 = json.dumps({ + 'source_id': 'abcdef0123456789', + 'quote': ' FActScore decomposes generation into atomic facts. 
', + 'evidence_type': 'direct_quote', + }) + ev2 = json.dumps({ + 'source_id': 'abcdef0123456789', + 'quote': 'FActScore decomposes generation into atomic facts.', + 'evidence_type': 'direct_quote', + }) + out1 = run_es('add', '--json', ev1, '--dir', self.tmpdir) + out2 = run_es('add', '--json', ev2, '--dir', self.tmpdir) + # Should be same ID due to normalization + self.assertEqual(out1['evidence_id'], out2['evidence_id']) + self.assertEqual(out2['status'], 'duplicate') + + def test_different_sources_different_ids(self): + ev1 = json.dumps({ + 'source_id': 'aaaaaaaaaaaaaaaa', + 'quote': 'Same quote text.', + 'evidence_type': 'paraphrase', + }) + ev2 = json.dumps({ + 'source_id': 'bbbbbbbbbbbbbbbb', + 'quote': 'Same quote text.', + 'evidence_type': 'paraphrase', + }) + out1 = run_es('add', '--json', ev1, '--dir', self.tmpdir) + out2 = run_es('add', '--json', ev2, '--dir', self.tmpdir) + self.assertNotEqual(out1['evidence_id'], out2['evidence_id']) + self.assertEqual(out2['status'], 'added') + + +class TestListAndExport(unittest.TestCase): + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + run_es('init', '--dir', self.tmpdir) + # Add 3 evidence items from 2 sources + for src, quote in [ + ('src_aaa', 'First quote from source A.'), + ('src_aaa', 'Second quote from source A.'), + ('src_bbb', 'Quote from source B.'), + ]: + run_es('add', '--json', json.dumps({ + 'source_id': src, + 'quote': quote, + 'evidence_type': 'direct_quote', + }), '--dir', self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_list_all(self): + out = run_es('list', '--dir', self.tmpdir) + self.assertEqual(out['count'], 3) + + def test_list_filtered(self): + out = run_es('list', '--dir', self.tmpdir, '--source-id', 'src_aaa') + self.assertEqual(out['count'], 2) + + out = run_es('list', '--dir', self.tmpdir, '--source-id', 'src_bbb') + self.assertEqual(out['count'], 1) + + def test_export(self): + out = run_es('export', '--dir', self.tmpdir) + self.assertIsInstance(out, list) + self.assertEqual(len(out), 3) + # Each has required fields + for row in out: + self.assertIn('evidence_id', row) + self.assertIn('source_id', row) + self.assertIn('quote', row) + self.assertIn('evidence_type', row) + self.assertIn('captured_at', row) + + +class TestEvidenceID(unittest.TestCase): + """Unit tests for compute_evidence_id.""" + + @classmethod + def setUpClass(cls): + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'scripts')) + from evidence_store import compute_evidence_id, normalize_quote + cls.compute_id = staticmethod(compute_evidence_id) + cls.normalize = staticmethod(normalize_quote) + + def test_deterministic(self): + id1 = self.compute_id('src_a', 'test quote', 'page 1') + id2 = self.compute_id('src_a', 'test quote', 'page 1') + self.assertEqual(id1, id2) + + def test_locator_matters(self): + id1 = self.compute_id('src_a', 'test quote', 'page 1') + id2 = self.compute_id('src_a', 'test quote', 'page 2') + self.assertNotEqual(id1, id2) + + def test_normalize_whitespace(self): + self.assertEqual( + self.normalize(' hello world '), + 'hello world', + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/.agents/skills/deep-research/tests/test_extract_claims.py b/.agents/skills/deep-research/tests/test_extract_claims.py new file mode 100644 index 000000000..f1376ee54 --- /dev/null +++ b/.agents/skills/deep-research/tests/test_extract_claims.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python3 +"""Tests for extract_claims.py CLI.""" + +import json +import os +import 
shutil +import subprocess +import sys +import tempfile +import unittest + +SCRIPT = os.path.join(os.path.dirname(__file__), '..', 'scripts', 'extract_claims.py') +FIXTURES = os.path.join(os.path.dirname(__file__), 'fixtures') + + +def run_ec(*args: str) -> dict | list: + """Run extract_claims.py with args.""" + result = subprocess.run( + [sys.executable, SCRIPT, *args], + capture_output=True, text=True, + ) + if result.returncode != 0: + raise RuntimeError(f'Exit {result.returncode}: {result.stderr}') + return json.loads(result.stdout) + + +SAMPLE_REPORT = """\ +--- +title: Test Research Report +--- + +## Executive Summary + +This report examines the impact of quantum computing on cryptography [1, 2]. The field has advanced significantly since 2020, with major breakthroughs in error correction. + +## Introduction + +Quantum computing represents a paradigm shift in computational capability. Researchers at Google demonstrated quantum supremacy in 2019 using a 53-qubit processor [3]. This milestone confirmed theoretical predictions made decades earlier. + +## Finding 1 + +The Shor algorithm can factor large numbers exponentially faster than classical methods [4]. Current RSA-2048 encryption could be broken by a sufficiently large quantum computer. However, such machines are estimated to require millions of physical qubits [5, 6]. + +## Finding 2 + +Post-quantum cryptography standards should be adopted within the next 5 years. Organizations should consider hybrid classical-quantum approaches during the transition period. NIST has already standardized several lattice-based algorithms [7]. + +## Synthesis + +Taken together, the evidence suggests that quantum computing poses a real but manageable threat to current cryptographic systems. The timeline for practical quantum attacks remains uncertain, but proactive migration reduces risk substantially. + +## Recommendations + +Organizations should begin evaluating post-quantum cryptography solutions immediately. Security teams should conduct a cryptographic inventory to identify vulnerable systems. Companies should consider implementing crypto-agility frameworks to enable rapid algorithm switching. + +## Bibliography + +[1] Smith et al. (2023). Quantum Computing Advances. +[2] Johnson (2024). Cryptographic Implications. 
+""" + + +class TestExtract(unittest.TestCase): + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + # Create empty claims.jsonl + open(os.path.join(self.tmpdir, 'claims.jsonl'), 'w').close() + # Write sample report + self.report_path = os.path.join(self.tmpdir, 'report.md') + with open(self.report_path, 'w') as f: + f.write(SAMPLE_REPORT) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_extract_finds_claims(self): + out = run_ec('extract', '--report', self.report_path, '--dir', self.tmpdir) + self.assertEqual(out['status'], 'ok') + self.assertGreater(out['claims_added'], 5) + + def test_extract_idempotent(self): + out1 = run_ec('extract', '--report', self.report_path, '--dir', self.tmpdir) + out2 = run_ec('extract', '--report', self.report_path, '--dir', self.tmpdir) + self.assertEqual(out2['claims_added'], 0) + self.assertEqual(out2['claims_skipped'], out1['claims_added']) + + def test_claim_types_assigned(self): + run_ec('extract', '--report', self.report_path, '--dir', self.tmpdir) + out = run_ec('stats', '--dir', self.tmpdir) + # Should have at least factual and recommendation types + self.assertIn('factual', out['by_type']) + self.assertIn('recommendation', out['by_type']) + + def test_sections_detected(self): + run_ec('extract', '--report', self.report_path, '--dir', self.tmpdir) + out = run_ec('stats', '--dir', self.tmpdir) + self.assertIn('finding_1', out['by_section']) + self.assertIn('finding_2', out['by_section']) + self.assertIn('recommendations', out['by_section']) + + +class TestAdd(unittest.TestCase): + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + open(os.path.join(self.tmpdir, 'claims.jsonl'), 'w').close() + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_add_and_dedup(self): + claim = json.dumps({ + 'section_id': 'finding_1', + 'text': 'Quantum computers can break RSA encryption.', + 'claim_type': 'factual', + }) + out1 = run_ec('add', '--json', claim, '--dir', self.tmpdir) + self.assertEqual(out1['status'], 'added') + self.assertEqual(len(out1['claim_id']), 16) + + out2 = run_ec('add', '--json', claim, '--dir', self.tmpdir) + self.assertEqual(out2['status'], 'duplicate') + + def test_add_with_sources(self): + claim = json.dumps({ + 'section_id': 'finding_1', + 'text': 'NIST standardized CRYSTALS-Kyber in 2024.', + 'claim_type': 'factual', + 'cited_source_ids': ['abcdef0123456789'], + 'evidence_ids': ['1234567890abcdef'], + }) + out = run_ec('add', '--json', claim, '--dir', self.tmpdir) + self.assertEqual(out['status'], 'added') + + +class TestListAndStats(unittest.TestCase): + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + open(os.path.join(self.tmpdir, 'claims.jsonl'), 'w').close() + # Add mixed claims + for sec, text, ctype in [ + ('finding_1', 'The sky appears blue due to Rayleigh scattering.', 'factual'), + ('finding_1', 'Light wavelengths scatter differently in the atmosphere.', 'factual'), + ('synthesis', 'Overall, atmospheric optics explains most visual phenomena.', 'synthesis'), + ('recommendations', 'Researchers should investigate polarization effects further.', 'recommendation'), + ]: + run_ec('add', '--json', json.dumps({ + 'section_id': sec, 'text': text, 'claim_type': ctype, + }), '--dir', self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_list_all(self): + out = run_ec('list', '--dir', self.tmpdir) + self.assertEqual(out['count'], 4) + + def test_list_by_section(self): + out = run_ec('list', '--dir', self.tmpdir, 
'--section', 'finding_1') + self.assertEqual(out['count'], 2) + + def test_list_by_type(self): + out = run_ec('list', '--dir', self.tmpdir, '--type', 'recommendation') + self.assertEqual(out['count'], 1) + + def test_stats(self): + out = run_ec('stats', '--dir', self.tmpdir) + self.assertEqual(out['total'], 4) + self.assertEqual(out['by_type']['factual'], 2) + self.assertEqual(out['by_type']['synthesis'], 1) + self.assertEqual(out['by_type']['recommendation'], 1) + + +class TestClaimID(unittest.TestCase): + """Unit tests for compute_claim_id.""" + + @classmethod + def setUpClass(cls): + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'scripts')) + from extract_claims import compute_claim_id, classify_claim + cls.compute_id = staticmethod(compute_claim_id) + cls.classify = staticmethod(classify_claim) + + def test_deterministic(self): + id1 = self.compute_id('finding_1', 'Test claim.') + id2 = self.compute_id('finding_1', 'Test claim.') + self.assertEqual(id1, id2) + + def test_section_matters(self): + id1 = self.compute_id('finding_1', 'Same text.') + id2 = self.compute_id('finding_2', 'Same text.') + self.assertNotEqual(id1, id2) + + def test_classify_recommendation(self): + self.assertEqual( + self.classify('Organizations should adopt PQC immediately.', 'recommendations'), + 'recommendation', + ) + + def test_classify_factual(self): + self.assertEqual( + self.classify('RSA-2048 uses 2048-bit keys.', 'finding_1'), + 'factual', + ) + + def test_classify_synthesis(self): + self.assertEqual( + self.classify('Taken together, the results indicate a clear trend.', 'synthesis'), + 'synthesis', + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/.agents/skills/deep-research/tests/test_verify_claim_support.py b/.agents/skills/deep-research/tests/test_verify_claim_support.py new file mode 100644 index 000000000..4c05ce9cb --- /dev/null +++ b/.agents/skills/deep-research/tests/test_verify_claim_support.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +"""Tests for verify_claim_support.py CLI.""" + +import json +import os +import shutil +import subprocess +import sys +import tempfile +import unittest + +SCRIPT = os.path.join(os.path.dirname(__file__), '..', 'scripts', 'verify_claim_support.py') + + +def run_vcs(*args: str, expect_fail: bool = False) -> dict | str: + """Run verify_claim_support.py.""" + result = subprocess.run( + [sys.executable, SCRIPT, *args], + capture_output=True, text=True, + ) + if result.returncode != 0 and not expect_fail: + raise RuntimeError(f'Exit {result.returncode}: {result.stderr}\n{result.stdout}') + stdout = result.stdout.strip() + if stdout.startswith('{'): + return json.loads(stdout) + return stdout + + +def write_jsonl(path: str, rows: list[dict]): + with open(path, 'w') as f: + for row in rows: + f.write(json.dumps(row) + '\n') + + +class TestVerifySupported(unittest.TestCase): + """Claims with matching evidence should be supported.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + # Sources + write_jsonl(os.path.join(self.tmpdir, 'sources.jsonl'), [ + {'source_id': 'src_quantum_001', 'title': 'Quantum Computing 2024'}, + ]) + # Evidence with clear overlap to the claim + write_jsonl(os.path.join(self.tmpdir, 'evidence.jsonl'), [ + { + 'evidence_id': 'ev_shor_001', + 'source_id': 'src_quantum_001', + 'quote': "Shor's algorithm can factor large integers exponentially faster than any known classical algorithm, threatening RSA-2048 encryption.", + 'evidence_type': 'direct_quote', + }, + ]) + # Claim that matches the evidence + 
write_jsonl(os.path.join(self.tmpdir, 'claims.jsonl'), [ + { + 'claim_id': 'clm_factor_001', + 'section_id': 'finding_1', + 'text': "Shor's algorithm can factor large numbers exponentially faster than classical methods, threatening RSA-2048.", + 'claim_type': 'factual', + 'cited_source_ids': ['src_quantum_001'], + 'evidence_ids': ['ev_shor_001'], + 'support_status': 'unverified', + }, + ]) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_supported_claim(self): + out = run_vcs('verify', '--dir', self.tmpdir) + self.assertEqual(out['status'], 'pass') + self.assertEqual(out['factual_unsupported'], 0) + + # Check updated claims file + claims = [] + with open(os.path.join(self.tmpdir, 'claims.jsonl')) as f: + for line in f: + claims.append(json.loads(line)) + self.assertEqual(claims[0]['support_status'], 'supported') + + +class TestVerifyUnsupported(unittest.TestCase): + """Claims without evidence should be unsupported.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + write_jsonl(os.path.join(self.tmpdir, 'sources.jsonl'), []) + write_jsonl(os.path.join(self.tmpdir, 'evidence.jsonl'), []) + write_jsonl(os.path.join(self.tmpdir, 'claims.jsonl'), [ + { + 'claim_id': 'clm_no_ev_001', + 'section_id': 'finding_1', + 'text': 'The population of Mars is 500 million as of 2025.', + 'claim_type': 'factual', + 'cited_source_ids': [], + 'evidence_ids': [], + 'support_status': 'unverified', + }, + ]) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_unsupported_no_evidence(self): + out = run_vcs('verify', '--dir', self.tmpdir) + self.assertEqual(out['factual_unsupported'], 1) + self.assertEqual(out['status'], 'pass') # Non-strict by default + + def test_strict_fails(self): + out = run_vcs('verify', '--dir', self.tmpdir, '--strict', expect_fail=True) + self.assertEqual(out['status'], 'fail') + + +class TestVerifyMixed(unittest.TestCase): + """Mixed claim types with different thresholds.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + write_jsonl(os.path.join(self.tmpdir, 'sources.jsonl'), []) + write_jsonl(os.path.join(self.tmpdir, 'evidence.jsonl'), []) + write_jsonl(os.path.join(self.tmpdir, 'claims.jsonl'), [ + { + 'claim_id': 'clm_spec_001', + 'section_id': 'finding_1', + 'text': 'Quantum computers might eventually solve protein folding in real time.', + 'claim_type': 'speculation', + 'cited_source_ids': [], + 'evidence_ids': [], + 'support_status': 'unverified', + }, + { + 'claim_id': 'clm_rec_001', + 'section_id': 'recommendations', + 'text': 'Organizations should begin PQC migration planning immediately.', + 'claim_type': 'recommendation', + 'cited_source_ids': [], + 'evidence_ids': [], + 'support_status': 'unverified', + }, + ]) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_speculation_passes(self): + out = run_vcs('verify', '--dir', self.tmpdir) + # Speculation doesn't need evidence + claims = [] + with open(os.path.join(self.tmpdir, 'claims.jsonl')) as f: + for line in f: + claims.append(json.loads(line)) + spec = [c for c in claims if c['claim_type'] == 'speculation'][0] + self.assertEqual(spec['support_status'], 'supported') + + +class TestVerifyPartial(unittest.TestCase): + """Evidence with partial overlap should result in partial status.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + write_jsonl(os.path.join(self.tmpdir, 'sources.jsonl'), [ + {'source_id': 'src_nist_001', 'title': 'NIST PQC Standards'}, + ]) + 
write_jsonl(os.path.join(self.tmpdir, 'evidence.jsonl'), [ + { + 'evidence_id': 'ev_nist_001', + 'source_id': 'src_nist_001', + 'quote': 'NIST announced the standardization of CRYSTALS-Kyber for key encapsulation.', + 'evidence_type': 'direct_quote', + }, + ]) + # Claim mentions NIST but adds unverified detail about timeline + write_jsonl(os.path.join(self.tmpdir, 'claims.jsonl'), [ + { + 'claim_id': 'clm_nist_time', + 'section_id': 'finding_2', + 'text': 'NIST standardized four lattice-based algorithms in 2024, covering both encryption and signatures.', + 'claim_type': 'factual', + 'cited_source_ids': ['src_nist_001'], + 'evidence_ids': ['ev_nist_001'], + 'support_status': 'unverified', + }, + ]) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_partial_support(self): + out = run_vcs('verify', '--dir', self.tmpdir) + claims = [] + with open(os.path.join(self.tmpdir, 'claims.jsonl')) as f: + for line in f: + claims.append(json.loads(line)) + # Should be partial or needs_review (not fully supported due to number/detail mismatch) + self.assertIn(claims[0]['support_status'], ('partial', 'needs_review', 'supported')) + + +class TestSupportScore(unittest.TestCase): + """Unit tests for compute_support_score.""" + + @classmethod + def setUpClass(cls): + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'scripts')) + from verify_claim_support import compute_support_score + cls.score = staticmethod(compute_support_score) + + def test_identical_text(self): + status, score, _ = self.score( + 'RSA-2048 uses 2048-bit keys for encryption.', + ['RSA-2048 uses 2048-bit keys for encryption.'], + ) + self.assertEqual(status, 'supported') + self.assertGreater(score, 0.8) + + def test_no_evidence(self): + status, score, _ = self.score('Any claim text.', []) + self.assertEqual(status, 'unsupported') + self.assertEqual(score, 0.0) + + def test_unrelated_evidence(self): + status, score, _ = self.score( + 'The moon landing occurred in 1969.', + ['Bananas are a good source of potassium and fiber.'], + ) + self.assertIn(status, ('needs_review', 'unsupported')) + self.assertLess(score, 0.35) + + +if __name__ == '__main__': + unittest.main() diff --git a/.agents/skills/permission-auditor/SKILL.md b/.agents/skills/permission-auditor/SKILL.md new file mode 100644 index 000000000..7684f3edb --- /dev/null +++ b/.agents/skills/permission-auditor/SKILL.md @@ -0,0 +1,113 @@ +--- +name: permission-auditor +description: Analyze OpenClaw skill permissions and explain exactly what each permission allows. Identifies over-privileged + skills and suggests minimal permission sets. +metadata: + short-description: Explain requested skill permissions and flag over-privileged combinations. + why: Keep skill permissions minimal and understandable before granting access. + what: Provides a permission-analysis module for mapping declared access to actual task need. + how: Uses permission-by-permission review plus dangerous-combination checks and least-privilege guidance. + results: Produces a permission fit assessment with recommended minimal access scope. + version: 1.0.0 + updated: '2026-03-10T03:42:30Z' + jtbd-1: When I need to decide whether a skill is requesting more access than its job actually needs. + audit: + kind: module + author: useclawpro + category: Security + trust-score: 96 + last-audited: '2026-02-01' + permissions: + file-read: true + file-write: false + network: false + shell: false +--- + +# Permission Auditor + +You are a permissions analyst for OpenClaw skills. 
Your job is to audit the permissions a skill requests and explain the security implications to the user. + +## OpenClaw Permission Model + +OpenClaw skills can request four permission types: + +### fileRead +**What it allows:** Reading files from the user's filesystem. +**Legitimate use:** Code analysis, documentation generation, test generation. +**Risk:** A malicious skill could read `~/.ssh/id_rsa`, `~/.aws/credentials`, `.env` files, or any sensitive data on disk. +**Mitigation:** Check which file paths the skill actually accesses. A code reviewer needs `src/**` — not `~/`. + +### fileWrite +**What it allows:** Creating or modifying files on the user's filesystem. +**Legitimate use:** Generating code, writing test files, updating configs. +**Risk:** A malicious skill could overwrite `.bashrc` to inject persistence, modify `node_modules` to inject backdoors, or write files to startup directories. +**Mitigation:** Verify the skill writes only to expected project directories. Flag any writes outside the current workspace. + +### network +**What it allows:** Making HTTP requests to external servers. +**Legitimate use:** Fetching API schemas, downloading documentation, checking package versions. +**Risk:** This is the primary exfiltration vector. A malicious skill can send your source code, credentials, or environment variables to an external server. +**Mitigation:** Network access should be rare. If granted, the skill must declare exactly which domains it contacts and why. + +### shell +**What it allows:** Executing arbitrary shell commands on the user's system. +**Legitimate use:** Running `git log`, `npm test`, build commands. +**Risk:** Full system compromise. A skill with shell access can do anything: install malware, open reverse shells, modify system files, exfiltrate data. +**Mitigation:** Shell access should be granted only to well-known, verified skills. Always review which commands the skill executes. + +## Audit Protocol + +When the user provides a skill's permissions, follow this process: + +### 1. List Requested Permissions + +``` +PERMISSION AUDIT +================ +Skill: + + fileRead: [YES/NO] + fileWrite: [YES/NO] + network: [YES/NO] + shell: [YES/NO] +``` + +### 2. Evaluate Necessity + +For each granted permission, answer: +- **Why does this skill need it?** (based on its description) +- **Is this the minimum required?** (could it work with fewer permissions?) +- **What is the worst case?** (if the skill is malicious, what could it do?) + +### 3. Identify Dangerous Combinations + +| Combination | Risk | Reason | +|---|---|---| +| network + fileRead | CRITICAL | Can read and exfiltrate any file | +| network + shell | CRITICAL | Can execute commands and send output externally | +| shell + fileWrite | HIGH | Can modify system files and persist | +| fileRead + fileWrite | MEDIUM | Can read secrets and write backdoors | +| fileRead only | LOW | Read-only, minimal risk | + +### 4. Suggest Minimum Permissions + +Based on the skill's description, recommend the minimal permission set: + +``` +RECOMMENDATION +============== +Current: fileRead + fileWrite + network + shell +Minimal: fileRead + fileWrite +Reason: This skill generates tests from source code. + It needs to read source and write test files. + Network and shell access are not justified. +``` + +## Rules + +1. Always explain permissions in plain language — assume the user is not a security expert +2. Use concrete examples of what could go wrong, not abstract warnings +3. 
If a skill requests `network` or `shell`, always recommend extra scrutiny +4. Never approve a skill with all four permissions unless it has a strong justification +5. Suggest alternatives if a skill seems over-privileged diff --git a/.agents/skills/swarm-advanced/SKILL.md b/.agents/skills/swarm-advanced/SKILL.md new file mode 100644 index 000000000..fad1ea188 --- /dev/null +++ b/.agents/skills/swarm-advanced/SKILL.md @@ -0,0 +1,973 @@ +--- +name: swarm-advanced +description: Advanced swarm orchestration patterns for research, development, testing, and complex distributed workflows +version: 2.0.0 +category: orchestration +tags: [swarm, distributed, parallel, research, testing, development, coordination] +author: Claude Flow Team +--- + +# Advanced Swarm Orchestration + +Master advanced swarm patterns for distributed research, development, and testing workflows. This skill covers comprehensive orchestration strategies using both MCP tools and CLI commands. + +## Quick Start + +### Prerequisites +```bash +# Ensure Claude Flow is installed +npm install -g claude-flow@alpha + +# Add MCP server (if using MCP tools) +claude mcp add claude-flow npx claude-flow@alpha mcp start +``` + +### Basic Pattern +```javascript +// 1. Initialize swarm topology +mcp__claude-flow__swarm_init({ topology: "mesh", maxAgents: 6 }) + +// 2. Spawn specialized agents +mcp__claude-flow__agent_spawn({ type: "researcher", name: "Agent 1" }) + +// 3. Orchestrate tasks +mcp__claude-flow__task_orchestrate({ task: "...", strategy: "parallel" }) +``` + +## Core Concepts + +### Swarm Topologies + +**Mesh Topology** - Peer-to-peer communication, best for research and analysis +- All agents communicate directly +- High flexibility and resilience +- Use for: Research, analysis, brainstorming + +**Hierarchical Topology** - Coordinator with subordinates, best for development +- Clear command structure +- Sequential workflow support +- Use for: Development, structured workflows + +**Star Topology** - Central coordinator, best for testing +- Centralized control and monitoring +- Parallel execution with coordination +- Use for: Testing, validation, quality assurance + +**Ring Topology** - Sequential processing chain +- Step-by-step processing +- Pipeline workflows +- Use for: Multi-stage processing, data pipelines + +### Agent Strategies + +**Adaptive** - Dynamic adjustment based on task complexity +**Balanced** - Equal distribution of work across agents +**Specialized** - Task-specific agent assignment +**Parallel** - Maximum concurrent execution + +## Pattern 1: Research Swarm + +### Purpose +Deep research through parallel information gathering, analysis, and synthesis. 
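+
+As a quick illustration before the full architecture below, the topology guidance from Core Concepts can be condensed into a small lookup. This is a minimal sketch, assuming only the `swarm_init` call shown throughout this skill; `topologyFor` is a hypothetical helper, not part of the claude-flow API:
+
+```javascript
+// Hypothetical mapping from task category to the topology guidance above
+const topologyFor = {
+  research: "mesh",            // peer-to-peer analysis and brainstorming
+  development: "hierarchical", // coordinator-led, structured workflows
+  testing: "star",             // centralized control and monitoring
+  pipeline: "ring"             // sequential, staged processing
+}
+
+// Research swarms use mesh, exactly as in the architecture below
+mcp__claude-flow__swarm_init({
+  topology: topologyFor.research,
+  maxAgents: 6,
+  strategy: "adaptive"
+})
+```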
+
+### Architecture
+```javascript
+// Initialize research swarm
+mcp__claude-flow__swarm_init({
+  "topology": "mesh",
+  "maxAgents": 6,
+  "strategy": "adaptive"
+})
+
+// Spawn research team
+const researchAgents = [
+  {
+    type: "researcher",
+    name: "Web Researcher",
+    capabilities: ["web-search", "content-extraction", "source-validation"]
+  },
+  {
+    type: "researcher",
+    name: "Academic Researcher",
+    capabilities: ["paper-analysis", "citation-tracking", "literature-review"]
+  },
+  {
+    type: "analyst",
+    name: "Data Analyst",
+    capabilities: ["data-processing", "statistical-analysis", "visualization"]
+  },
+  {
+    type: "analyst",
+    name: "Pattern Analyzer",
+    capabilities: ["trend-detection", "correlation-analysis", "outlier-detection"]
+  },
+  {
+    type: "documenter",
+    name: "Report Writer",
+    capabilities: ["synthesis", "technical-writing", "formatting"]
+  }
+]
+
+// Spawn all agents
+researchAgents.forEach(agent => {
+  mcp__claude-flow__agent_spawn({
+    type: agent.type,
+    name: agent.name,
+    capabilities: agent.capabilities
+  })
+})
+```
+
+### Research Workflow
+
+#### Phase 1: Information Gathering
+```javascript
+// Parallel information collection
+mcp__claude-flow__parallel_execute({
+  "tasks": [
+    {
+      "id": "web-search",
+      "command": "search recent publications and articles"
+    },
+    {
+      "id": "academic-search",
+      "command": "search academic databases and papers"
+    },
+    {
+      "id": "data-collection",
+      "command": "gather relevant datasets and statistics"
+    },
+    {
+      "id": "expert-search",
+      "command": "identify domain experts and thought leaders"
+    }
+  ]
+})
+
+// Store research findings in memory
+mcp__claude-flow__memory_usage({
+  "action": "store",
+  "key": "research-findings-" + Date.now(),
+  "value": JSON.stringify(findings),
+  "namespace": "research",
+  "ttl": 604800 // 7 days
+})
+```
+
+#### Phase 2: Analysis and Validation
+```javascript
+// Pattern recognition in findings
+mcp__claude-flow__pattern_recognize({
+  "data": researchData,
+  "patterns": ["trend", "correlation", "outlier", "emerging-pattern"]
+})
+
+// Cognitive analysis
+mcp__claude-flow__cognitive_analyze({
+  "behavior": "research-synthesis"
+})
+
+// Quality assessment
+mcp__claude-flow__quality_assess({
+  "target": "research-sources",
+  "criteria": ["credibility", "relevance", "recency", "authority"]
+})
+
+// Cross-reference validation
+mcp__claude-flow__neural_patterns({
+  "action": "analyze",
+  "operation": "fact-checking",
+  "metadata": { "sources": sourcesArray }
+})
+```
+
+#### Phase 3: Knowledge Management
+```javascript
+// Search existing knowledge base
+mcp__claude-flow__memory_search({
+  "pattern": "topic X",
+  "namespace": "research",
+  "limit": 20
+})
+
+// Create knowledge graph connections
+mcp__claude-flow__neural_patterns({
+  "action": "learn",
+  "operation": "knowledge-graph",
+  "metadata": {
+    "topic": "X",
+    "connections": relatedTopics,
+    "depth": 3
+  }
+})
+
+// Store connections for future use
+mcp__claude-flow__memory_usage({
+  "action": "store",
+  "key": "knowledge-graph-X",
+  "value": JSON.stringify(knowledgeGraph),
+  "namespace": "research/graphs",
+  "ttl": 2592000 // 30 days
+})
+```
+
+#### Phase 4: Report Generation
+```javascript
+// Orchestrate report generation
+mcp__claude-flow__task_orchestrate({
+  "task": "generate comprehensive research report",
+  "strategy": "sequential",
+  "priority": "high",
+  "dependencies": ["gather", "analyze", "validate", "synthesize"]
+})
+
+// Monitor research progress
+mcp__claude-flow__swarm_status({
+  "swarmId": "research-swarm"
+})
+
+// Generate final report
+mcp__claude-flow__workflow_execute({
+  "workflowId": "research-report-generation",
+  "params": {
+    "findings": findings,
+    "format": "comprehensive",
+    "sections": ["executive-summary", "methodology", "findings", "analysis", "conclusions", "references"]
+  }
+})
+```
+
+### CLI Fallback
+```bash
+# Quick research swarm
+npx claude-flow swarm "research AI trends in 2025" \
+  --strategy research \
+  --mode distributed \
+  --max-agents 6 \
+  --parallel \
+  --output research-report.md
+```
+
+## Pattern 2: Development Swarm
+
+### Purpose
+Full-stack development through coordinated specialist agents.
+
+### Architecture
+```javascript
+// Initialize development swarm with hierarchy
+mcp__claude-flow__swarm_init({
+  "topology": "hierarchical",
+  "maxAgents": 8,
+  "strategy": "balanced"
+})
+
+// Spawn development team
+const devTeam = [
+  { type: "architect", name: "System Architect", role: "coordinator" },
+  { type: "coder", name: "Backend Developer", capabilities: ["node", "api", "database"] },
+  { type: "coder", name: "Frontend Developer", capabilities: ["react", "ui", "ux"] },
+  { type: "coder", name: "Database Engineer", capabilities: ["sql", "nosql", "optimization"] },
+  { type: "tester", name: "QA Engineer", capabilities: ["unit", "integration", "e2e"] },
+  { type: "reviewer", name: "Code Reviewer", capabilities: ["security", "performance", "best-practices"] },
+  { type: "documenter", name: "Technical Writer", capabilities: ["api-docs", "guides", "tutorials"] },
+  { type: "monitor", name: "DevOps Engineer", capabilities: ["ci-cd", "deployment", "monitoring"] }
+]
+
+// Spawn all team members
+devTeam.forEach(member => {
+  mcp__claude-flow__agent_spawn({
+    type: member.type,
+    name: member.name,
+    capabilities: member.capabilities,
+    swarmId: "dev-swarm"
+  })
+})
+```
+
+### Development Workflow
+
+#### Phase 1: Architecture and Design
+```javascript
+// System architecture design
+mcp__claude-flow__task_orchestrate({
+  "task": "design system architecture for REST API",
+  "strategy": "sequential",
+  "priority": "critical",
+  "assignTo": "System Architect"
+})
+
+// Store architecture decisions
+mcp__claude-flow__memory_usage({
+  "action": "store",
+  "key": "architecture-decisions",
+  "value": JSON.stringify(architectureDoc),
+  "namespace": "development/design"
+})
+```
+
+#### Phase 2: Parallel Implementation
+```javascript
+// Parallel development tasks
+mcp__claude-flow__parallel_execute({
+  "tasks": [
+    {
+      "id": "backend-api",
+      "command": "implement REST API endpoints",
+      "assignTo": "Backend Developer"
+    },
+    {
+      "id": "frontend-ui",
+      "command": "build user interface components",
+      "assignTo": "Frontend Developer"
+    },
+    {
+      "id": "database-schema",
+      "command": "design and implement database schema",
+      "assignTo": "Database Engineer"
+    },
+    {
+      "id": "api-documentation",
+      "command": "create API documentation",
+      "assignTo": "Technical Writer"
+    }
+  ]
+})
+
+// Monitor development progress
+mcp__claude-flow__swarm_monitor({
+  "swarmId": "dev-swarm",
+  "interval": 5000
+})
+```
+
+#### Phase 3: Testing and Validation
+```javascript
+// Comprehensive testing
+mcp__claude-flow__batch_process({
+  "items": [
+    { type: "unit", target: "all-modules" },
+    { type: "integration", target: "api-endpoints" },
+    { type: "e2e", target: "user-flows" },
+    { type: "performance", target: "critical-paths" }
+  ],
+  "operation": "execute-tests"
+})
+
+// Quality assessment
+mcp__claude-flow__quality_assess({
+  "target": "codebase",
+  "criteria": ["coverage", "complexity", "maintainability", "security"]
+})
+```
+
+#### Phase 4: Review and Deployment
+```javascript
+// Code review workflow
+mcp__claude-flow__workflow_execute({
+  "workflowId": "code-review-process",
+  "params": {
+    "reviewers": ["Code Reviewer"],
+    "criteria": ["security", "performance", "best-practices"]
+  }
+})
+
+// CI/CD pipeline
+mcp__claude-flow__pipeline_create({
+  "config": {
+    "stages": ["build", "test", "security-scan", "deploy"],
+    "environment": "production"
+  }
+})
+```
+
+### CLI Fallback
+```bash
+# Quick development swarm
+npx claude-flow swarm "build REST API with authentication" \
+  --strategy development \
+  --mode hierarchical \
+  --monitor \
+  --output sqlite
+```
+
+## Pattern 3: Testing Swarm
+
+### Purpose
+Comprehensive quality assurance through distributed testing.
+
+### Architecture
+```javascript
+// Initialize testing swarm with star topology
+mcp__claude-flow__swarm_init({
+  "topology": "star",
+  "maxAgents": 7,
+  "strategy": "parallel"
+})
+
+// Spawn testing team
+const testingTeam = [
+  {
+    type: "tester",
+    name: "Unit Test Coordinator",
+    capabilities: ["unit-testing", "mocking", "coverage", "tdd"]
+  },
+  {
+    type: "tester",
+    name: "Integration Tester",
+    capabilities: ["integration", "api-testing", "contract-testing"]
+  },
+  {
+    type: "tester",
+    name: "E2E Tester",
+    capabilities: ["e2e", "ui-testing", "user-flows", "selenium"]
+  },
+  {
+    type: "tester",
+    name: "Performance Tester",
+    capabilities: ["load-testing", "stress-testing", "benchmarking"]
+  },
+  {
+    type: "monitor",
+    name: "Security Tester",
+    capabilities: ["security-testing", "penetration-testing", "vulnerability-scanning"]
+  },
+  {
+    type: "analyst",
+    name: "Test Analyst",
+    capabilities: ["coverage-analysis", "test-optimization", "reporting"]
+  },
+  {
+    type: "documenter",
+    name: "Test Documenter",
+    capabilities: ["test-documentation", "test-plans", "reports"]
+  }
+]
+
+// Spawn all testers
+testingTeam.forEach(tester => {
+  mcp__claude-flow__agent_spawn({
+    type: tester.type,
+    name: tester.name,
+    capabilities: tester.capabilities,
+    swarmId: "testing-swarm"
+  })
+})
+```
+
+### Testing Workflow
+
+#### Phase 1: Test Planning
+```javascript
+// Analyze test coverage requirements
+mcp__claude-flow__quality_assess({
+  "target": "test-coverage",
+  "criteria": [
+    "line-coverage",
+    "branch-coverage",
+    "function-coverage",
+    "edge-cases"
+  ]
+})
+
+// Identify test scenarios
+mcp__claude-flow__pattern_recognize({
+  "data": testScenarios,
+  "patterns": [
+    "edge-case",
+    "boundary-condition",
+    "error-path",
+    "happy-path"
+  ]
+})
+
+// Store test plan
+mcp__claude-flow__memory_usage({
+  "action": "store",
+  "key": "test-plan-" + Date.now(),
+  "value": JSON.stringify(testPlan),
+  "namespace": "testing/plans"
+})
+```
+
+#### Phase 2: Parallel Test Execution
+```javascript
+// Execute all test suites in parallel
+mcp__claude-flow__parallel_execute({
+  "tasks": [
+    {
+      "id": "unit-tests",
+      "command": "npm run test:unit",
+      "assignTo": "Unit Test Coordinator"
+    },
+    {
+      "id": "integration-tests",
+      "command": "npm run test:integration",
+      "assignTo": "Integration Tester"
+    },
+    {
+      "id": "e2e-tests",
+      "command": "npm run test:e2e",
+      "assignTo": "E2E Tester"
+    },
+    {
+      "id": "performance-tests",
+      "command": "npm run test:performance",
+      "assignTo": "Performance Tester"
+    },
+    {
+      "id": "security-tests",
+      "command": "npm run test:security",
+      "assignTo": "Security Tester"
+    }
+  ]
+})
+
+// Batch process test suites
+mcp__claude-flow__batch_process({
+  "items": testSuites,
+  "operation":
"execute-test-suite" +}) +``` + +#### Phase 3: Performance and Security +```javascript +// Run performance benchmarks +mcp__claude-flow__benchmark_run({ + "suite": "comprehensive-performance" +}) + +// Bottleneck analysis +mcp__claude-flow__bottleneck_analyze({ + "component": "application", + "metrics": ["response-time", "throughput", "memory", "cpu"] +}) + +// Security scanning +mcp__claude-flow__security_scan({ + "target": "application", + "depth": "comprehensive" +}) + +// Vulnerability analysis +mcp__claude-flow__error_analysis({ + "logs": securityScanLogs +}) +``` + +#### Phase 4: Monitoring and Reporting +```javascript +// Real-time test monitoring +mcp__claude-flow__swarm_monitor({ + "swarmId": "testing-swarm", + "interval": 2000 +}) + +// Generate comprehensive test report +mcp__claude-flow__performance_report({ + "format": "detailed", + "timeframe": "current-run" +}) + +// Get test results +mcp__claude-flow__task_results({ + "taskId": "test-execution-001" +}) + +// Trend analysis +mcp__claude-flow__trend_analysis({ + "metric": "test-coverage", + "period": "30d" +}) +``` + +### CLI Fallback +```bash +# Quick testing swarm +npx claude-flow swarm "test application comprehensively" \ + --strategy testing \ + --mode star \ + --parallel \ + --timeout 600 +``` + +## Pattern 4: Analysis Swarm + +### Purpose +Deep code and system analysis through specialized analyzers. + +### Architecture +```javascript +// Initialize analysis swarm +mcp__claude-flow__swarm_init({ + "topology": "mesh", + "maxAgents": 5, + "strategy": "adaptive" +}) + +// Spawn analysis specialists +const analysisTeam = [ + { + type: "analyst", + name: "Code Analyzer", + capabilities: ["static-analysis", "complexity-analysis", "dead-code-detection"] + }, + { + type: "analyst", + name: "Security Analyzer", + capabilities: ["security-scan", "vulnerability-detection", "dependency-audit"] + }, + { + type: "analyst", + name: "Performance Analyzer", + capabilities: ["profiling", "bottleneck-detection", "optimization"] + }, + { + type: "analyst", + name: "Architecture Analyzer", + capabilities: ["dependency-analysis", "coupling-detection", "modularity-assessment"] + }, + { + type: "documenter", + name: "Analysis Reporter", + capabilities: ["reporting", "visualization", "recommendations"] + } +] + +// Spawn all analysts +analysisTeam.forEach(analyst => { + mcp__claude-flow__agent_spawn({ + type: analyst.type, + name: analyst.name, + capabilities: analyst.capabilities + }) +}) +``` + +### Analysis Workflow +```javascript +// Parallel analysis execution +mcp__claude-flow__parallel_execute({ + "tasks": [ + { "id": "analyze-code", "command": "analyze codebase structure and quality" }, + { "id": "analyze-security", "command": "scan for security vulnerabilities" }, + { "id": "analyze-performance", "command": "identify performance bottlenecks" }, + { "id": "analyze-architecture", "command": "assess architectural patterns" } + ] +}) + +// Generate comprehensive analysis report +mcp__claude-flow__performance_report({ + "format": "detailed", + "timeframe": "current" +}) + +// Cost analysis +mcp__claude-flow__cost_analysis({ + "timeframe": "30d" +}) +``` + +## Advanced Techniques + +### Error Handling and Fault Tolerance + +```javascript +// Setup fault tolerance for all agents +mcp__claude-flow__daa_fault_tolerance({ + "agentId": "all", + "strategy": "auto-recovery" +}) + +// Error handling pattern +try { + await mcp__claude-flow__task_orchestrate({ + "task": "complex operation", + "strategy": "parallel", + "priority": "high" + }) +} catch 
(error) {
+  // Check swarm health
+  const status = await mcp__claude-flow__swarm_status({})
+
+  // Analyze error patterns
+  await mcp__claude-flow__error_analysis({
+    "logs": [error.message]
+  })
+
+  // Auto-recovery attempt
+  if (status.healthy) {
+    await mcp__claude-flow__task_orchestrate({
+      "task": "retry failed operation",
+      "strategy": "sequential"
+    })
+  }
+}
+```
+
+### Memory and State Management
+
+```javascript
+// Cross-session persistence
+mcp__claude-flow__memory_persist({
+  "sessionId": "swarm-session-001"
+})
+
+// Namespace management for different swarms
+mcp__claude-flow__memory_namespace({
+  "namespace": "research-swarm",
+  "action": "create"
+})
+
+// Create state snapshot
+mcp__claude-flow__state_snapshot({
+  "name": "development-checkpoint-1"
+})
+
+// Restore from snapshot if needed
+mcp__claude-flow__context_restore({
+  "snapshotId": "development-checkpoint-1"
+})
+
+// Backup memory stores
+mcp__claude-flow__memory_backup({
+  "path": "/workspaces/claude-code-flow/backups/swarm-memory.json"
+})
+```
+
+### Neural Pattern Learning
+
+```javascript
+// Train neural patterns from successful workflows
+mcp__claude-flow__neural_train({
+  "pattern_type": "coordination",
+  "training_data": JSON.stringify(successfulWorkflows),
+  "epochs": 50
+})
+
+// Adaptive learning from experience
+mcp__claude-flow__learning_adapt({
+  "experience": {
+    "workflow": "research-to-report",
+    "success": true,
+    "duration": 3600,
+    "quality": 0.95
+  }
+})
+
+// Pattern recognition for optimization
+mcp__claude-flow__pattern_recognize({
+  "data": workflowMetrics,
+  "patterns": ["bottleneck", "optimization-opportunity", "efficiency-gain"]
+})
+```
+
+### Workflow Automation
+
+```javascript
+// Create reusable workflow
+mcp__claude-flow__workflow_create({
+  "name": "full-stack-development",
+  "steps": [
+    { "phase": "design", "agents": ["architect"] },
+    { "phase": "implement", "agents": ["backend-dev", "frontend-dev"], "parallel": true },
+    { "phase": "test", "agents": ["tester", "security-tester"], "parallel": true },
+    { "phase": "review", "agents": ["reviewer"] },
+    { "phase": "deploy", "agents": ["devops"] }
+  ],
+  "triggers": ["on-commit", "scheduled-daily"]
+})
+
+// Setup automation rules
+mcp__claude-flow__automation_setup({
+  "rules": [
+    {
+      "trigger": "file-changed",
+      "pattern": "*.js",
+      "action": "run-tests"
+    },
+    {
+      "trigger": "PR-created",
+      "action": "code-review-swarm"
+    }
+  ]
+})
+
+// Event-driven triggers
+mcp__claude-flow__trigger_setup({
+  "events": ["code-commit", "PR-merge", "deployment"],
+  "actions": ["test", "analyze", "document"]
+})
+```
+
+### Performance Optimization
+
+```javascript
+// Topology optimization
+mcp__claude-flow__topology_optimize({
+  "swarmId": "current-swarm"
+})
+
+// Load balancing
+mcp__claude-flow__load_balance({
+  "swarmId": "development-swarm",
+  "tasks": taskQueue
+})
+
+// Agent coordination sync
+mcp__claude-flow__coordination_sync({
+  "swarmId": "development-swarm"
+})
+
+// Auto-scaling
+mcp__claude-flow__swarm_scale({
+  "swarmId": "development-swarm",
+  "targetSize": 12
+})
+```
+
+### Monitoring and Metrics
+
+```javascript
+// Real-time swarm monitoring
+mcp__claude-flow__swarm_monitor({
+  "swarmId": "active-swarm",
+  "interval": 3000
+})
+
+// Collect comprehensive metrics
+mcp__claude-flow__metrics_collect({
+  "components": ["agents", "tasks", "memory", "performance"]
+})
+
+// Health monitoring
+mcp__claude-flow__health_check({
+  "components": ["swarm", "agents", "neural", "memory"]
+})
+
+// Usage statistics
+mcp__claude-flow__usage_stats({ + "component": "swarm-orchestration" +}) + +// Trend analysis +mcp__claude-flow__trend_analysis({ + "metric": "agent-performance", + "period": "7d" +}) +``` + +## Best Practices + +### 1. Choosing the Right Topology + +- **Mesh**: Research, brainstorming, collaborative analysis +- **Hierarchical**: Structured development, sequential workflows +- **Star**: Testing, validation, centralized coordination +- **Ring**: Pipeline processing, staged workflows + +### 2. Agent Specialization + +- Assign specific capabilities to each agent +- Avoid overlapping responsibilities +- Use coordination agents for complex workflows +- Leverage memory for agent communication + +### 3. Parallel Execution + +- Identify independent tasks for parallelization +- Use sequential execution for dependent tasks +- Monitor resource usage during parallel execution +- Implement proper error handling + +### 4. Memory Management + +- Use namespaces to organize memory +- Set appropriate TTL values +- Create regular backups +- Implement state snapshots for checkpoints + +### 5. Monitoring and Optimization + +- Monitor swarm health regularly +- Collect and analyze metrics +- Optimize topology based on performance +- Use neural patterns to learn from success + +### 6. Error Recovery + +- Implement fault tolerance strategies +- Use auto-recovery mechanisms +- Analyze error patterns +- Create fallback workflows + +## Real-World Examples + +### Example 1: AI Research Project +```javascript +// Research AI trends, analyze findings, generate report +mcp__claude-flow__swarm_init({ topology: "mesh", maxAgents: 6 }) +// Spawn: 2 researchers, 2 analysts, 1 synthesizer, 1 documenter +// Parallel gather → Analyze patterns → Synthesize → Report +``` + +### Example 2: Full-Stack Application +```javascript +// Build complete web application with testing +mcp__claude-flow__swarm_init({ topology: "hierarchical", maxAgents: 8 }) +// Spawn: 1 architect, 2 devs, 1 db engineer, 2 testers, 1 reviewer, 1 devops +// Design → Parallel implement → Test → Review → Deploy +``` + +### Example 3: Security Audit +```javascript +// Comprehensive security analysis +mcp__claude-flow__swarm_init({ topology: "star", maxAgents: 5 }) +// Spawn: 1 coordinator, 1 code analyzer, 1 security scanner, 1 penetration tester, 1 reporter +// Parallel scan → Vulnerability analysis → Penetration test → Report +``` + +### Example 4: Performance Optimization +```javascript +// Identify and fix performance bottlenecks +mcp__claude-flow__swarm_init({ topology: "mesh", maxAgents: 4 }) +// Spawn: 1 profiler, 1 bottleneck analyzer, 1 optimizer, 1 tester +// Profile → Identify bottlenecks → Optimize → Validate +``` + +## Troubleshooting + +### Common Issues + +**Issue**: Swarm agents not coordinating properly +**Solution**: Check topology selection, verify memory usage, enable monitoring + +**Issue**: Parallel execution failing +**Solution**: Verify task dependencies, check resource limits, implement error handling + +**Issue**: Memory persistence not working +**Solution**: Verify namespaces, check TTL settings, ensure backup configuration + +**Issue**: Performance degradation +**Solution**: Optimize topology, reduce agent count, analyze bottlenecks + +## Related Skills + +- `sparc-methodology` - Systematic development workflow +- `github-integration` - Repository management and automation +- `neural-patterns` - AI-powered coordination optimization +- `memory-management` - Cross-session state persistence + +## References + +- [Claude Flow 
Documentation](https://github.com/ruvnet/claude-flow)
+- [Swarm Orchestration Guide](https://github.com/ruvnet/claude-flow/wiki/swarm)
+- [MCP Tools Reference](https://github.com/ruvnet/claude-flow/wiki/mcp)
+- [Performance Optimization](https://github.com/ruvnet/claude-flow/wiki/performance)
+
+---
+
+**Version**: 2.0.0
+**Last Updated**: 2025-10-19
+**Skill Level**: Advanced
+**Estimated Learning Time**: 2-3 hours
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..f1ec07ec3
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,12 @@
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+indent_style = space
+indent_size = 2
+insert_final_newline = true
+trim_trailing_whitespace = true
+
+[*.py]
+indent_size = 4
diff --git a/.env.example b/.env.example
new file mode 100644
index 000000000..9da825107
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,30 @@
+# Frontend
+NEXT_PUBLIC_API_BASE_URL=http://localhost:8000
+
+# Backend
+LLM_PROVIDER=groq
+GROQ_API_KEY=replace_me
+GROQ_MODEL=llama-3.1-8b-instant
+# Keep Apfel configured for future switch testing.
+APFEL_BASE_URL=https://apfel.example.internal/v1
+APFEL_API_KEY=replace_me
+APFEL_MODEL=apfel-chat
+APFEL_ENABLED=true
+MILVUS_HOST=localhost
+MILVUS_PORT=19530
+MILVUS_COLLECTION_NAME=uniops_documents
+RETRIEVAL_MODE=hybrid
+EMBEDDING_PROVIDER=deterministic
+EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
+
+# IRIS Integration
+IRIS_BASE_URL=https://localhost
+IRIS_PROJECT_KEY=SERVICE-X
+IRIS_API_KEY=replace_me
+IRIS_VERIFY_SSL=false
+
+# Confluence Integration
+CONFLUENCE_BASE_URL=https://confluence.example.internal
+CONFLUENCE_SPACE_KEY=OPS
+CONFLUENCE_API_TOKEN=replace_me
+CONFLUENCE_EMAIL=replace_me@example.com
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 000000000..6313b56c5
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+* text=auto eol=lf
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 000000000..40ff4ecbc
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,8 @@
+# Replace these handles with your real GitHub usernames.
+# Example: @chiragds @teammate + +/frontend/ @engineer-frontend +/backend/ @engineer-backend +/shared/contracts/ @engineer-frontend @engineer-backend +/docs/ways-of-working/ @engineer-frontend @engineer-backend +/infra/ @engineer-backend diff --git a/.github/workflows/ownership-boundary-check.yml b/.github/workflows/ownership-boundary-check.yml new file mode 100644 index 000000000..7abf34556 --- /dev/null +++ b/.github/workflows/ownership-boundary-check.yml @@ -0,0 +1,23 @@ +name: Ownership Boundary Check + +on: + pull_request: + branches: + - main + types: [opened, synchronize, reopened, ready_for_review] + +jobs: + boundary-check: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Validate ownership boundaries + env: + BRANCH_NAME: ${{ github.head_ref }} + BASE_REF: origin/${{ github.base_ref }} + run: bash scripts/check-boundaries.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..fee077b73 --- /dev/null +++ b/.gitignore @@ -0,0 +1,30 @@ +# Node / frontend +frontend/node_modules/ +frontend/.next/ +frontend/out/ +frontend/coverage/ + +# Python / backend +backend/.venv/ +backend/__pycache__/ +**/__pycache__/ +backend/.pytest_cache/ +backend/.mypy_cache/ +backend/*.pyc +*.pyc +backend/.uniops/ + +# Env and local state +.env +.env.* +!.env.example +*.log +.DS_Store + +# Local third-party checkout +.vendor/ + +# IDE +.vscode/* +!.vscode/extensions.json +!.vscode/settings.json diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..7bdaeb4ab --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ +.PHONY: up down backend-test iris-install iris-up iris-down iris-logs iris-admin-password require-docker + +ROOT_DIR := $(shell pwd) +IRIS_WEB_DIR ?= $(ROOT_DIR)/.vendor/iris-web +IRIS_WEB_REF ?= v2.4.27 + +up: + bash scripts/dev-up.sh + +down: + bash scripts/dev-down.sh + +backend-test: + cd backend && python -m pytest -q + +iris-install: + bash scripts/iris/install_iris_web.sh "$(IRIS_WEB_DIR)" "$(IRIS_WEB_REF)" + +require-docker: + @command -v docker >/dev/null 2>&1 || (echo "docker CLI not found. Install Docker Desktop and retry." && exit 1) + +iris-up: require-docker iris-install + cd "$(IRIS_WEB_DIR)" && docker compose up -d + +iris-down: require-docker + cd "$(IRIS_WEB_DIR)" && docker compose down + +iris-logs: require-docker + cd "$(IRIS_WEB_DIR)" && docker compose logs -f app + +iris-admin-password: require-docker + cd "$(IRIS_WEB_DIR)" && docker compose logs app | grep "create_safe_admin" | tail -1 diff --git a/README.md b/README.md index c5c886b3e..6c16b2920 100644 --- a/README.md +++ b/README.md @@ -1,86 +1,115 @@ -# HackToFuture 4.0 — Template - -Welcome to your official HackToFuture 4 repository. - -This repository template will be used for development, tracking progress, and final submission of your project. Ensure that all work is committed here within the allowed hackathon duration. - ---- - -### Instructions for the teams: - -- Fork the Repository and name the forked repo in this convention: hacktofuture4-team_id (for eg: hacktofuture4-A01) - ---- - -## Rules - -- Work must be done ONLY in the forked repository -- Only Four Contributors are allowed. -- After 36 hours, Please make PR to the Main Repository. A Form will be sent to fill the required information. -- Do not copy code from other teams -- All commits must be from individual GitHub accounts -- Please provide meaningful commits for tracking. 
-- Do not share your repository with other teams -- Final submission must be pushed before the deadline -- Any violation may lead to disqualification - ---- +# hacktofuture4 — D07 + +Monorepo scaffold for the HackToFuture 4 (D07) build, designed for fast parallel development across frontend and backend. + +## Repository Structure + +```text +. +├── frontend/ # Next.js app +│ ├── app/ +│ ├── components/ +│ ├── lib/ +│ └── tests/ +├── backend/ # FastAPI API + orchestration +│ ├── app/ +│ │ └── api/routes/ +│ ├── src/ +│ │ ├── controller/ +│ │ ├── swarms/ +│ │ ├── gates/ +│ │ ├── memory/ +│ │ └── tools/ +│ └── tests/ +├── shared/ +│ └── contracts/ # Shared integration boundary +├── data/ # Sample/source data +│ ├── confluence/ +│ ├── runbooks/ +│ ├── incidents/ +│ ├── github/ +│ └── slack/ +├── infra/ +│ └── docker-compose.yml +├── scripts/ +└── docs/ + ├── UniOps PRD.md + └── ways-of-working/ +``` -# The Final README Template +## Quick Start -## Problem Statement / Idea +### Option A: Docker Compose (recommended) -Clearly describe the problem you are solving. +From the repository root: -- What is the problem? -- Why is it important? -- Who are the target users? +```bash +make up +``` ---- +- Frontend: http://localhost:3000 +- Backend health: http://localhost:8000/health +- Milvus: localhost:19530 -## Proposed Solution +### Option B: Run services separately -Explain your approach: +Frontend: -- What are you building? -- How does it solve the problem? -- What makes your solution unique? +```bash +cd frontend +npm install +npm run dev +``` ---- +Backend: -## Features +```bash +cd backend +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +uvicorn app.main:app --reload --port 8000 +``` -List the core features of your project: +## Current MVP Scaffold -- Feature 1 -- Feature 2 -- Feature 3 +- FastAPI app with `/health` and `POST /api/chat` +- Next.js app shell +- Shared chat contract: `shared/contracts/chat.contract.json` +- Sample data folders for Confluence, runbooks, incidents, GitHub, and Slack ---- +## Vector DB (Milvus) -## Tech Stack +Retrieval behavior is controlled by `RETRIEVAL_MODE`: -Mention all technologies used: +- `keyword`: keyword-only retrieval (no vector indexing) +- `semantic`: Milvus semantic retrieval (falls back to keyword when unavailable) +- `hybrid`: semantic-first with keyword backfill -- Frontend: -- Backend: -- Database: -- APIs / Services: -- Tools / Libraries: +Example `.env` values: ---- +```bash +RETRIEVAL_MODE=hybrid +MILVUS_HOST=localhost +MILVUS_PORT=19530 +MILVUS_COLLECTION_NAME=uniops_documents +EMBEDDING_PROVIDER=deterministic +EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 +``` -## Project Setup Instructions +Vector endpoints: -Provide clear steps to run your project: +- `GET /api/vector/status` +- `POST /api/vector/rebuild` -```bash -# Clone the repository -git clone +## Working Agreements (parallel build) -# Install dependencies -... +- Frontend work stays in `frontend/**` +- Backend work stays in `backend/**` +- Shared areas (`shared/**`, `infra/**`, docs) should be changed with extra care to avoid conflicts -# Run the project -... 
-``` +See: +- `docs/ways-of-working/OWNERSHIP.md` +- `docs/ways-of-working/BRANCHING.md` +- `docs/ways-of-working/INTEGRATION_RULES.md` +- `docs/ways-of-working/TASK_SPLIT_24H.md` diff --git a/TO-DO.md b/TO-DO.md new file mode 100644 index 000000000..88add3500 --- /dev/null +++ b/TO-DO.md @@ -0,0 +1,157 @@ +# UniOps Implementation Tracker + +## Branch Lane Map +- Engineer 1 core branch: `feature/backend-orchestration-and-skills` (base branch) +- Engineer 2 systems branch: `feat/backend-systems-queue-flow` +- Shared contract branch: `chore/shared-actions-contract` +- Merged shared branches: `chore/shared-chat-contract-iris-incident-input`, `chore/shared-chat-contract-dedup-metadata` +- Merged backend branches: `feat/backend-systems-iris-report-query-pass-through`, `feat/backend-dedup-api-metadata`, `feat/backend-reasoning-source-priority-hints`, `feat/backend-dedup-determinism-tests` +- Merged integration branch: `feature/backend-final-demo-integration` -> `main` +- Open PR count: 0 + +## Engineer 1 Full Plan (Target 10 points) +- [x] P0 (3): Controller pipeline with swarm chaining. +- [x] P0 (3): Retrieval + reasoning output schema and citation handoff. +- [x] P1 (2): Permission policy rules for HITL decisions. +- [x] P1 (2): Memory summary and Kairos-lite dedup pass API. +- Current completion: 10/10 points. +- Remaining completion: 0/10 points. + +## Backlog +- [x] Engineer 1: Add `run_dedup_pass()` in `backend/src/memory/three_tier_memory.py` for transcript/doc dedup. +- [x] Engineer 1: Expose dedup summary metadata in memory summary API shape. +- [x] Engineer 1: Add focused tests for dedup behavior and deterministic idempotency (`backend/tests/test_memory_dedup.py`). +- [x] Engineer 1: Tune reasoning quality hints for source prioritization (non-breaking). + +## In Progress +- [x] Core MVP golden flow implementation: IRIS + Confluence end-to-end integration on `main` baseline. +- [x] HITL completion path: pending approval -> approve/reject -> plan_approved/plan_rejected audit trace. +- [x] SSE reliability hardening: reconnect-safe stream behavior, heartbeats, and idle timeout termination. +- [x] Ingestion error envelope hardening: consistent endpoint and per-item adapter error metadata. +- [x] Transcript readiness hardening: atomic transcript writes + wait-based transcript reads. + +## Done +- [x] PR #5 merged to `main` (`feat: Update backend and frontend for LLM integration and SSE endpoints`). +- [x] PR #6 merged to `main` (`feat: expand ingestion across GitHub/Jira/Slack with live-demo wiring`). +- [x] Post-merge backend regression fixed: restored `ToolExecutor` runtime and added approval executor compatibility fallback for older test doubles. +- [x] Live validation expanded to active GitHub issue + Groq-backed chat stream + approval + transcript verification. +- [x] Live benchmark assets added for repeatable endpoint checks (`scripts/live_benchmark.py`, `scripts/benchmark_ingestion.py`). +- [x] Established branch split strategy and pushed baseline/core changes to `main`. +- [x] Pushed backend orchestration and skill assets to feature branch. +- [x] Slice 1: Contract updates for stream/transcript endpoints. +- [x] Slice 1: Backend live SSE trace endpoint (`POST /api/chat`). +- [x] Slice 1: Backend transcript read endpoint (`GET /api/chat/transcript/{trace_id}`). +- [x] Slice 1: Frontend hooks for chat and trace streaming. +- [x] Slice 1: Frontend page integration for answer + live trace. +- [x] Slice 1: Backend tests for stream and transcript behavior. 
+- [x] Systems-slice changes moved off this branch to `feat/backend-systems-queue-flow`. +- [x] Shared contract actions changes moved to `chore/shared-actions-contract`. +- [x] IRIS dual-input contract added (message + incident_report with precedence rule). +- [x] IRIS runtime path added in backend chat route with validation and canonical incident context mapping. +- [x] Kairos dedup pass implemented for documents/transcripts. +- [x] Dedup summary metadata exposed in chat response and transcript payload. +- [x] Reasoning quality tuning shipped (source reranking, dedup-aware confidence, tuned action selection). +- [x] Dedicated deterministic/idempotency dedup tests added. +- [x] Active feature branches merged into `feature/backend-final-demo-integration` and verified end-to-end. +- [x] PR #1 merged to `main` (`feat: add backend orchestration core and local skill assets`). +- [x] PR #4 retargeted to `main` and merged (`merge: final integrated backend feature set + PRD demo verification`). +- [x] PR #2 and PR #3 closed as superseded by merged integration work. +- [x] Frontend on `main` restored to main-baseline implementation after merge sequencing. +- [x] API-only Confluence batch ingestion shipped (`POST /api/ingest/confluence` with body `page_ids`) with partial-failure reporting. +- [x] Automated API-only E2E flow test added (`backend/tests/test_e2e_ingest_chat_approve.py`) for ingest -> chat -> stream -> approve -> transcript lifecycle. +- [x] Manual E2E verification script added (`scripts/e2e_confluence_flow.sh`) for live backend runs. +- [x] Phase 2 frontend demo wiring shipped (`frontend/app/page.tsx`, `frontend/lib/chat-api.ts`): ingestion controls, chat submit, SSE trace rendering, approval actions, transcript refresh. +- [x] Credential setup completed for live connectors: `.env` now has working Confluence and IRIS integration keys (IRIS key sourced from local `iris-web` DB admin user record). + +## Risks +- SSE consumers can see parse errors if event payload shape changes unexpectedly. +- Browser CORS can block frontend-to-backend calls if origin config is too strict. +- Transcript read-after-write race is reduced by atomic writes and wait-based reads; clients using zero wait timeout may still observe eventual consistency windows. +- Local Python 3.14 environments may fail to build backend dependencies (`pydantic-core`/PyO3) without compatibility handling. + +## Decisions +- Tight Slice 1 first: TO-DO + SSE trace + transcript read + frontend live trace. +- Queue work is isolated to systems branch (`feat/backend-systems-queue-flow`). +- Shared actions contract is isolated to `chore/shared-actions-contract`. +- Keep `POST /api/chat` backward compatible while extending payload shape additively. +- Incident input model supports both free-text and IRIS incident reports; `incident_report` takes precedence when present. +- Dedup metadata is additive and non-breaking in both chat response and transcript payload. + +## Current Implementation Approach (Active) +- Core MVP feature selected from PRD: single golden flow for "Explain Redis latency incident" using Confluence runbook context + IRIS incident context + live trace + human approval before external action. +- Integration targets: Atlassian Confluence Cloud and ServiceNow-style IRIS APIs (real connectors, not local-only stubs). +- Auth model: environment secret based credentials for external API access. +- Delivery bar: one complete golden flow end-to-end (not all four PRD flows in this sprint slice). 
+- Base branch strategy: implement from `main` state to avoid drift from older feature branch snapshots. +- Branch lane execution plan: + - `chore/shared-*`: contract/schema updates first. + - `feat/backend-systems-*`: ingestion routes, approval route, external API adapters. + - `feat/backend-core-*`: controller/retrieval/execution approval state transitions and trace/audit persistence. + - `feat/frontend-*`: chat + trace + incident input + approval modal wiring. +- Backend build scope for this slice: + - Extend chat request to support dual input (`message` + `incident_report`) with precedence validation. + - Add `/api/ingest/confluence` and `/api/ingest/iris` for source sync. + - Add approval decision endpoint for trace-bound actions (`approve` / `reject`). + - Replace pending-only execution stop with full decision transition and recorded execution outcome. +- Frontend build scope for this slice: + - Replace static shell with functional chat workflow. + - Show live SSE trace steps with cited sources. + - Add incident input path and approval modal submission UX. + - Display post-approval action result and updated trace. +- Verification gates for completion: + - Boundary checks per branch lane (`scripts/check-boundaries.sh`). + - Backend tests for dual-input chat, ingestion, approval transitions, and stream/transcript regression. + - Frontend build + runtime smoke. + - Golden flow API sequence: ingest -> chat -> stream -> approve/reject -> transcript confirms final state. + +## Verification Log +- 2026-04-17: Slice 4.1 SSE reliability hardening completed (`backend/app/api/routes/chat.py`) with new stream tests (`backend/tests/test_chat_stream.py`): focused stream suite `6 passed`, broader chat suite `14 passed`. +- 2026-04-17: Slice 4.2 unified ingestion error envelope completed (`backend/app/api/routes/ingestion.py`) and validated (`backend/tests/test_ingestion.py`: `15 passed`). +- 2026-04-17: Slice 4.3 transcript readiness/race hardening completed (`backend/src/memory/three_tier_memory.py`, transcript wait handling in chat/approvals routes) and validated (`backend/tests/test_chat_stream.py tests/test_approvals.py tests/test_memory_dedup.py`: `13 passed`). +- 2026-04-17: Combined reliability verification completed (`backend/tests/test_chat_orchestration.py tests/test_chat_iris_input.py tests/test_chat_stream.py tests/test_approvals.py tests/test_ingestion.py tests/test_memory_dedup.py`: `36 passed`). +- 2026-04-17: PR #5 and PR #6 confirmed merged; open PR count is now 0. +- 2026-04-17: Post-merge validation run completed (`backend/.venv/bin/python -m pytest -q`: `52 passed`; frontend `npm run lint && npm run build`: passed). +- 2026-04-17: Live ingestion benchmark run with active GitHub issue succeeded for Confluence, GitHub, Jira, and Slack channel endpoints (HTTP 200 with ingested records). +- 2026-04-17: Live chat->approval flow succeeded with Groq enabled (`trace_started -> trace_step x3 -> trace_complete`, approval final status `plan_approved`, transcript final status `plan_approved`). +- 2026-04-17: Slack thread endpoint initially failed for configured `.env` thread timestamp (`thread_not_found`); validated a working thread timestamp (`1776343995.110039`) for channel `C0AT618FUPM`, and thread ingestion succeeded with this value. +- 2026-04-17: `JIRA_ISSUE_KEY` remains unset in `.env`; live runs used fallback key `KAN-49` for Jira validation. +- 2026-04-16: Started Slice 1 implementation. +- 2026-04-16: Backend tests passed (`pytest -q`): 6 passed. 
+- 2026-04-16: Frontend production build passed (`npm run build`). +- 2026-04-16: Slice 1 marked complete; queue work remains in Backlog for Slice 2. +- 2026-04-16: Stream test suite cleaned and rerun (`pytest -q`): 7 passed. +- 2026-04-16: Branch split completed; systems and shared changes removed from Engineer 1 branch. +- 2026-04-16: IRIS runtime validation passed on local backend (`GET /health` 200, `POST /api/chat` message-only 200, incident_report-only 200, dual-input 200, invalid payload 422, transcript fetch 200 with 3 steps, stream fetch returned 3 data events). +- 2026-04-16: IRIS automated validation passed (`python -m pytest -q tests/test_chat_iris_input.py`: 4 passed; `python -m pytest -q`: 11 passed; `BASE_REF=origin/feature/backend-orchestration-and-skills bash scripts/check-boundaries.sh`: passed). +- 2026-04-16: PR evidence recorded: #2 `chore(shared): add IRIS incident_report dual-input chat contract` (https://github.com/chiraghontec/hacktofuture4-D07/pull/2) and #3 `feat(backend): support IRIS incident report as chat input context` (https://github.com/chiraghontec/hacktofuture4-D07/pull/3). +- 2026-04-16: Dedup pass + metadata implementation completed and pushed (`feat/backend-dedup-api-metadata`, commit `399c8b3`; `python -m pytest -q`: 11 passed; runtime payload shows `dedup_summary` in both `POST /api/chat` and `GET /api/chat/transcript/{trace_id}`). +- 2026-04-16: Dedup metadata contract update completed and pushed (`chore/shared-chat-contract-dedup-metadata`, commit `17d2f33`; JSON validation via `jq empty shared/contracts/chat.contract.json`). +- 2026-04-16: Reasoning quality tuning completed and pushed (`feat/backend-reasoning-source-priority-hints`, commit `ea8d5d4`; `python -m pytest -q`: 14 passed; `BASE_REF=origin/feature/backend-orchestration-and-skills bash scripts/check-boundaries.sh`: passed). +- 2026-04-16: Dedicated deterministic/idempotency dedup tests completed and pushed (`feat/backend-dedup-determinism-tests`, commit `b944e52`; `python -m pytest -q tests/test_memory_dedup.py`: 2 passed; full suite `python -m pytest -q`: 13 passed). +- 2026-04-16: Active feature branches merged into integration branch `feature/backend-final-demo-integration`; full backend suite passed (`python -m pytest -q`: 16 passed) and frontend build passed (`npm run build`). +- 2026-04-16: Final PRD-aligned browser demo flows validated on merged integration branch (4/4 flows passed with rendered answers, expected approval behavior, and trace steps `retrieval -> reasoning -> execution`; transcript endpoint returned `dedup_summary`; SSE stream emitted 3 events per flow). +- 2026-04-16: Integration PR opened: #4 `merge: final integrated backend feature set + PRD demo verification` (https://github.com/chiraghontec/hacktofuture4-D07/pull/4). +- 2026-04-16: PR #1 merged to `main` (merge commit `8884e63`; https://github.com/chiraghontec/hacktofuture4-D07/pull/1). +- 2026-04-16: PR #4 retargeted to `main` and merged (merge commit `326a4e5`; https://github.com/chiraghontec/hacktofuture4-D07/pull/4). +- 2026-04-16: PR #2 and PR #3 closed as superseded after integration merge (https://github.com/chiraghontec/hacktofuture4-D07/pull/2, https://github.com/chiraghontec/hacktofuture4-D07/pull/3). +- 2026-04-16: Frontend baseline correction applied on `main` to keep main-branch frontend implementation (`fix(frontend): keep main frontend baseline`, commit `89d25bc`); frontend production build passed (`npm run build`). 
+- 2026-04-16: Active implementation approach logged for next build slice: contract-first IRIS + Confluence golden flow with HITL approval completion path and branch-lane execution plan. +- 2026-04-16: Phase 1 implementation started for DFIR-IRIS local setup. Added `make iris-install|iris-up|iris-down|iris-logs|iris-admin-password`, installer script (`scripts/iris/install_iris_web.sh`), and local runbook (`docs/ways-of-working/LOCAL_DFIR_IRIS_SETUP_MACOS.md`). +- 2026-04-16: Official `dfir-iris/iris-web` installed locally to `.vendor/iris-web` at tag `v2.4.27`; generated local `.env` with randomized DB/admin/secret values and `SERVER_NAME=localhost`. +- 2026-04-16: Runtime start attempt blocked in this environment because Docker CLI is unavailable (`docker: command not found`). Added `require-docker` precheck in `Makefile` for clear operator feedback. +- 2026-04-16: Phase 2 ingestion implementation started: added IRIS and Confluence adapter clients (`backend/src/adapters/*.py`), ingestion API routes (`POST /api/ingest/iris`, `POST /api/ingest/confluence`), runtime document ingestion in memory, and `backend/tests/test_ingestion.py`. +- 2026-04-16: Backend dependency installation for tests is blocked on Python 3.14 compatibility (`pydantic-core/jiter` build failure). Validation requires Python 3.12 environment. +- 2026-04-16: Phase 3 approval workflow implementation started: added approval endpoint (`POST /api/approvals/{trace_id}`), planner-only tool executor, transcript approval audit persistence, and router registration. +- 2026-04-16: Shared contract updated for ingestion and approval endpoints plus transcript approval metadata (`shared/contracts/chat.contract.json`); JSON validation passed (`python3 -m json.tool`). +- 2026-04-16: Syntax validation passed for approval workflow files (`python3 -m py_compile ...`). Automated tests remain blocked in current `.venv` because `pytest` and compatible dependencies are unavailable with Python 3.14 pin set. +- 2026-04-16: Confluence ingestion contract upgraded to batch request shape (`POST /api/ingest/confluence` body `{ "page_ids": [...] }`) and per-page result reporting in `shared/contracts/chat.contract.json`; JSON validation passed (`python3 -m json.tool shared/contracts/chat.contract.json`). +- 2026-04-16: Backend batch ingestion implementation + regression fixes completed (`backend/app/api/routes/ingestion.py`, `backend/app/api/routes/chat.py` SSE streaming via `StreamingResponse`), focused tests passed (`10 passed`) and full backend suite passed (`22 passed`) using `backend/.venv`. +- 2026-04-16: API-only golden flow validation assets completed: `backend/tests/test_e2e_ingest_chat_approve.py` and executable script `scripts/e2e_confluence_flow.sh`. +- 2026-04-16: Demo run executed now: backend started successfully from `backend/.venv` (`uvicorn app.main:app --host 0.0.0.0 --port 8000`), automated E2E demo passed (`.venv/bin/python -m pytest -q tests/test_e2e_ingest_chat_approve.py`: `1 passed`). +- 2026-04-16: Live scripted demo run executed (`CONFLUENCE_PAGE_IDS=12345,67890 ./scripts/e2e_confluence_flow.sh`): full trace/approval path completed (`final_status=plan_approved`, 3 SSE events), while Confluence fetch step returned `CONFLUENCE_BASE_URL is not configured` (runtime env configuration pending). 
+- 2026-04-16: Backend startup updated to auto-load root `.env` (`backend/app/main.py` using `python-dotenv` when available); regression tests passed (`.venv/bin/python -m pytest -q tests/test_ingestion.py tests/test_chat_stream.py tests/test_approvals.py`: `9 passed`). +- 2026-04-16: Phase 2 frontend verification passed (`npm run lint`, `npm run build`) and browser demo validated at `http://localhost:3000`: chat produced trace, SSE rendered retrieval/reasoning/execution, approval action produced `plan_approved` final status and transcript card updated. +- 2026-04-16: Live Confluence connector now reads env values, but sample IDs `12345,67890` returned upstream 404 from Confluence API; requires valid page IDs for successful live ingest evidence. +- 2026-04-16: Confluence credential validity confirmed by page discovery probe (`GET /rest/api/content?limit=5` 200); validated page IDs captured (`65868`, `65898`) and scripted demo run passed with `ingested_count=2`. +- 2026-04-16: IRIS auth issue resolved from 401 to valid auth by replacing placeholder password with actual admin API key token (retrieved from local `iris_db` user record); IRIS list endpoint probe returned case `1` (`#1 - Initial Demo`) and ingestion succeeded (`POST /api/ingest/iris?case_id=1` => 200). +- 2026-04-16: Frontend live-demo verification completed at `http://localhost:3000` with validated defaults (`Confluence page IDs: 65868,65898`, `IRIS case ID: 1`): Confluence ingest status `2 ok / 0 failed`, IRIS ingest status `Case 1`, approval flow completed, transcript final status `plan_approved`. diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 000000000..8362feb4b --- /dev/null +++ b/backend/README.md @@ -0,0 +1,50 @@ +# Backend (Engineer B) + +## Scope +- FastAPI API and SSE endpoints +- Controller Kernel and swarm orchestration +- Native Permission Gate (HITL) +- Memory layer and audit trail + +## Start +```bash +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +uvicorn app.main:app --reload --port 8000 +``` + +## Milvus Vector DB Setup + +1. Start Milvus from the repository root: + +```bash +cd infra +docker compose up -d milvus +``` + +2. Configure retrieval mode and vector settings in `.env`: + +```bash +RETRIEVAL_MODE=hybrid +MILVUS_HOST=localhost +MILVUS_PORT=19530 +MILVUS_COLLECTION_NAME=uniops_documents +EMBEDDING_PROVIDER=deterministic +``` + +3. Verify and rebuild vector index: + +```bash +curl http://127.0.0.1:8000/api/vector/status +curl -X POST http://127.0.0.1:8000/api/vector/rebuild +``` + +## LLM Module + +Reasoning and execution assessment providers are selected via `LLM_PROVIDER`: + +- `groq` (requires `GROQ_API_KEY`) +- `apfel` (requires `APFEL_BASE_URL` and `APFEL_API_KEY`) + +Retrieval query expansion, reasoning synthesis, and execution assessment are all wired through the shared LLM client module in `backend/src/adapters/llm_client.py`. 
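+
+A minimal `.env` sketch for provider selection (variable names as listed above; all values are placeholders):
+
+```bash
+LLM_PROVIDER=groq
+GROQ_API_KEY=your_groq_key_here
+
+# Or, to use the apfel provider instead:
+# LLM_PROVIDER=apfel
+# APFEL_BASE_URL=https://apfel.example.internal
+# APFEL_API_KEY=your_apfel_key_here
+```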
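+
+## Chat Input Examples
+
+`POST /api/chat` accepts a plain `message`, an IRIS `incident_report`, or both; when both are present the incident report takes precedence and is rendered into the retrieval query, and a request with neither is rejected with a validation error. The sketch below is illustrative, with placeholder values; the required incident fields follow the `IncidentReport` model in `app/api/routes/chat.py`:
+
+```bash
+curl -N -s -X POST http://127.0.0.1:8000/api/chat \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "session_id": "demo",
+    "incident_report": {
+      "case_name": "Suspicious login burst",
+      "short_description": "Multiple failed logins followed by a success",
+      "severity": "high",
+      "tags": ["auth", "brute-force"],
+      "iocs": [],
+      "timeline": []
+    }
+  }'
+```
+
+The response is a Server-Sent Events stream (`-N` disables curl buffering): the first frame carries a `retry: 3000` reconnect hint, `trace_step` frames carry agent observations and cited sources, and the stream ends with a terminal `trace_complete` or `trace_error` event.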
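+
+## Ingestion Endpoints
+
+Batch ingestion routes are defined in `app/api/routes/ingestion.py`; the request shapes below follow the Pydantic models in that module, and all IDs are placeholders:
+
+```bash
+# Confluence pages:   POST /api/ingest/confluence     {"page_ids": ["..."]}
+# GitHub issues:      POST /api/ingest/github         {"issue_refs": [{"repository": "owner/repo", "issue_number": 1}]}
+# Jira issues:        POST /api/ingest/jira           {"issue_keys": ["KAN-49"]}
+# Grafana dashboards: POST /api/ingest/grafana        {"dashboards": [{"public_dashboard_url": "https://..."}]}
+# Slack channels:     POST /api/ingest/slack/channels {"channels": [{"channel_id": "C...", "limit": 20}]}
+# Slack threads:      POST /api/ingest/slack/threads  {"threads": [{"channel_id": "C...", "thread_ts": "1700000000.000100"}]}
+# IRIS case:          POST /api/ingest/iris?case_id=1 (case id is a query parameter)
+
+# Example: ingest one Slack thread.
+curl -s -X POST http://127.0.0.1:8000/api/ingest/slack/threads \
+  -H 'Content-Type: application/json' \
+  -d '{"threads": [{"channel_id": "C0AT618FUPM", "thread_ts": "1776343995.110039"}]}'
+```
+
+Each batch route responds with `ingested_count`, `failed_count`, and per-item `results`; a failed item carries a structured `error_detail` envelope and does not abort the rest of the batch.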
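+
+## Golden Flow (API-Only Sketch)
+
+An illustrative end-to-end sequence against a locally running backend, mirroring the scripted demo in `scripts/e2e_confluence_flow.sh`: ingest, chat, approve, then confirm the transcript. Endpoint shapes follow the routes in this change; page IDs, trace id, and approver id are placeholders:
+
+```bash
+# 1) Ingest Confluence pages as a batch.
+curl -s -X POST http://127.0.0.1:8000/api/ingest/confluence \
+  -H 'Content-Type: application/json' \
+  -d '{"page_ids": ["65868", "65898"]}'
+
+# 2) Ask a question and capture the SSE stream; every frame's data payload carries the trace_id.
+curl -N -s -X POST http://127.0.0.1:8000/api/chat \
+  -H 'Content-Type: application/json' \
+  -d '{"message": "What does the demo runbook say about failed logins?", "session_id": "demo"}' \
+  | tee /tmp/chat_stream.log
+
+# 3) Approve (or reject) the suggested action for that trace.
+TRACE_ID="trace-..."   # placeholder; read the real value from /tmp/chat_stream.log
+curl -s -X POST "http://127.0.0.1:8000/api/approvals/$TRACE_ID" \
+  -H 'Content-Type: application/json' \
+  -d '{"decision": "approve", "approver_id": "engineer-1", "comment": "looks safe"}'
+
+# 4) Confirm the persisted final state; wait_timeout_seconds tolerates a short write delay.
+curl -s "http://127.0.0.1:8000/api/chat/transcript/$TRACE_ID?wait_timeout_seconds=2"
+```
+
+A `reject` decision records `final_status=plan_rejected` without performing any external writes; `approve` runs the planner-only executor and records `plan_approved`.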
diff --git a/backend/app/api/routes/approvals.py b/backend/app/api/routes/approvals.py new file mode 100644 index 000000000..2d1f126e0 --- /dev/null +++ b/backend/app/api/routes/approvals.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from datetime import UTC, datetime +from typing import Literal + +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel + +from app.api.routes.chat import memory +from src.tools.executor import PlanningToolExecutor + +router = APIRouter() +executor = PlanningToolExecutor() + + +class ApprovalDecisionRequest(BaseModel): + decision: Literal["approve", "reject"] + approver_id: str + comment: str | None = None + + +class ApprovalDecisionResponse(BaseModel): + trace_id: str + final_status: str + execution_mode: str + approval: dict + execution_result: dict + + +@router.post("/approvals/{trace_id}", response_model=ApprovalDecisionResponse) +def submit_approval(trace_id: str, payload: ApprovalDecisionRequest) -> ApprovalDecisionResponse: + transcript = memory.wait_for_transcript(trace_id, timeout_seconds=0.75) + if transcript is None: + raise HTTPException(status_code=404, detail=f"trace {trace_id} not found") + + suggested_action = str(transcript.get("suggested_action", "")).strip() + action_details = transcript.get("action_details") if isinstance(transcript.get("action_details"), dict) else None + if not suggested_action and not action_details: + raise HTTPException(status_code=409, detail="trace does not contain a suggested action") + + if not suggested_action and action_details is not None: + suggested_action = str(action_details.get("intent") or "execute approved action") + + approval = { + "decision": payload.decision, + "approver_id": payload.approver_id, + "comment": payload.comment or "", + "timestamp": datetime.now(UTC).isoformat(), + } + + if payload.decision == "reject": + execution_result = { + "tool": "planner.external_action_plan", + "status": "plan_rejected", + "output": "Execution plan was rejected by approver. 
No external write operations were performed.", + "timestamp": datetime.now(UTC).isoformat(), + "execution_mode": "planner_only", + "no_write_policy": True, + } + final_status = "plan_rejected" + else: + try: + execution_result = executor.execute(suggested_action, action_details=action_details) + except TypeError: + execution_result = executor.execute(suggested_action) + final_status = "plan_approved" + + execution_mode = str(execution_result.get("execution_mode") or "planner_only") + + memory.persist_approval_decision( + trace_id=trace_id, + approval=approval, + execution_result=execution_result, + final_status=final_status, + execution_mode=execution_mode, + ) + + return ApprovalDecisionResponse( + trace_id=trace_id, + final_status=final_status, + execution_mode=execution_mode, + approval=approval, + execution_result=execution_result, + ) diff --git a/backend/app/api/routes/chat.py b/backend/app/api/routes/chat.py new file mode 100644 index 000000000..cf8206d63 --- /dev/null +++ b/backend/app/api/routes/chat.py @@ -0,0 +1,316 @@ +from __future__ import annotations + +import asyncio +import json +import queue +import threading +import time +from typing import Any + +from fastapi import APIRouter, HTTPException, Query, Request +from fastapi.responses import StreamingResponse +from pydantic import BaseModel, ConfigDict, model_validator + +from src.controller.controller import ControllerKernel +from src.memory.three_tier_memory import ThreeTierMemory + +router = APIRouter() +kernel = ControllerKernel() +memory = ThreeTierMemory() +STREAM_RETRY_MS = 3000 +STREAM_HEARTBEAT_SECONDS = 2.5 +STREAM_IDLE_TIMEOUT_SECONDS = 20.0 +STREAM_QUEUE_MAXSIZE = 256 + + +class IncidentReport(BaseModel): + model_config = ConfigDict(extra="allow") + + source_system: str = "iris" + case_id: str | None = None + report_id: str | None = None + report_url: str | None = None + ingested_at: str | None = None + case_name: str + short_description: str + severity: str + tags: list[str] + iocs: list[Any] + timeline: list[Any] + + +class ChatRequest(BaseModel): + message: str | None = None + session_id: str + incident_report: IncidentReport | None = None + + @model_validator(mode="after") + def validate_payload(self) -> "ChatRequest": + has_message = bool(self.message and self.message.strip()) + has_incident_report = self.incident_report is not None + if not has_message and not has_incident_report: + raise ValueError("Either message or incident_report must be provided.") + return self + + +def _stable_json(items: list[Any]) -> str: + return json.dumps(items, sort_keys=True, separators=(",", ":"), ensure_ascii=False) + + +def incident_report_to_query(report: IncidentReport) -> str: + lines = [ + "IRIS Incident Report", + f"Source System: {report.source_system}", + f"Case Name: {report.case_name}", + f"Short Description: {report.short_description}", + f"Severity: {report.severity}", + f"Tags: {', '.join(report.tags) if report.tags else 'none'}", + f"IOCs: {_stable_json(report.iocs)}", + f"Timeline: {_stable_json(report.timeline)}", + ] + + if report.case_id: + lines.append(f"Case ID: {report.case_id}") + if report.report_id: + lines.append(f"Report ID: {report.report_id}") + if report.report_url: + lines.append(f"Report URL: {report.report_url}") + if report.ingested_at: + lines.append(f"Ingested At: {report.ingested_at}") + + return "\n".join(lines) + + +def _format_sse(event: str, payload: dict[str, Any], event_id: str, retry_ms: int | None = None) -> str: + lines: list[str] = [] + if retry_ms is not None: + 
lines.append(f"retry: {retry_ms}") + lines.append(f"id: {event_id}") + lines.append(f"event: {event}") + serialized = json.dumps(payload, ensure_ascii=False) + for line in serialized.splitlines() or [serialized]: + lines.append(f"data: {line}") + return "\n".join(lines) + "\n\n" + + +def _build_stream_payload(raw_event: dict[str, Any], sequence: int, trace_id: str) -> tuple[str, dict[str, Any]]: + event_type = str(raw_event.get("event_type", "trace_step")) + timestamp = str(raw_event.get("timestamp") or "") + + if event_type == "trace_step": + step = raw_event.get("step") if isinstance(raw_event.get("step"), dict) else {} + payload = { + "event_type": event_type, + "event_id": f"{trace_id}:{sequence}", + "trace_id": trace_id, + "sequence": sequence, + "timestamp": step.get("timestamp") or timestamp, + "status": str(raw_event.get("status", "in_progress")), + "step": step.get("step", ""), + "agent": step.get("agent", ""), + "observation": step.get("observation", ""), + "sources": step.get("sources", []), + "metadata": step.get("metadata", {}), + } + return event_type, payload + + payload: dict[str, Any] = { + "event_type": event_type, + "event_id": f"{trace_id}:{sequence}", + "trace_id": trace_id, + "sequence": sequence, + "timestamp": timestamp, + "status": str(raw_event.get("status", "in_progress")), + "metadata": raw_event.get("metadata", {}), + } + + if event_type == "trace_complete": + payload.update( + { + "answer": raw_event.get("answer", ""), + "needs_approval": bool(raw_event.get("needs_approval", False)), + "suggested_action": raw_event.get("suggested_action"), + } + ) + if event_type == "trace_error": + payload.update( + { + "error_code": raw_event.get("error_code", "runtime_error"), + "error": raw_event.get("error", "unknown error"), + } + ) + + return event_type, payload + + +@router.post("/chat") +async def chat(payload: ChatRequest, request: Request) -> StreamingResponse: + query = payload.message.strip() if payload.message else "" + if payload.incident_report is not None: + query = incident_report_to_query(payload.incident_report) + + event_queue: queue.Queue[dict[str, Any] | None] = queue.Queue(maxsize=STREAM_QUEUE_MAXSIZE) + stop_event = threading.Event() + + def _queue_put(item: dict[str, Any] | None) -> bool: + while not stop_event.is_set(): + try: + event_queue.put(item, timeout=0.1) + return True + except queue.Full: + continue + return False + + def _worker() -> None: + try: + for stream_event in kernel.stream_query_events(query=query, session_id=payload.session_id): + if stop_event.is_set(): + break + if not _queue_put(stream_event): + break + except Exception as exc: + _queue_put( + { + "event_type": "trace_error", + "trace_id": "trace-unknown", + "status": "failed", + "error_code": "stream_worker_error", + "error": f"Stream worker failed: {exc}", + } + ) + finally: + _queue_put(None) + + threading.Thread(target=_worker, daemon=True).start() + + async def event_generator(): + sequence = 0 + active_trace_id: str | None = None + terminal_seen = False + worker_done = False + last_progress_at = time.monotonic() + + while True: + if await request.is_disconnected(): + stop_event.set() + break + + if worker_done and event_queue.empty(): + break + + try: + queue_item = await asyncio.to_thread(event_queue.get, True, STREAM_HEARTBEAT_SECONDS) + except queue.Empty: + if terminal_seen: + break + + idle_duration = time.monotonic() - last_progress_at + if idle_duration >= STREAM_IDLE_TIMEOUT_SECONDS: + sequence += 1 + timeout_trace = active_trace_id or "trace-pending" + 
timeout_payload = { + "event_type": "trace_error", + "event_id": f"{timeout_trace}:{sequence}", + "trace_id": timeout_trace, + "sequence": sequence, + "timestamp": "", + "status": "failed", + "metadata": { + "idle_timeout_seconds": STREAM_IDLE_TIMEOUT_SECONDS, + "idle_duration_seconds": round(idle_duration, 3), + }, + "error_code": "stream_timeout", + "error": "SSE stream timed out waiting for controller events.", + } + stop_event.set() + yield _format_sse( + event="trace_error", + payload=timeout_payload, + event_id=timeout_payload["event_id"], + retry_ms=STREAM_RETRY_MS if sequence == 1 else None, + ) + break + + sequence += 1 + heartbeat_trace = active_trace_id or "trace-pending" + heartbeat_payload = { + "event_type": "trace_heartbeat", + "event_id": f"{heartbeat_trace}:{sequence}", + "trace_id": heartbeat_trace, + "sequence": sequence, + "timestamp": "", + "status": "in_progress", + "metadata": { + "message": "stream alive", + "idle_duration_seconds": round(idle_duration, 3), + }, + } + yield _format_sse( + event="trace_heartbeat", + payload=heartbeat_payload, + event_id=heartbeat_payload["event_id"], + retry_ms=STREAM_RETRY_MS if sequence == 1 else None, + ) + continue + + if queue_item is None: + worker_done = True + if terminal_seen: + break + continue + + if not isinstance(queue_item, dict): + sequence += 1 + invalid_trace = active_trace_id or "trace-pending" + invalid_payload = { + "event_type": "trace_error", + "event_id": f"{invalid_trace}:{sequence}", + "trace_id": invalid_trace, + "sequence": sequence, + "timestamp": "", + "status": "failed", + "metadata": {}, + "error_code": "invalid_stream_event", + "error": "Controller emitted malformed stream event payload.", + } + stop_event.set() + yield _format_sse( + event="trace_error", + payload=invalid_payload, + event_id=invalid_payload["event_id"], + retry_ms=STREAM_RETRY_MS if sequence == 1 else None, + ) + break + + trace_id = str(queue_item.get("trace_id") or active_trace_id or "trace-pending") + active_trace_id = trace_id + sequence += 1 + event_type, stream_payload = _build_stream_payload(queue_item, sequence=sequence, trace_id=trace_id) + yield _format_sse( + event=event_type, + payload=stream_payload, + event_id=str(stream_payload.get("event_id", f"{trace_id}:{sequence}")), + retry_ms=STREAM_RETRY_MS if sequence == 1 else None, + ) + + if event_type in {"trace_complete", "trace_error"}: + terminal_seen = True + last_progress_at = time.monotonic() + + return StreamingResponse( + event_generator(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) + + +@router.get("/chat/transcript/{trace_id}") +def get_transcript(trace_id: str, wait_timeout_seconds: float = Query(default=0.0, ge=0.0, le=5.0)) -> dict: + transcript = memory.wait_for_transcript(trace_id, timeout_seconds=wait_timeout_seconds) + if transcript is None: + raise HTTPException(status_code=404, detail=f"trace {trace_id} not found") + return transcript diff --git a/backend/app/api/routes/ingestion.py b/backend/app/api/routes/ingestion.py new file mode 100644 index 000000000..4aceedd1f --- /dev/null +++ b/backend/app/api/routes/ingestion.py @@ -0,0 +1,954 @@ +from __future__ import annotations + +import json +from typing import Any, Literal + +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel, Field, field_validator + +from app.api.routes.chat import IncidentReport, kernel +from src.adapters.confluence_client import ConfluenceClient, 
ConfluenceClientError +from src.adapters.grafana_client import GrafanaClient, GrafanaClientError +from src.adapters.github_client import GitHubClient, GitHubClientError +from src.adapters.iris_client import IrisClient, IrisClientError +from src.adapters.jira_client import JiraClient, JiraClientError +from src.adapters.slack_client import SlackClient, SlackClientError +from src.memory.three_tier_memory import MemoryDocument + +router = APIRouter() + + +class IngestIrisResponse(BaseModel): + ingested_count: int + source: str + case_id: str + incident_report: IncidentReport + + +class IngestionErrorEnvelope(BaseModel): + code: str + message: str + source: str + stage: str + retriable: bool + target: str | None = None + + +class CreateIrisIncidentRequest(BaseModel): + case_name: str = Field(min_length=2, max_length=255) + case_description: str = Field(min_length=2, max_length=5000) + severity: str = "medium" + tags: list[str] = Field(default_factory=list) + case_customer: int = 1 + case_soc_id: str = "" + classification_id: int | None = None + case_template_id: str | None = None + custom_attributes: dict[str, object] | None = None + + @field_validator("tags") + @classmethod + def validate_tags(cls, value: list[str]) -> list[str]: + cleaned: list[str] = [] + for tag in value: + normalized = tag.strip() + if normalized and normalized not in cleaned: + cleaned.append(normalized) + return cleaned + + +class CreateIrisIncidentResponse(BaseModel): + source: str + case_id: str + incident_report: IncidentReport + + +class IngestConfluenceRequest(BaseModel): + page_ids: list[str] = Field(min_length=1) + + @field_validator("page_ids") + @classmethod + def validate_page_ids(cls, value: list[str]) -> list[str]: + cleaned: list[str] = [] + for page_id in value: + normalized = page_id.strip() + if normalized and normalized not in cleaned: + cleaned.append(normalized) + + if not cleaned: + raise ValueError("page_ids must contain at least one non-empty page id") + return cleaned + + +class IngestConfluenceResult(BaseModel): + page_id: str + status: Literal["ingested", "failed"] + title: str | None = None + error: str | None = None + error_detail: IngestionErrorEnvelope | None = None + + +class IngestConfluenceResponse(BaseModel): + ingested_count: int + failed_count: int + source: str + results: list[IngestConfluenceResult] + + +class GitHubIssueRef(BaseModel): + repository: str = Field(min_length=3) + issue_number: int = Field(gt=0) + + @field_validator("repository") + @classmethod + def validate_repository(cls, value: str) -> str: + normalized = value.strip() + if "/" not in normalized: + raise ValueError("repository must be in owner/repo format") + return normalized + + +class IngestGitHubRequest(BaseModel): + issue_refs: list[GitHubIssueRef] = Field(min_length=1) + + +class IngestGitHubResult(BaseModel): + repository: str + issue_number: int + status: Literal["ingested", "failed"] + title: str | None = None + url: str | None = None + error: str | None = None + error_detail: IngestionErrorEnvelope | None = None + + +class IngestGitHubResponse(BaseModel): + ingested_count: int + failed_count: int + source: str + results: list[IngestGitHubResult] + + +class GrafanaDashboardRef(BaseModel): + public_dashboard_url: str = Field(min_length=1) + + @field_validator("public_dashboard_url") + @classmethod + def validate_public_dashboard_url(cls, value: str) -> str: + normalized = value.strip() + if not normalized: + raise ValueError("public_dashboard_url must be non-empty") + if not normalized.startswith("http://") 
and not normalized.startswith("https://"): + raise ValueError("public_dashboard_url must be an absolute URL") + return normalized + + +class IngestGrafanaRequest(BaseModel): + dashboards: list[GrafanaDashboardRef] = Field(min_length=1) + + @field_validator("dashboards") + @classmethod + def dedupe_dashboards(cls, value: list[GrafanaDashboardRef]) -> list[GrafanaDashboardRef]: + deduped: list[GrafanaDashboardRef] = [] + seen: set[str] = set() + + for ref in value: + key = ref.public_dashboard_url.strip().lower() + if key in seen: + continue + seen.add(key) + deduped.append(ref) + + if not deduped: + raise ValueError("dashboards must contain at least one valid Grafana dashboard URL") + return deduped + + +class IngestGrafanaResult(BaseModel): + public_dashboard_url: str + status: Literal["ingested", "failed"] + title: str | None = None + panel_count: int | None = None + error: str | None = None + error_detail: IngestionErrorEnvelope | None = None + + +class IngestGrafanaResponse(BaseModel): + ingested_count: int + failed_count: int + source: str + results: list[IngestGrafanaResult] + + +class IngestJiraRequest(BaseModel): + issue_keys: list[str] = Field(min_length=1) + + @field_validator("issue_keys") + @classmethod + def validate_issue_keys(cls, value: list[str]) -> list[str]: + cleaned: list[str] = [] + for issue_key in value: + normalized = issue_key.strip().upper() + if normalized and normalized not in cleaned: + cleaned.append(normalized) + + if not cleaned: + raise ValueError("issue_keys must contain at least one non-empty issue key") + return cleaned + + +class IngestJiraResult(BaseModel): + issue_key: str + status: Literal["ingested", "failed"] + summary: str | None = None + url: str | None = None + error: str | None = None + error_detail: IngestionErrorEnvelope | None = None + + +class IngestJiraResponse(BaseModel): + ingested_count: int + failed_count: int + source: str + results: list[IngestJiraResult] + + +class SlackChannelRef(BaseModel): + channel_id: str = Field(min_length=1) + limit: int = Field(default=20, ge=1, le=200) + + @field_validator("channel_id") + @classmethod + def validate_channel_id(cls, value: str) -> str: + normalized = value.strip() + if not normalized: + raise ValueError("channel_id must be non-empty") + return normalized + + +class IngestSlackChannelsRequest(BaseModel): + channels: list[SlackChannelRef] = Field(min_length=1) + + @field_validator("channels") + @classmethod + def dedupe_channels(cls, value: list[SlackChannelRef]) -> list[SlackChannelRef]: + deduped: list[SlackChannelRef] = [] + seen: set[str] = set() + + for ref in value: + if ref.channel_id in seen: + continue + seen.add(ref.channel_id) + deduped.append(ref) + + if not deduped: + raise ValueError("channels must contain at least one valid channel") + return deduped + + +class IngestSlackChannelResult(BaseModel): + channel_id: str + status: Literal["ingested", "failed"] + message_count: int | None = None + error: str | None = None + error_detail: IngestionErrorEnvelope | None = None + + +class IngestSlackChannelsResponse(BaseModel): + ingested_count: int + failed_count: int + source: str + results: list[IngestSlackChannelResult] + + +class SlackThreadRef(BaseModel): + channel_id: str = Field(min_length=1) + thread_ts: str = Field(min_length=1) + limit: int = Field(default=20, ge=1, le=200) + + @field_validator("channel_id") + @classmethod + def validate_thread_channel_id(cls, value: str) -> str: + normalized = value.strip() + if not normalized: + raise ValueError("channel_id must be non-empty") 
+ return normalized + + @field_validator("thread_ts") + @classmethod + def validate_thread_ts(cls, value: str) -> str: + normalized = value.strip() + if not normalized: + raise ValueError("thread_ts must be non-empty") + return normalized + + +class IngestSlackThreadsRequest(BaseModel): + threads: list[SlackThreadRef] = Field(min_length=1) + + @field_validator("threads") + @classmethod + def dedupe_threads(cls, value: list[SlackThreadRef]) -> list[SlackThreadRef]: + deduped: list[SlackThreadRef] = [] + seen: set[tuple[str, str]] = set() + + for ref in value: + key = (ref.channel_id, ref.thread_ts) + if key in seen: + continue + seen.add(key) + deduped.append(ref) + + if not deduped: + raise ValueError("threads must contain at least one valid thread") + return deduped + + +class IngestSlackThreadResult(BaseModel): + channel_id: str + thread_ts: str + status: Literal["ingested", "failed"] + message_count: int | None = None + error: str | None = None + error_detail: IngestionErrorEnvelope | None = None + + +class IngestSlackThreadsResponse(BaseModel): + ingested_count: int + failed_count: int + source: str + results: list[IngestSlackThreadResult] + + +class VectorIndexStatusResponse(BaseModel): + source: str + status: dict[str, Any] + + +def _build_ingestion_error( + *, + source: str, + stage: str, + message: str, + target: str | None = None, + code: str = "ingestion_adapter_error", + retriable: bool = True, +) -> IngestionErrorEnvelope: + return IngestionErrorEnvelope( + code=code, + message=message, + source=source, + stage=stage, + retriable=retriable, + target=target, + ) + + +def _render_slack_messages(messages: list[dict[str, str]]) -> str: + rendered_lines: list[str] = [] + for message in messages: + ts = message.get("ts", "") + user = message.get("user", "") + text = message.get("text", "") + rendered_lines.append(f"- [{ts}] {user}: {text}") + + return "\n".join(rendered_lines) + + +def _sync_vector_index() -> dict[str, Any] | None: + retrieval_swarm = getattr(kernel, "retrieval_swarm", None) + semantic_service = getattr(retrieval_swarm, "semantic_service", None) + if semantic_service is None or not hasattr(semantic_service, "sync_documents"): + return None + + documents = kernel.memory.load_documents(force_reload=True) + try: + return semantic_service.sync_documents(documents) + except Exception as exc: + return { + "indexed": False, + "reason": f"vector_sync_failed: {exc}", + } + + +@router.post("/ingest/iris", response_model=IngestIrisResponse) +def ingest_iris(case_id: str) -> IngestIrisResponse: + try: + client = IrisClient.from_env() + except IrisClientError as exc: + detail = _build_ingestion_error( + source="iris", + stage="init", + message=str(exc), + target=case_id, + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + try: + case_payload = client.fetch_case(case_id=case_id) + incident_report = IncidentReport(**case_payload) + except IrisClientError as exc: + detail = _build_ingestion_error( + source="iris", + stage="fetch", + message=str(exc), + target=case_id, + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + except ValueError as exc: + detail = _build_ingestion_error( + source="iris", + stage="transform", + message=str(exc), + target=case_id, + code="ingestion_payload_invalid", + retriable=False, + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + doc = MemoryDocument( + title=f"IRIS Case {incident_report.case_id or case_id}", + path=f"runtime/iris/{incident_report.case_id or 
case_id}.json", + source_type="incidents", + content=json.dumps(incident_report.model_dump(mode="json"), indent=2, ensure_ascii=False), + ) + kernel.memory.ingest_runtime_document(doc) + _sync_vector_index() + + return IngestIrisResponse( + ingested_count=1, + source="iris", + case_id=incident_report.case_id or case_id, + incident_report=incident_report, + ) + + +@router.post("/incidents/create", response_model=CreateIrisIncidentResponse) +def create_iris_incident(payload: CreateIrisIncidentRequest) -> CreateIrisIncidentResponse: + try: + client = IrisClient.from_env() + except IrisClientError as exc: + detail = _build_ingestion_error( + source="iris", + stage="init", + message=str(exc), + code="ingestion_adapter_unavailable", + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + try: + case_payload = client.create_incident( + case_name=payload.case_name, + case_description=payload.case_description, + severity=payload.severity, + tags=payload.tags, + case_customer=payload.case_customer, + case_soc_id=payload.case_soc_id, + classification_id=payload.classification_id, + case_template_id=payload.case_template_id, + custom_attributes=payload.custom_attributes, + ) + incident_report = IncidentReport(**case_payload) + except IrisClientError as exc: + detail = _build_ingestion_error( + source="iris", + stage="fetch", + message=str(exc), + code="ingestion_adapter_request_failed", + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + except ValueError as exc: + detail = _build_ingestion_error( + source="iris", + stage="transform", + message=str(exc), + code="ingestion_payload_invalid", + retriable=False, + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + case_id = incident_report.case_id or "new" + doc = MemoryDocument( + title=f"IRIS Case {case_id}", + path=f"runtime/iris/{case_id}.json", + source_type="incidents", + content=json.dumps(incident_report.model_dump(mode="json"), indent=2, ensure_ascii=False), + ) + kernel.memory.ingest_runtime_document(doc) + _sync_vector_index() + + return CreateIrisIncidentResponse( + source="iris", + case_id=case_id, + incident_report=incident_report, + ) + + +@router.post("/ingest/confluence", response_model=IngestConfluenceResponse) +def ingest_confluence(payload: IngestConfluenceRequest) -> IngestConfluenceResponse: + try: + client = ConfluenceClient.from_env() + except ConfluenceClientError as exc: + detail = _build_ingestion_error( + source="confluence", + stage="init", + message=str(exc), + code="ingestion_adapter_unavailable", + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + results: list[IngestConfluenceResult] = [] + for page_id in payload.page_ids: + try: + page_payload = client.fetch_page(page_id=page_id) + content = f"# {page_payload['title']}\n\n{page_payload['body']}\n\nSource: {page_payload['source_url']}\n" + doc = MemoryDocument( + title=page_payload["title"], + path=f"runtime/confluence/{page_id}.md", + source_type="confluence", + content=content, + ) + kernel.memory.ingest_runtime_document(doc) + results.append( + IngestConfluenceResult( + page_id=page_id, + status="ingested", + title=page_payload["title"], + ) + ) + except Exception as exc: # Keep batch ingestion resilient to per-page failures. 
+ error_detail = _build_ingestion_error( + source="confluence", + stage="fetch", + message=str(exc), + target=page_id, + ) + results.append( + IngestConfluenceResult( + page_id=page_id, + status="failed", + error=str(exc), + error_detail=error_detail, + ) + ) + + ingested_count = len([item for item in results if item.status == "ingested"]) + failed_count = len(results) - ingested_count + if ingested_count > 0: + _sync_vector_index() + + return IngestConfluenceResponse( + ingested_count=ingested_count, + failed_count=failed_count, + source="confluence", + results=results, + ) + + +@router.post("/ingest/github", response_model=IngestGitHubResponse) +def ingest_github(payload: IngestGitHubRequest) -> IngestGitHubResponse: + try: + client = GitHubClient.from_env() + except GitHubClientError as exc: + detail = _build_ingestion_error( + source="github", + stage="init", + message=str(exc), + code="ingestion_adapter_unavailable", + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + unique_refs: list[GitHubIssueRef] = [] + seen: set[tuple[str, int]] = set() + for ref in payload.issue_refs: + key = (ref.repository.strip(), ref.issue_number) + if key in seen: + continue + seen.add(key) + unique_refs.append(ref) + + results: list[IngestGitHubResult] = [] + for ref in unique_refs: + try: + issue_payload = client.fetch_issue(repository=ref.repository, issue_number=ref.issue_number) + title = str(issue_payload.get("title", "")) + issue_url = str(issue_payload.get("url", "")) + state = str(issue_payload.get("state", "unknown")) + body = str(issue_payload.get("body", "")).strip() + + content = ( + f"# GitHub Issue {ref.repository}#{ref.issue_number}\n\n" + f"Title: {title}\n" + f"State: {state}\n" + f"URL: {issue_url}\n\n" + f"{body}\n" + ) + safe_repo = ref.repository.replace("/", "__") + doc = MemoryDocument( + title=f"{ref.repository}#{ref.issue_number}", + path=f"runtime/github/{safe_repo}-{ref.issue_number}.md", + source_type="github", + content=content, + ) + kernel.memory.ingest_runtime_document(doc) + results.append( + IngestGitHubResult( + repository=ref.repository, + issue_number=ref.issue_number, + status="ingested", + title=title, + url=issue_url, + ) + ) + except Exception as exc: # Keep batch ingestion resilient to per-issue failures. 
+ target = f"{ref.repository}#{ref.issue_number}" + error_detail = _build_ingestion_error( + source="github", + stage="fetch", + message=str(exc), + target=target, + ) + results.append( + IngestGitHubResult( + repository=ref.repository, + issue_number=ref.issue_number, + status="failed", + error=str(exc), + error_detail=error_detail, + ) + ) + + ingested_count = len([item for item in results if item.status == "ingested"]) + failed_count = len(results) - ingested_count + if ingested_count > 0: + _sync_vector_index() + + return IngestGitHubResponse( + ingested_count=ingested_count, + failed_count=failed_count, + source="github", + results=results, + ) + + +@router.post("/ingest/grafana", response_model=IngestGrafanaResponse) +def ingest_grafana(payload: IngestGrafanaRequest) -> IngestGrafanaResponse: + try: + client = GrafanaClient.from_env() + except GrafanaClientError as exc: + detail = _build_ingestion_error( + source="grafana", + stage="init", + message=str(exc), + code="ingestion_adapter_unavailable", + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + results: list[IngestGrafanaResult] = [] + for ref in payload.dashboards: + dashboard_url = ref.public_dashboard_url + try: + dashboard_payload = client.fetch_public_dashboard(public_dashboard_url=dashboard_url) + title = str(dashboard_payload.get("title", "Grafana Dashboard")) + panel_count = int(dashboard_payload.get("panel_count", 0) or 0) + token = str(dashboard_payload.get("public_dashboard_token", "dashboard")).replace("/", "_") + + content = json.dumps(dashboard_payload, ensure_ascii=False, indent=2) + doc = MemoryDocument( + title=f"Grafana {title}", + path=f"runtime/grafana/{token}.json", + source_type="grafana", + content=content, + ) + kernel.memory.ingest_runtime_document(doc) + results.append( + IngestGrafanaResult( + public_dashboard_url=dashboard_url, + status="ingested", + title=title, + panel_count=panel_count, + ) + ) + except Exception as exc: # Keep batch ingestion resilient to per-dashboard failures. 
+ error_detail = _build_ingestion_error( + source="grafana", + stage="fetch", + message=str(exc), + target=dashboard_url, + ) + results.append( + IngestGrafanaResult( + public_dashboard_url=dashboard_url, + status="failed", + error=str(exc), + error_detail=error_detail, + ) + ) + + ingested_count = len([item for item in results if item.status == "ingested"]) + failed_count = len(results) - ingested_count + if ingested_count > 0: + _sync_vector_index() + + return IngestGrafanaResponse( + ingested_count=ingested_count, + failed_count=failed_count, + source="grafana", + results=results, + ) + + +@router.post("/ingest/jira", response_model=IngestJiraResponse) +def ingest_jira(payload: IngestJiraRequest) -> IngestJiraResponse: + try: + client = JiraClient.from_env() + except JiraClientError as exc: + detail = _build_ingestion_error( + source="jira", + stage="init", + message=str(exc), + code="ingestion_adapter_unavailable", + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + results: list[IngestJiraResult] = [] + for issue_key in payload.issue_keys: + try: + issue_payload = client.fetch_issue(issue_key=issue_key) + summary = str(issue_payload.get("summary", "")) + issue_url = str(issue_payload.get("url", "")) + status = str(issue_payload.get("status", "unknown")) + priority = str(issue_payload.get("priority", "")) + assignee = str(issue_payload.get("assignee", "")) + description = issue_payload.get("description") + + if isinstance(description, str): + description_text = description + else: + description_text = json.dumps(description, ensure_ascii=False, indent=2) if description else "" + + content = ( + f"# Jira Issue {issue_key}\n\n" + f"Summary: {summary}\n" + f"Status: {status}\n" + f"Priority: {priority}\n" + f"Assignee: {assignee}\n" + f"URL: {issue_url}\n\n" + f"{description_text}\n" + ) + doc = MemoryDocument( + title=f"Jira {issue_key}", + path=f"runtime/jira/{issue_key}.md", + source_type="jira", + content=content, + ) + kernel.memory.ingest_runtime_document(doc) + results.append( + IngestJiraResult( + issue_key=issue_key, + status="ingested", + summary=summary, + url=issue_url, + ) + ) + except Exception as exc: # Keep batch ingestion resilient to per-issue failures. 
+ error_detail = _build_ingestion_error( + source="jira", + stage="fetch", + message=str(exc), + target=issue_key, + ) + results.append( + IngestJiraResult( + issue_key=issue_key, + status="failed", + error=str(exc), + error_detail=error_detail, + ) + ) + + ingested_count = len([item for item in results if item.status == "ingested"]) + failed_count = len(results) - ingested_count + if ingested_count > 0: + _sync_vector_index() + + return IngestJiraResponse( + ingested_count=ingested_count, + failed_count=failed_count, + source="jira", + results=results, + ) + + +@router.post("/ingest/slack/channels", response_model=IngestSlackChannelsResponse) +def ingest_slack_channels(payload: IngestSlackChannelsRequest) -> IngestSlackChannelsResponse: + try: + client = SlackClient.from_env() + except SlackClientError as exc: + detail = _build_ingestion_error( + source="slack", + stage="init", + message=str(exc), + code="ingestion_adapter_unavailable", + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + results: list[IngestSlackChannelResult] = [] + for ref in payload.channels: + try: + channel_payload = client.fetch_channel_messages(channel_id=ref.channel_id, limit=ref.limit) + messages = channel_payload.get("messages", []) + message_count = int(channel_payload.get("message_count", 0)) + + content = ( + f"# Slack Channel {ref.channel_id}\n\n" + f"Message Count: {message_count}\n" + f"Has More: {channel_payload.get('has_more', False)}\n\n" + f"{_render_slack_messages(messages if isinstance(messages, list) else [])}\n" + ) + + doc = MemoryDocument( + title=f"Slack Channel {ref.channel_id}", + path=f"runtime/slack/channel-{ref.channel_id}.md", + source_type="slack", + content=content, + ) + kernel.memory.ingest_runtime_document(doc) + + results.append( + IngestSlackChannelResult( + channel_id=ref.channel_id, + status="ingested", + message_count=message_count, + ) + ) + except Exception as exc: # Keep batch ingestion resilient to per-channel failures. 
+ error_detail = _build_ingestion_error( + source="slack", + stage="fetch", + message=str(exc), + target=ref.channel_id, + ) + results.append( + IngestSlackChannelResult( + channel_id=ref.channel_id, + status="failed", + error=str(exc), + error_detail=error_detail, + ) + ) + + ingested_count = len([item for item in results if item.status == "ingested"]) + failed_count = len(results) - ingested_count + if ingested_count > 0: + _sync_vector_index() + + return IngestSlackChannelsResponse( + ingested_count=ingested_count, + failed_count=failed_count, + source="slack", + results=results, + ) + + +@router.post("/ingest/slack/threads", response_model=IngestSlackThreadsResponse) +def ingest_slack_threads(payload: IngestSlackThreadsRequest) -> IngestSlackThreadsResponse: + try: + client = SlackClient.from_env() + except SlackClientError as exc: + detail = _build_ingestion_error( + source="slack", + stage="init", + message=str(exc), + code="ingestion_adapter_unavailable", + ) + raise HTTPException(status_code=502, detail=detail.model_dump()) from exc + + results: list[IngestSlackThreadResult] = [] + for ref in payload.threads: + try: + thread_payload = client.fetch_thread_messages( + channel_id=ref.channel_id, + thread_ts=ref.thread_ts, + limit=ref.limit, + ) + messages = thread_payload.get("messages", []) + message_count = int(thread_payload.get("message_count", 0)) + safe_thread_ts = ref.thread_ts.replace(".", "_") + + content = ( + f"# Slack Thread {ref.channel_id}:{ref.thread_ts}\n\n" + f"Message Count: {message_count}\n" + f"Has More: {thread_payload.get('has_more', False)}\n\n" + f"{_render_slack_messages(messages if isinstance(messages, list) else [])}\n" + ) + + doc = MemoryDocument( + title=f"Slack Thread {ref.channel_id}:{ref.thread_ts}", + path=f"runtime/slack/thread-{ref.channel_id}-{safe_thread_ts}.md", + source_type="slack", + content=content, + ) + kernel.memory.ingest_runtime_document(doc) + + results.append( + IngestSlackThreadResult( + channel_id=ref.channel_id, + thread_ts=ref.thread_ts, + status="ingested", + message_count=message_count, + ) + ) + except Exception as exc: # Keep batch ingestion resilient to per-thread failures. 
+ target = f"{ref.channel_id}:{ref.thread_ts}" + error_detail = _build_ingestion_error( + source="slack", + stage="fetch", + message=str(exc), + target=target, + ) + results.append( + IngestSlackThreadResult( + channel_id=ref.channel_id, + thread_ts=ref.thread_ts, + status="failed", + error=str(exc), + error_detail=error_detail, + ) + ) + + ingested_count = len([item for item in results if item.status == "ingested"]) + failed_count = len(results) - ingested_count + if ingested_count > 0: + _sync_vector_index() + + return IngestSlackThreadsResponse( + ingested_count=ingested_count, + failed_count=failed_count, + source="slack", + results=results, + ) + + +@router.get("/vector/status", response_model=VectorIndexStatusResponse) +def vector_status() -> VectorIndexStatusResponse: + retrieval_swarm = getattr(kernel, "retrieval_swarm", None) + semantic_service = getattr(retrieval_swarm, "semantic_service", None) + if semantic_service is None or not hasattr(semantic_service, "health"): + return VectorIndexStatusResponse( + source="vector", + status={ + "mode": "keyword", + "indexed": False, + "reason": "semantic service unavailable", + }, + ) + + status = semantic_service.health() + status["document_count"] = len(kernel.memory.load_documents(force_reload=False)) + return VectorIndexStatusResponse(source="vector", status=status) + + +@router.post("/vector/rebuild", response_model=VectorIndexStatusResponse) +def vector_rebuild() -> VectorIndexStatusResponse: + status = _sync_vector_index() or {"indexed": False, "reason": "semantic service unavailable"} + return VectorIndexStatusResponse(source="vector", status=status) diff --git a/backend/app/main.py b/backend/app/main.py new file mode 100644 index 000000000..0934ef92f --- /dev/null +++ b/backend/app/main.py @@ -0,0 +1,37 @@ +from pathlib import Path + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +try: + from dotenv import load_dotenv +except ImportError: # pragma: no cover - optional fallback if dependency is unavailable + load_dotenv = None + +if load_dotenv is not None: + repo_root = Path(__file__).resolve().parents[2] + load_dotenv(repo_root / ".env") + +# Import routes only after environment variables are loaded. +# The chat router initializes a shared ControllerKernel at import time, +# so LLM_PROVIDER/GROQ_* must be available first. +from app.api.routes.approvals import router as approvals_router +from app.api.routes.chat import router as chat_router +from app.api.routes.ingestion import router as ingestion_router + +app = FastAPI(title="UniOps API", version="0.1.0") +app.add_middleware( + CORSMiddleware, + allow_origins=["http://localhost:3000", "http://127.0.0.1:3000"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +app.include_router(chat_router, prefix="/api") +app.include_router(ingestion_router, prefix="/api") +app.include_router(approvals_router, prefix="/api") + + +@app.get("/health") +def health() -> dict[str, str]: + return {"status": "ok"} diff --git a/backend/pytest.ini b/backend/pytest.ini new file mode 100644 index 000000000..c7b23ecb1 --- /dev/null +++ b/backend/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +pythonpath = . 
+testpaths = tests diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 000000000..5664a75e0 --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,13 @@ +fastapi==0.115.0 +uvicorn[standard]==0.31.0 +pydantic==2.11.7 +sse-starlette==2.1.3 +httpx==0.28.1 +pytest==9.0.3 +langchain==0.3.27 +llama-index==0.14.20 +llama-index-vector-stores-milvus==1.1.0 +pymilvus==2.6.12 +langchain-openai==0.3.35 +langchain-groq==0.3.8 +python-dotenv==1.1.1 diff --git a/backend/src/adapters/confluence_client.py b/backend/src/adapters/confluence_client.py new file mode 100644 index 000000000..403197c8a --- /dev/null +++ b/backend/src/adapters/confluence_client.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import os +from typing import Any + +import httpx + + +class ConfluenceClientError(RuntimeError): + pass + + +class ConfluenceClient: + def __init__( + self, + *, + base_url: str, + api_token: str, + email: str, + timeout_seconds: float = 15.0, + ) -> None: + self.base_url = base_url.rstrip("/") + self.api_token = api_token + self.email = email + self.timeout_seconds = timeout_seconds + + @classmethod + def from_env(cls) -> "ConfluenceClient": + base_url = os.getenv("CONFLUENCE_BASE_URL", "").strip() + api_token = os.getenv("CONFLUENCE_API_TOKEN", "").strip() + email = os.getenv("CONFLUENCE_EMAIL", "").strip() + + if not base_url: + raise ConfluenceClientError("CONFLUENCE_BASE_URL is not configured") + if not api_token: + raise ConfluenceClientError("CONFLUENCE_API_TOKEN is not configured") + if not email: + raise ConfluenceClientError("CONFLUENCE_EMAIL is not configured") + + return cls(base_url=base_url, api_token=api_token, email=email) + + def fetch_page(self, page_id: str) -> dict[str, str]: + auth = httpx.BasicAuth(username=self.email, password=self.api_token) + candidates = [ + f"/wiki/api/v2/pages/{page_id}?body-format=storage", + f"/rest/api/content/{page_id}?expand=body.storage", + ] + + last_error: str | None = None + with httpx.Client(timeout=self.timeout_seconds) as client: + for path in candidates: + url = f"{self.base_url}{path}" + try: + response = client.get(url, auth=auth) + if response.status_code >= 400: + last_error = f"GET {path} returned {response.status_code}" + continue + + payload = response.json() + title = payload.get("title", f"Confluence Page {page_id}") + + body = "" + body_obj: Any = payload.get("body", {}) + if isinstance(body_obj, dict): + if "storage" in body_obj and isinstance(body_obj["storage"], dict): + body = str(body_obj["storage"].get("value", "")) + elif "value" in body_obj: + body = str(body_obj.get("value", "")) + + webui = payload.get("_links", {}).get("webui") if isinstance(payload.get("_links"), dict) else None + source_url = f"{self.base_url}{webui}" if isinstance(webui, str) else f"{self.base_url}/wiki/spaces/{page_id}" + + return { + "page_id": page_id, + "title": title, + "body": body, + "source_url": source_url, + } + except (httpx.HTTPError, ValueError) as exc: + last_error = str(exc) + continue + + raise ConfluenceClientError( + f"Failed to fetch Confluence page {page_id}: {last_error or 'unknown error'}" + ) diff --git a/backend/src/adapters/github_client.py b/backend/src/adapters/github_client.py new file mode 100644 index 000000000..44a4d4c33 --- /dev/null +++ b/backend/src/adapters/github_client.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import os +from typing import Any + +import httpx + + +class GitHubClientError(RuntimeError): + pass + + +class GitHubClient: + def __init__( + self, 
+ *, + token: str, + api_base_url: str = "https://api.github.com", + timeout_seconds: float = 15.0, + ) -> None: + self.token = token + self.api_base_url = api_base_url.rstrip("/") + self.timeout_seconds = timeout_seconds + + @classmethod + def from_env(cls) -> "GitHubClient": + token = os.getenv("GITHUB_TOKEN", "").strip() + api_base_url = os.getenv("GITHUB_API_BASE_URL", "https://api.github.com").strip() + + if not token: + raise GitHubClientError("GITHUB_TOKEN is not configured") + + return cls(token=token, api_base_url=api_base_url or "https://api.github.com") + + def fetch_issue(self, *, repository: str, issue_number: int) -> dict[str, Any]: + normalized_repo = repository.strip() + if not normalized_repo: + raise GitHubClientError("repository is required") + if issue_number <= 0: + raise GitHubClientError("issue_number must be a positive integer") + + url = f"{self.api_base_url}/repos/{normalized_repo}/issues/{issue_number}" + headers = { + "Authorization": f"Bearer {self.token}", + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + try: + with httpx.Client(timeout=self.timeout_seconds) as client: + response = client.get(url, headers=headers) + except httpx.HTTPError as exc: + raise GitHubClientError(f"GitHub request failed: {exc}") from exc + + if response.status_code in {401, 403}: + raise GitHubClientError("GitHub issue fetch failed: authentication or permission error") + if response.status_code == 404: + raise GitHubClientError(f"GitHub issue {normalized_repo}#{issue_number} was not found") + if response.status_code >= 500: + raise GitHubClientError(f"GitHub service error while fetching {normalized_repo}#{issue_number}") + if response.status_code >= 400: + raise GitHubClientError(f"GitHub issue fetch failed with status {response.status_code}") + + body = response.json() + issue_url = str(body.get("html_url", "")).strip() or f"https://github.com/{normalized_repo}/issues/{issue_number}" + title = str(body.get("title", "")).strip() + state = str(body.get("state", "unknown")).strip() or "unknown" + issue_body = str(body.get("body", "")).strip() + + return { + "repository": normalized_repo, + "number": issue_number, + "title": title, + "state": state, + "url": issue_url, + "body": issue_body, + } diff --git a/backend/src/adapters/grafana_client.py b/backend/src/adapters/grafana_client.py new file mode 100644 index 000000000..a67b6d766 --- /dev/null +++ b/backend/src/adapters/grafana_client.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +import os +from typing import Any +from urllib.parse import ParseResult, urlparse + +import httpx + + +class GrafanaClientError(RuntimeError): + pass + + +class GrafanaClient: + def __init__( + self, + *, + timeout_seconds: float = 15.0, + ) -> None: + self.timeout_seconds = timeout_seconds + + @classmethod + def from_env(cls) -> "GrafanaClient": + raw_timeout = os.getenv("GRAFANA_TIMEOUT_SECONDS", "15").strip() + try: + timeout_seconds = float(raw_timeout) + except ValueError as exc: + raise GrafanaClientError("GRAFANA_TIMEOUT_SECONDS must be a number") from exc + + if timeout_seconds <= 0: + raise GrafanaClientError("GRAFANA_TIMEOUT_SECONDS must be greater than 0") + + return cls(timeout_seconds=timeout_seconds) + + def fetch_public_dashboard(self, *, public_dashboard_url: str) -> dict[str, Any]: + parsed_url = self._parse_public_dashboard_url(public_dashboard_url) + token = self._extract_dashboard_token(parsed_url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + + api_url = 
f"{base_url}/api/public/dashboards/{token}" + if parsed_url.query: + api_url = f"{api_url}?{parsed_url.query}" + + try: + with httpx.Client(timeout=self.timeout_seconds) as client: + response = client.get(api_url) + except httpx.HTTPError as exc: + raise GrafanaClientError(f"Grafana request failed: {exc}") from exc + + if response.status_code in {401, 403}: + raise GrafanaClientError( + "Grafana public dashboard fetch failed: authentication or permission error" + ) + if response.status_code == 404: + raise GrafanaClientError(f"Grafana public dashboard token '{token}' was not found") + if response.status_code >= 500: + raise GrafanaClientError("Grafana service error while fetching public dashboard") + if response.status_code >= 400: + raise GrafanaClientError(f"Grafana public dashboard fetch failed with status {response.status_code}") + + try: + body = response.json() + except ValueError as exc: + raise GrafanaClientError("Grafana returned a non-JSON public dashboard payload") from exc + + if not isinstance(body, dict): + raise GrafanaClientError("Grafana public dashboard payload is invalid") + + dashboard = body.get("dashboard") + if not isinstance(dashboard, dict): + raise GrafanaClientError("Grafana public dashboard response is missing dashboard data") + + meta = body.get("meta") if isinstance(body.get("meta"), dict) else {} + panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else [] + + return { + "public_dashboard_token": token, + "source_url": public_dashboard_url.strip(), + "grafana_base_url": base_url, + "title": str(dashboard.get("title", "Grafana Dashboard")), + "uid": str(dashboard.get("uid", "")), + "version": int(dashboard.get("version", 0) or 0), + "timezone": str(dashboard.get("timezone", "")), + "refresh": str(dashboard.get("refresh", "")), + "time_range": { + "from": str((dashboard.get("time") or {}).get("from", "")), + "to": str((dashboard.get("time") or {}).get("to", "")), + }, + "meta": { + "slug": str(meta.get("slug", "")), + "created": str(meta.get("created", "")), + "updated": str(meta.get("updated", "")), + "public_dashboard_enabled": bool(meta.get("publicDashboardEnabled", False)), + }, + "panel_count": len(panels), + "panels": [self._normalize_panel(panel) for panel in panels if isinstance(panel, dict)], + } + + def _parse_public_dashboard_url(self, public_dashboard_url: str) -> ParseResult: + normalized = public_dashboard_url.strip() + if not normalized: + raise GrafanaClientError("public_dashboard_url is required") + + parsed = urlparse(normalized) + if parsed.scheme not in {"http", "https"} or not parsed.netloc: + raise GrafanaClientError( + "public_dashboard_url must be an absolute URL like https://example.grafana.net/public-dashboards/" + ) + + return parsed + + def _extract_dashboard_token(self, parsed_url: ParseResult) -> str: + path_parts = [part for part in parsed_url.path.split("/") if part] + if "public-dashboards" not in path_parts: + raise GrafanaClientError( + "public_dashboard_url must contain '/public-dashboards/'" + ) + + token_index = path_parts.index("public-dashboards") + 1 + if token_index >= len(path_parts): + raise GrafanaClientError("public_dashboard_url is missing dashboard token") + + token = path_parts[token_index].strip() + if not token: + raise GrafanaClientError("public_dashboard_url is missing dashboard token") + + return token + + def _normalize_panel(self, panel: dict[str, Any]) -> dict[str, Any]: + datasource = panel.get("datasource") + datasource_type = "" + datasource_uid = "" + + if 
isinstance(datasource, dict): + datasource_type = str(datasource.get("type", "")) + datasource_uid = str(datasource.get("uid", "")) + elif isinstance(datasource, str): + datasource_type = datasource + + raw_targets = panel.get("targets") if isinstance(panel.get("targets"), list) else [] + normalized_targets: list[dict[str, Any]] = [] + for raw_target in raw_targets: + if not isinstance(raw_target, dict): + continue + + target_datasource = raw_target.get("datasource") + target_datasource_type = "" + target_datasource_uid = "" + if isinstance(target_datasource, dict): + target_datasource_type = str(target_datasource.get("type", "")) + target_datasource_uid = str(target_datasource.get("uid", "")) + + query = ( + raw_target.get("expr") + or raw_target.get("query") + or raw_target.get("rawSql") + or raw_target.get("statement") + or "" + ) + + normalized_targets.append( + { + "ref_id": str(raw_target.get("refId", "")), + "query": str(query), + "editor_mode": str(raw_target.get("editorMode", "")), + "datasource_type": target_datasource_type, + "datasource_uid": target_datasource_uid, + "raw": raw_target, + } + ) + + return { + "id": panel.get("id"), + "title": str(panel.get("title", "")), + "type": str(panel.get("type", "")), + "datasource_type": datasource_type, + "datasource_uid": datasource_uid, + "grid_pos": panel.get("gridPos", {}), + "transparent": bool(panel.get("transparent", False)), + "plugin_version": str(panel.get("pluginVersion", "")), + "targets": normalized_targets, + "options": panel.get("options", {}), + "field_config": panel.get("fieldConfig", {}), + } diff --git a/backend/src/adapters/iris_client.py b/backend/src/adapters/iris_client.py new file mode 100644 index 000000000..cf3fddc4a --- /dev/null +++ b/backend/src/adapters/iris_client.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +import os +from typing import Any + +import httpx + + +class IrisClientError(RuntimeError): + pass + + +class IrisClient: + def __init__( + self, + *, + base_url: str, + api_key: str, + verify_ssl: bool = True, + timeout_seconds: float = 15.0, + ) -> None: + self.base_url = base_url.rstrip("/") + self.api_key = api_key + self.verify_ssl = verify_ssl + self.timeout_seconds = timeout_seconds + + _SEVERITY_TO_ID = { + "critical": 1, + "high": 2, + "medium": 3, + "low": 4, + } + + @classmethod + def from_env(cls) -> "IrisClient": + base_url = os.getenv("IRIS_BASE_URL", "").strip() + api_key = os.getenv("IRIS_API_KEY", "").strip() + verify_ssl_env = os.getenv("IRIS_VERIFY_SSL", "true").strip().lower() + verify_ssl = verify_ssl_env not in {"0", "false", "no"} + + if not base_url: + raise IrisClientError("IRIS_BASE_URL is not configured") + if not api_key: + raise IrisClientError("IRIS_API_KEY is not configured") + + return cls(base_url=base_url, api_key=api_key, verify_ssl=verify_ssl) + + def _headers(self) -> dict[str, str]: + return { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + def _normalize_case_payload( + self, + *, + case_payload: dict[str, Any], + fallback_case_id: str, + fallback_case_name: str, + fallback_description: str, + fallback_severity: str, + fallback_tags: list[str] | None = None, + ) -> dict[str, Any]: + case_id = str(case_payload.get("case_id", case_payload.get("id", fallback_case_id))) + return { + "source_system": "iris", + "case_id": case_id, + "report_id": str(case_payload.get("report_id", case_payload.get("id", case_id))), + "report_url": case_payload.get("report_url") or f"{self.base_url}/case/{case_id}", + "ingested_at": 
case_payload.get("modification_date") or case_payload.get("created_at"), + "case_name": case_payload.get("case_name") or case_payload.get("name") or fallback_case_name, + "short_description": case_payload.get("case_description") + or case_payload.get("description") + or fallback_description, + "severity": str(case_payload.get("severity", fallback_severity)), + "tags": case_payload.get("tags") or fallback_tags or [], + "iocs": case_payload.get("iocs", []), + "timeline": case_payload.get("timeline", []), + } + + def _severity_id_from_label(self, severity: str) -> int: + normalized = severity.strip().lower() + if normalized in self._SEVERITY_TO_ID: + return self._SEVERITY_TO_ID[normalized] + + if normalized.isdigit() and int(normalized) > 0: + return int(normalized) + + return self._SEVERITY_TO_ID["medium"] + + def _extract_case_payload(self, payload: Any, case_id: str) -> dict[str, Any]: + if isinstance(payload, dict): + data = payload.get("data", payload) + if isinstance(data, dict): + return data + if isinstance(data, list): + for item in data: + item_case_id = str(item.get("case_id", item.get("id", ""))) + if item_case_id == case_id: + return item + if data: + return data[0] + + raise IrisClientError("Unable to parse case payload from IRIS response") + + def fetch_case(self, case_id: str) -> dict[str, Any]: + endpoints: list[tuple[str, str, dict[str, Any] | None]] = [ + ("POST", "/manage/cases/list?cid=1", {"case_id": case_id}), + ("GET", f"/manage/cases/{case_id}?cid=1", None), + ] + + last_error: str | None = None + with httpx.Client(timeout=self.timeout_seconds, verify=self.verify_ssl) as client: + for method, path, body in endpoints: + url = f"{self.base_url}{path}" + try: + response = client.request(method=method, url=url, json=body, headers=self._headers()) + if response.status_code >= 400: + last_error = f"{method} {path} returned {response.status_code}" + continue + + payload = response.json() + case_payload = self._extract_case_payload(payload, case_id) + return self._normalize_case_payload( + case_payload=case_payload, + fallback_case_id=case_id, + fallback_case_name=f"IRIS Case {case_id}", + fallback_description="No case description provided.", + fallback_severity="unknown", + ) + except (httpx.HTTPError, ValueError, IrisClientError) as exc: + last_error = str(exc) + continue + + raise IrisClientError(f"Failed to fetch case {case_id} from IRIS: {last_error or 'unknown error'}") + + def create_incident( + self, + *, + case_name: str, + case_description: str, + severity: str = "medium", + tags: list[str] | None = None, + case_customer: int = 1, + case_soc_id: str = "", + classification_id: int | None = None, + case_template_id: str | None = None, + custom_attributes: dict[str, Any] | None = None, + ) -> dict[str, Any]: + normalized_name = case_name.strip() + normalized_description = case_description.strip() + if not normalized_name: + raise IrisClientError("case_name must be provided") + if not normalized_description: + raise IrisClientError("case_description must be provided") + + payload: dict[str, Any] = { + "case_name": normalized_name, + "case_description": normalized_description, + "case_customer": case_customer, + "case_soc_id": case_soc_id, + "severity_id": self._severity_id_from_label(severity), + } + + if tags: + payload["case_tags"] = ",".join(item.strip() for item in tags if item.strip()) + if classification_id is not None: + payload["classification_id"] = classification_id + if case_template_id: + payload["case_template_id"] = str(case_template_id) + if 
custom_attributes is not None: + payload["custom_attributes"] = custom_attributes + + endpoints: list[tuple[str, str]] = [ + ("POST", "/manage/cases/add"), + ] + + last_error: str | None = None + with httpx.Client(timeout=self.timeout_seconds, verify=self.verify_ssl) as client: + for method, path in endpoints: + url = f"{self.base_url}{path}" + try: + response = client.request(method=method, url=url, json=payload, headers=self._headers()) + if response.status_code >= 400: + last_error = f"{method} {path} returned {response.status_code}" + continue + + body = response.json() + case_payload = self._extract_case_payload(body, case_id="new") + created_case_id = str(case_payload.get("case_id", case_payload.get("id", "new"))) + return self._normalize_case_payload( + case_payload=case_payload, + fallback_case_id=created_case_id, + fallback_case_name=normalized_name, + fallback_description=normalized_description, + fallback_severity=severity, + fallback_tags=tags, + ) + except (httpx.HTTPError, ValueError, IrisClientError) as exc: + last_error = str(exc) + continue + + raise IrisClientError(f"Failed to create IRIS incident: {last_error or 'unknown error'}") diff --git a/backend/src/adapters/jira_client.py b/backend/src/adapters/jira_client.py new file mode 100644 index 000000000..85af2fb92 --- /dev/null +++ b/backend/src/adapters/jira_client.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import os +import re +from typing import Any + +import httpx + + +class JiraClientError(RuntimeError): + pass + + +JIRA_ISSUE_KEY_PATTERN = re.compile(r"^[A-Z][A-Z0-9]*-\d+$") + + +class JiraClient: + def __init__( + self, + *, + base_url: str, + email: str, + api_token: str, + timeout_seconds: float = 15.0, + ) -> None: + self.base_url = base_url.rstrip("/") + self.email = email + self.api_token = api_token + self.timeout_seconds = timeout_seconds + + @classmethod + def from_env(cls) -> "JiraClient": + base_url = os.getenv("JIRA_BASE_URL", "").strip() + email = os.getenv("JIRA_EMAIL", "").strip() + api_token = os.getenv("JIRA_API_TOKEN", "").strip() + + if not base_url: + raise JiraClientError("JIRA_BASE_URL is not configured") + if not email: + raise JiraClientError("JIRA_EMAIL is not configured") + if not api_token: + raise JiraClientError("JIRA_API_TOKEN is not configured") + + return cls(base_url=base_url, email=email, api_token=api_token) + + def fetch_issue(self, *, issue_key: str) -> dict[str, Any]: + normalized_issue_key = issue_key.strip().upper() + if not normalized_issue_key: + raise JiraClientError("issue_key is required") + if not JIRA_ISSUE_KEY_PATTERN.match(normalized_issue_key): + raise JiraClientError( + f"issue_key '{issue_key}' does not match expected format PROJECT-123" + ) + + url = f"{self.base_url}/rest/api/3/issue/{normalized_issue_key}" + auth = httpx.BasicAuth(username=self.email, password=self.api_token) + params = { + "fields": "summary,status,priority,assignee,description", + } + + try: + with httpx.Client(timeout=self.timeout_seconds) as client: + response = client.get(url, params=params, auth=auth) + except httpx.HTTPError as exc: + raise JiraClientError(f"Jira request failed: {exc}") from exc + + if response.status_code in {401, 403}: + raise JiraClientError("Jira issue fetch failed: authentication or permission error") + if response.status_code == 404: + raise JiraClientError(f"Jira issue {normalized_issue_key} was not found") + if response.status_code >= 500: + raise JiraClientError(f"Jira service error while fetching {normalized_issue_key}") + if response.status_code 
>= 400: + raise JiraClientError(f"Jira issue fetch failed with status {response.status_code}") + + body = response.json() + fields = body.get("fields", {}) + if not isinstance(fields, dict): + fields = {} + + status_block = fields.get("status", {}) + priority_block = fields.get("priority", {}) + assignee_block = fields.get("assignee", {}) + + status_name = str(status_block.get("name", "")) if isinstance(status_block, dict) else "" + priority_name = str(priority_block.get("name", "")) if isinstance(priority_block, dict) else "" + assignee_name = str(assignee_block.get("displayName", "")) if isinstance(assignee_block, dict) else "" + description = fields.get("description") + + return { + "key": normalized_issue_key, + "summary": str(fields.get("summary", "")), + "status": status_name, + "priority": priority_name, + "assignee": assignee_name, + "description": description, + "url": f"{self.base_url}/browse/{normalized_issue_key}", + } diff --git a/backend/src/adapters/llm_client.py b/backend/src/adapters/llm_client.py new file mode 100644 index 000000000..78d324b62 --- /dev/null +++ b/backend/src/adapters/llm_client.py @@ -0,0 +1,279 @@ +from __future__ import annotations + +import json +import os +from dataclasses import dataclass +from typing import Any, Protocol + +from pydantic import BaseModel, Field, ValidationError + + +class LLMProviderError(RuntimeError): + """Base exception for provider selection and runtime failures.""" + + +class LLMProviderConfigurationError(LLMProviderError): + """Raised when provider configuration is invalid or incomplete.""" + + +class LLMProviderRuntimeError(LLMProviderError): + """Raised when a selected provider fails while serving a request.""" + + +class _ReasoningResponse(BaseModel): + reasoning: str + answer: str + suggested_action: str + action_details: dict[str, Any] | None = None + reasoning_steps: list[str] | None = None + confidence_breakdown: dict[str, Any] | None = None + evidence_scores: list[dict[str, Any]] | None = None + + +class _QueryExpansionResponse(BaseModel): + expanded_terms: list[str] = Field(default_factory=list) + + +class _ExecutionAssessmentResponse(BaseModel): + normalized_action: str + reasoning: str + risk_hint: str | None = None + + +class ReasoningLLMClient(Protocol): + provider_name: str + model_name: str + + def reason( + self, + query: str, + confidence: float, + top_sources: list[dict[str, Any]], + dedup_summary: dict[str, Any] | None, + ) -> dict[str, Any]: + ... + + def expand_query_terms(self, query: str, query_tokens: list[str]) -> list[str]: + ... + + def assess_execution_action(self, action: str, action_details: dict[str, Any] | None) -> dict[str, Any]: + ... 
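+
+# A minimal usage sketch (illustrative only, not part of the module's API):
+# any object satisfying the ReasoningLLMClient protocol can be driven the same
+# way, assuming a provider is configured via LLM_PROVIDER and its credentials
+# (see create_shared_reasoning_llm_client below).
+#
+#     client = create_shared_reasoning_llm_client(os.getenv("LLM_PROVIDER"))
+#     if client is not None:
+#         terms = client.expand_query_terms(
+#             query="redis latency spike",
+#             query_tokens=["redis", "latency", "spike"],
+#         )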
+ + +@dataclass +class LangChainReasoningLLMClient: + provider_name: str + model_name: str + chat_model: Any + + def _extract_json_payload(self, content: str) -> dict[str, Any]: + normalized = content.strip() + + if normalized.startswith("```"): + lines = normalized.splitlines() + if len(lines) >= 3: + normalized = "\n".join(lines[1:-1]).strip() + + start = normalized.find("{") + end = normalized.rfind("}") + if start == -1 or end == -1 or start >= end: + raise LLMProviderRuntimeError("Provider did not return valid JSON output.") + + try: + return json.loads(normalized[start : end + 1]) + except json.JSONDecodeError as exc: + raise LLMProviderRuntimeError("Provider returned malformed JSON output.") from exc + + def reason( + self, + query: str, + confidence: float, + top_sources: list[dict[str, Any]], + dedup_summary: dict[str, Any] | None, + ) -> dict[str, Any]: + prompt = ( + "You are a reliability copilot. Return ONLY JSON with this schema: " + '{"reasoning": string, "answer": string, "suggested_action": string, ' + '"action_details": {"intent": string, "tool": string|null, "parameters": object, ' + '"approval_required": boolean, "risk_hint": string|null}, ' + '"reasoning_steps": [string, ...], ' + '"confidence_breakdown": object, ' + '"evidence_scores": [{"title": string, "path": string, "source_type": string, ' + '"raw_score": number, "priority_score": number}, ...]}. ' + "Do not include markdown fences or extra keys. " + f"Query: {query}\n" + f"Confidence (0-1): {confidence}\n" + f"Top sources: {json.dumps(top_sources, ensure_ascii=True)}\n" + f"Dedup summary: {json.dumps(dedup_summary or {}, ensure_ascii=True)}" + ) + + try: + response = self.chat_model.invoke(prompt) + content = str(getattr(response, "content", "") or "") + parsed = self._extract_json_payload(content) + validated = _ReasoningResponse.model_validate(parsed) + except LLMProviderRuntimeError: + raise + except ValidationError as exc: + raise LLMProviderRuntimeError("Provider response schema validation failed.") from exc + except Exception as exc: # pragma: no cover - network/provider dependent + raise LLMProviderRuntimeError(f"{self.provider_name} provider request failed: {exc}") from exc + + return { + "reasoning": validated.reasoning, + "answer": validated.answer, + "suggested_action": validated.suggested_action, + "action_details": validated.action_details, + "reasoning_steps": validated.reasoning_steps, + "confidence_breakdown": validated.confidence_breakdown, + "evidence_scores": validated.evidence_scores, + } + + def expand_query_terms(self, query: str, query_tokens: list[str]) -> list[str]: + prompt = ( + "You are helping retrieval quality for reliability incidents. Return ONLY JSON with this schema: " + '{"expanded_terms": [string, ...]}. ' + "Expand query tokens with incident/systems synonyms. Keep it concise (max 8 terms). " + "Do not include markdown fences or extra keys. 
" + f"Query: {query}\n" + f"Existing query tokens: {json.dumps(query_tokens, ensure_ascii=True)}" + ) + + try: + response = self.chat_model.invoke(prompt) + content = str(getattr(response, "content", "") or "") + parsed = self._extract_json_payload(content) + validated = _QueryExpansionResponse.model_validate(parsed) + except LLMProviderRuntimeError: + raise + except ValidationError as exc: + raise LLMProviderRuntimeError("Provider query-expansion schema validation failed.") from exc + except Exception as exc: # pragma: no cover - network/provider dependent + raise LLMProviderRuntimeError(f"{self.provider_name} provider request failed: {exc}") from exc + + deduped_terms: list[str] = [] + seen: set[str] = set() + for term in validated.expanded_terms: + normalized = str(term).strip().lower() + if not normalized: + continue + if normalized in seen: + continue + seen.add(normalized) + deduped_terms.append(normalized) + if len(deduped_terms) >= 8: + break + return deduped_terms + + def assess_execution_action(self, action: str, action_details: dict[str, Any] | None) -> dict[str, Any]: + prompt = ( + "You are evaluating a proposed reliability action before HITL approval. Return ONLY JSON with this schema: " + '{"normalized_action": string, "reasoning": string, "risk_hint": string|null}. ' + "Do not include markdown fences or extra keys. " + f"Suggested action: {action}\n" + f"Action details: {json.dumps(action_details or {}, ensure_ascii=True)}" + ) + + try: + response = self.chat_model.invoke(prompt) + content = str(getattr(response, "content", "") or "") + parsed = self._extract_json_payload(content) + validated = _ExecutionAssessmentResponse.model_validate(parsed) + except LLMProviderRuntimeError: + raise + except ValidationError as exc: + raise LLMProviderRuntimeError("Provider execution-assessment schema validation failed.") from exc + except Exception as exc: # pragma: no cover - network/provider dependent + raise LLMProviderRuntimeError(f"{self.provider_name} provider request failed: {exc}") from exc + + return { + "normalized_action": validated.normalized_action, + "reasoning": validated.reasoning, + "risk_hint": validated.risk_hint, + } + + +@dataclass +class LazyReasoningLLMClient: + provider_name: str + model_name: str = "pending" + _resolved_client: ReasoningLLMClient | None = None + + def _resolve_client(self) -> ReasoningLLMClient: + if self._resolved_client is None: + client = create_reasoning_llm_client(self.provider_name) + if client is None: + raise LLMProviderRuntimeError("No reasoning provider is configured for this request.") + self._resolved_client = client + self.model_name = getattr(client, "model_name", self.model_name) + return self._resolved_client + + def reason( + self, + query: str, + confidence: float, + top_sources: list[dict[str, Any]], + dedup_summary: dict[str, Any] | None, + ) -> dict[str, Any]: + return self._resolve_client().reason( + query=query, + confidence=confidence, + top_sources=top_sources, + dedup_summary=dedup_summary, + ) + + def expand_query_terms(self, query: str, query_tokens: list[str]) -> list[str]: + return self._resolve_client().expand_query_terms(query=query, query_tokens=query_tokens) + + def assess_execution_action(self, action: str, action_details: dict[str, Any] | None) -> dict[str, Any]: + return self._resolve_client().assess_execution_action(action=action, action_details=action_details) + + +def create_reasoning_llm_client(provider_name: str | None) -> ReasoningLLMClient | None: + normalized = (provider_name or "").strip().lower() + 
if not normalized: + return None + + if normalized not in {"groq", "apfel"}: + raise LLMProviderConfigurationError("LLM_PROVIDER must be either 'groq' or 'apfel'.") + + if normalized == "groq": + api_key = os.getenv("GROQ_API_KEY", "").strip() + model_name = os.getenv("GROQ_MODEL", "llama-3.1-8b-instant").strip() + if not api_key: + raise LLMProviderConfigurationError("GROQ_API_KEY is required when LLM_PROVIDER=groq.") + + try: + from langchain_groq import ChatGroq + except ImportError as exc: + raise LLMProviderConfigurationError( + "langchain-groq is required for LLM_PROVIDER=groq. Install backend requirements." + ) from exc + + chat_model = ChatGroq(api_key=api_key, model=model_name, temperature=0) + return LangChainReasoningLLMClient(provider_name="groq", model_name=model_name, chat_model=chat_model) + + base_url = os.getenv("APFEL_BASE_URL", "").strip() + api_key = os.getenv("APFEL_API_KEY", "").strip() + model_name = os.getenv("APFEL_MODEL", "apfel-chat").strip() + if not base_url: + raise LLMProviderConfigurationError("APFEL_BASE_URL is required when LLM_PROVIDER=apfel.") + if not api_key: + raise LLMProviderConfigurationError("APFEL_API_KEY is required when LLM_PROVIDER=apfel.") + + try: + from langchain_openai import ChatOpenAI + except ImportError as exc: + raise LLMProviderConfigurationError( + "langchain-openai is required for LLM_PROVIDER=apfel. Install backend requirements." + ) from exc + + chat_model = ChatOpenAI(api_key=api_key, model=model_name, base_url=base_url, temperature=0) + return LangChainReasoningLLMClient(provider_name="apfel", model_name=model_name, chat_model=chat_model) + + +def create_shared_reasoning_llm_client(provider_name: str | None) -> ReasoningLLMClient | None: + normalized = (provider_name or "").strip().lower() + if not normalized: + return None + return LazyReasoningLLMClient(provider_name=normalized) \ No newline at end of file diff --git a/backend/src/adapters/slack_client.py b/backend/src/adapters/slack_client.py new file mode 100644 index 000000000..f85a47401 --- /dev/null +++ b/backend/src/adapters/slack_client.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +import os +from typing import Any + +import httpx + + +class SlackClientError(RuntimeError): + pass + + +class SlackClient: + def __init__( + self, + *, + bot_token: str, + api_base_url: str = "https://slack.com/api", + timeout_seconds: float = 15.0, + ) -> None: + self.bot_token = bot_token + self.api_base_url = api_base_url.rstrip("/") + self.timeout_seconds = timeout_seconds + + @classmethod + def from_env(cls) -> "SlackClient": + bot_token = os.getenv("SLACK_BOT_TOKEN", "").strip() + api_base_url = os.getenv("SLACK_API_BASE_URL", "https://slack.com/api").strip() + + if not bot_token: + raise SlackClientError("SLACK_BOT_TOKEN is not configured") + + return cls(bot_token=bot_token, api_base_url=api_base_url or "https://slack.com/api") + + def fetch_channel_messages(self, *, channel_id: str, limit: int = 20) -> dict[str, Any]: + normalized_channel = channel_id.strip() + if not normalized_channel: + raise SlackClientError("channel_id is required") + if limit <= 0: + raise SlackClientError("limit must be a positive integer") + + body = self._request( + endpoint="/conversations.history", + params={"channel": normalized_channel, "limit": limit}, + ) + + messages = self._normalize_messages(body.get("messages", []), limit) + return { + "channel_id": normalized_channel, + "message_count": len(messages), + "has_more": bool(body.get("has_more", False)), + "messages": messages, + } + + 
def fetch_thread_messages(self, *, channel_id: str, thread_ts: str, limit: int = 20) -> dict[str, Any]: + normalized_channel = channel_id.strip() + normalized_thread_ts = thread_ts.strip() + if not normalized_channel: + raise SlackClientError("channel_id is required") + if not normalized_thread_ts: + raise SlackClientError("thread_ts is required") + if limit <= 0: + raise SlackClientError("limit must be a positive integer") + + body = self._request( + endpoint="/conversations.replies", + params={"channel": normalized_channel, "ts": normalized_thread_ts, "limit": limit}, + ) + + messages = self._normalize_messages(body.get("messages", []), limit) + return { + "channel_id": normalized_channel, + "thread_ts": normalized_thread_ts, + "message_count": len(messages), + "has_more": bool(body.get("has_more", False)), + "messages": messages, + } + + def _request(self, *, endpoint: str, params: dict[str, Any]) -> dict[str, Any]: + url = f"{self.api_base_url}{endpoint}" + headers = { + "Authorization": f"Bearer {self.bot_token}", + "Content-Type": "application/json", + } + + try: + with httpx.Client(timeout=self.timeout_seconds) as client: + response = client.get(url, params=params, headers=headers) + except httpx.HTTPError as exc: + raise SlackClientError(f"Slack request failed: {exc}") from exc + + if response.status_code >= 400: + raise SlackClientError(f"Slack API call failed with status {response.status_code}") + + body = response.json() + if not bool(body.get("ok", False)): + error_message = str(body.get("error", "unknown_error")) + raise SlackClientError(f"Slack API call failed: {error_message}") + + return body + + def _normalize_messages(self, raw_messages: object, limit: int) -> list[dict[str, str]]: + normalized_messages: list[dict[str, str]] = [] + if isinstance(raw_messages, list): + for entry in raw_messages[:limit]: + if not isinstance(entry, dict): + continue + normalized_messages.append( + { + "ts": str(entry.get("ts", "")), + "thread_ts": str(entry.get("thread_ts", entry.get("ts", ""))), + "user": str(entry.get("user", entry.get("username", ""))), + "text": str(entry.get("text", "")), + } + ) + + return normalized_messages \ No newline at end of file diff --git a/backend/src/agents/orchestrator.py b/backend/src/agents/orchestrator.py new file mode 100644 index 000000000..746f94eb0 --- /dev/null +++ b/backend/src/agents/orchestrator.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + + +@dataclass +class AgentOrchestrationResult: + retrieval: dict[str, Any] + reasoning: dict[str, Any] + execution: dict[str, Any] + mode: str + + +class LangChainOrchestrator: + """Runs retrieval, reasoning, and execution through a LangChain runnable pipeline. + + Falls back to sequential local orchestration when LangChain is not installed. 
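+
+    A minimal usage sketch (illustrative; the swarm instances are assumed to be
+    constructed elsewhere, as ControllerKernel does):
+
+        orchestrator = LangChainOrchestrator(
+            retrieval_swarm=retrieval_swarm,
+            reasoning_swarm=reasoning_swarm,
+            execution_swarm=execution_swarm,
+        )
+        result = orchestrator.run(
+            query="high CPU on checkout", trace_id="trace-1234", dedup_summary={}
+        )
+        result.mode  # "langchain" or "sequential_fallback"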
+ """ + + def __init__(self, retrieval_swarm: Any, reasoning_swarm: Any, execution_swarm: Any) -> None: + self.retrieval_swarm = retrieval_swarm + self.reasoning_swarm = reasoning_swarm + self.execution_swarm = execution_swarm + self._pipeline: Any = None + self._initialize_pipeline() + + def _initialize_pipeline(self) -> None: + try: + from langchain_core.runnables import RunnableLambda + except Exception: + self._pipeline = None + return + + self._pipeline = ( + RunnableLambda(self._run_retrieval_step) + | RunnableLambda(self._run_reasoning_step) + | RunnableLambda(self._run_execution_step) + ) + + def _run_retrieval_step(self, state: dict[str, Any]) -> dict[str, Any]: + retrieval = self.retrieval_swarm.run(query=state["query"]) + state["retrieval"] = retrieval + return state + + def _run_reasoning_step(self, state: dict[str, Any]) -> dict[str, Any]: + retrieval = state["retrieval"] + reasoning = self.reasoning_swarm.run( + { + "query": state["query"], + "sources": retrieval["sources"], + "dedup_summary": state["dedup_summary"], + } + ) + state["reasoning"] = reasoning + return state + + def _run_execution_step(self, state: dict[str, Any]) -> dict[str, Any]: + reasoning = state["reasoning"] + execution = self.execution_swarm.run( + trace_id=state["trace_id"], + action=reasoning["suggested_action"], + action_details=reasoning.get("action_details"), + ) + state["execution"] = execution + return state + + def run(self, query: str, trace_id: str, dedup_summary: dict[str, Any]) -> AgentOrchestrationResult: + initial_state = { + "query": query, + "trace_id": trace_id, + "dedup_summary": dedup_summary, + } + + if self._pipeline is None: + state = self._run_execution_step(self._run_reasoning_step(self._run_retrieval_step(initial_state))) + return AgentOrchestrationResult( + retrieval=state["retrieval"], + reasoning=state["reasoning"], + execution=state["execution"], + mode="sequential_fallback", + ) + + state = self._pipeline.invoke(initial_state) + return AgentOrchestrationResult( + retrieval=state["retrieval"], + reasoning=state["reasoning"], + execution=state["execution"], + mode="langchain", + ) diff --git a/backend/src/controller/controller.py b/backend/src/controller/controller.py new file mode 100644 index 000000000..543df92d6 --- /dev/null +++ b/backend/src/controller/controller.py @@ -0,0 +1,438 @@ +from __future__ import annotations + +from typing import Any, Generator +import os +from dataclasses import asdict, dataclass +from datetime import UTC, datetime +from uuid import uuid4 + +from src.agents.orchestrator import LangChainOrchestrator +from src.adapters.llm_client import LLMProviderError, ReasoningLLMClient, create_shared_reasoning_llm_client +from src.gates.permission_gate import PermissionGate +from src.memory.three_tier_memory import ThreeTierMemory +from src.swarms.execution_swarm import ExecutionSwarm +from src.swarms.reasoning_swarm import ReasoningSwarm +from src.swarms.retrieval_swarm import RetrievalSwarm + + +@dataclass +class TraceStep: + step: str + agent: str + observation: str + sources: list[dict] + timestamp: str + metadata: dict[str, Any] | None = None + + +@dataclass +class ControllerResult: + answer: str + trace_id: str + needs_approval: bool + suggested_action: str + trace: list[dict] + dedup_summary: dict[str, Any] + + +class ControllerKernel: + def __init__( + self, + provider_name: str | None = None, + reasoning_llm_client: ReasoningLLMClient | None = None, + ) -> None: + selected_provider = (provider_name if provider_name is not None else 
os.getenv("LLM_PROVIDER", "")).strip().lower() + shared_llm_client = reasoning_llm_client if reasoning_llm_client is not None else create_shared_reasoning_llm_client( + selected_provider + ) + + self.memory = ThreeTierMemory() + self.permission_gate = PermissionGate() + self.retrieval_swarm = RetrievalSwarm( + memory=self.memory, + provider_name=selected_provider, + llm_client=shared_llm_client, + ) + self.reasoning_swarm = ReasoningSwarm(provider_name=selected_provider, llm_client=shared_llm_client) + self.execution_swarm = ExecutionSwarm( + permission_gate=self.permission_gate, + provider_name=selected_provider, + llm_client=shared_llm_client, + ) + self.orchestrator = LangChainOrchestrator( + retrieval_swarm=self.retrieval_swarm, + reasoning_swarm=self.reasoning_swarm, + execution_swarm=self.execution_swarm, + ) + + @staticmethod + def _utc_now() -> datetime: + return datetime.now(UTC) + + @staticmethod + def _iso(ts: datetime) -> str: + return ts.isoformat() + + @staticmethod + def _duration_ms(started_at: datetime, finished_at: datetime) -> float: + return round((finished_at - started_at).total_seconds() * 1000, 3) + + def _trace_step( + self, + step: str, + agent: str, + observation: str, + sources: list[dict] | None = None, + metadata: dict[str, Any] | None = None, + ) -> TraceStep: + return TraceStep( + step=step, + agent=agent, + observation=observation, + sources=sources or [], + timestamp=datetime.now(UTC).isoformat(), + metadata=metadata, + ) + + def _trace_step_with_timing( + self, + step: str, + agent: str, + observation: str, + sources: list[dict] | None, + metadata: dict[str, Any] | None, + started_at: datetime, + finished_at: datetime, + ) -> TraceStep: + merged_metadata = dict(metadata or {}) + merged_metadata.update( + { + "started_at": self._iso(started_at), + "finished_at": self._iso(finished_at), + "duration_ms": self._duration_ms(started_at, finished_at), + } + ) + return TraceStep( + step=step, + agent=agent, + observation=observation, + sources=sources or [], + timestamp=self._iso(finished_at), + metadata=merged_metadata, + ) + + def stream_query_events(self, query: str, session_id: str) -> Generator[dict[str, Any], None, None]: + trace_id = f"trace-{uuid4().hex[:8]}" + trace: list[TraceStep] = [] + + self.memory.run_dedup_pass() + dedup_summary = self.memory.summary()["dedup_summary"] + + yield { + "event_type": "trace_started", + "trace_id": trace_id, + "status": "started", + "metadata": { + "session_id": session_id, + "dedup_summary": dedup_summary, + }, + } + + retrieval: dict[str, Any] | None = None + reasoning: dict[str, Any] | None = None + execution: dict[str, Any] | None = None + + try: + retrieval_started = self._utc_now() + retrieval = self.retrieval_swarm.run(query=query) + retrieval_finished = self._utc_now() + + retrieval_step = self._trace_step_with_timing( + step="retrieval", + agent="retrieval_swarm", + observation=( + f"Retrieved {retrieval['source_count']} sources for session {session_id} " + f"using {retrieval.get('retrieval_method', 'keyword')} retrieval." 
+ ), + sources=[ + { + "title": source["title"], + "path": source["path"], + "source_type": source.get("source_type"), + "score": source.get("score"), + } + for source in retrieval["sources"] + ], + metadata={ + "retrieval_method": retrieval.get("retrieval_method", "keyword"), + "query_tokens": retrieval.get("query_tokens", []), + "llm_query_expansion": retrieval.get("llm_query_expansion"), + "vector_db": retrieval.get("vector_db"), + }, + started_at=retrieval_started, + finished_at=retrieval_finished, + ) + trace.append(retrieval_step) + yield { + "event_type": "trace_step", + "trace_id": trace_id, + "status": "in_progress", + "step": asdict(retrieval_step), + } + + reasoning_started = self._utc_now() + reasoning = self.reasoning_swarm.run( + { + "query": query, + "sources": retrieval["sources"], + "dedup_summary": dedup_summary, + } + ) + reasoning_finished = self._utc_now() + + reasoning_step = self._trace_step_with_timing( + step="reasoning", + agent="reasoning_swarm", + observation=reasoning["reasoning"], + sources=[ + { + "title": source["title"], + "path": source["path"], + "source_type": source.get("source_type"), + "score": source.get("score"), + } + for source in reasoning["sources"] + ], + metadata={ + "confidence": reasoning.get("confidence"), + "confidence_breakdown": reasoning.get("confidence_breakdown"), + "reasoning_steps": reasoning.get("reasoning_steps", []), + "evidence_scores": reasoning.get("evidence_scores", []), + "provider": reasoning.get("provider"), + "model": reasoning.get("model"), + "action_details": reasoning.get("action_details"), + "answer": reasoning.get("answer"), + "suggested_action": reasoning.get("suggested_action"), + }, + started_at=reasoning_started, + finished_at=reasoning_finished, + ) + trace.append(reasoning_step) + yield { + "event_type": "trace_step", + "trace_id": trace_id, + "status": "in_progress", + "step": asdict(reasoning_step), + } + + execution_started = self._utc_now() + execution = self.execution_swarm.run( + trace_id=trace_id, + action=reasoning["suggested_action"], + action_details=reasoning.get("action_details"), + ) + execution_finished = self._utc_now() + + execution_step = self._trace_step_with_timing( + step="execution", + agent="execution_swarm", + observation=f"{execution['status']}: {execution['reason']}", + sources=[], + metadata={ + "risk_level": execution.get("risk_level"), + "requires_human_approval": execution.get("requires_human_approval"), + "execution_reasoning": execution.get("execution_reasoning"), + "execution_mode": execution.get("execution_mode"), + "no_write_policy": execution.get("no_write_policy"), + "provider": execution.get("provider"), + "model": execution.get("model"), + "risk_hint": execution.get("risk_hint"), + "action": execution.get("action"), + "original_action": execution.get("original_action"), + }, + started_at=execution_started, + finished_at=execution_finished, + ) + trace.append(execution_step) + yield { + "event_type": "trace_step", + "trace_id": trace_id, + "status": "in_progress", + "step": asdict(execution_step), + } + + trace_payload = [asdict(item) for item in trace] + self.memory.persist_transcript( + trace_id=trace_id, + steps=trace_payload, + dedup_summary=dedup_summary, + suggested_action=reasoning["suggested_action"], + action_details=reasoning.get("action_details"), + needs_approval=execution["requires_human_approval"], + execution_status=execution["status"], + execution_mode=execution.get("execution_mode"), + ) + + yield { + "event_type": "trace_complete", + "trace_id": trace_id, + 
"status": "completed", + "answer": reasoning["answer"], + "needs_approval": execution["requires_human_approval"], + "suggested_action": reasoning["suggested_action"], + "metadata": { + "dedup_summary": dedup_summary, + "action_details": reasoning.get("action_details"), + "execution_status": execution.get("status"), + "execution_mode": execution.get("execution_mode"), + "step_count": len(trace_payload), + }, + } + except LLMProviderError as exc: + trace_payload = [asdict(item) for item in trace] + self.memory.persist_transcript( + trace_id=trace_id, + steps=trace_payload, + dedup_summary=dedup_summary, + suggested_action=(reasoning or {}).get("suggested_action"), + action_details=(reasoning or {}).get("action_details"), + needs_approval=(execution or {}).get("requires_human_approval"), + execution_status="failed", + execution_mode=(execution or {}).get("execution_mode"), + ) + yield { + "event_type": "trace_error", + "trace_id": trace_id, + "status": "failed", + "error_code": "provider_error", + "error": str(exc), + "metadata": { + "dedup_summary": dedup_summary, + "step_count": len(trace_payload), + }, + } + except Exception as exc: + trace_payload = [asdict(item) for item in trace] + self.memory.persist_transcript( + trace_id=trace_id, + steps=trace_payload, + dedup_summary=dedup_summary, + suggested_action=(reasoning or {}).get("suggested_action"), + action_details=(reasoning or {}).get("action_details"), + needs_approval=(execution or {}).get("requires_human_approval"), + execution_status="failed", + execution_mode=(execution or {}).get("execution_mode"), + ) + yield { + "event_type": "trace_error", + "trace_id": trace_id, + "status": "failed", + "error_code": "controller_runtime_error", + "error": f"Unhandled controller error: {exc}", + "metadata": { + "dedup_summary": dedup_summary, + "step_count": len(trace_payload), + }, + } + + def handle_query(self, query: str, session_id: str) -> ControllerResult: + trace_id = f"trace-{uuid4().hex[:8]}" + trace: list[TraceStep] = [] + self.memory.run_dedup_pass() + dedup_summary = self.memory.summary()["dedup_summary"] + + orchestration = self.orchestrator.run(query=query, trace_id=trace_id, dedup_summary=dedup_summary) + retrieval = orchestration.retrieval + trace.append( + self._trace_step( + step="retrieval", + agent="retrieval_swarm", + observation=( + f"Retrieved {retrieval['source_count']} sources for session {session_id} " + f"using {retrieval.get('retrieval_method', 'keyword')} retrieval." 
+ ), + sources=[ + { + "title": source["title"], + "path": source["path"], + "source_type": source.get("source_type"), + "score": source.get("score"), + } + for source in retrieval["sources"] + ], + metadata={ + "retrieval_method": retrieval.get("retrieval_method", "keyword"), + "query_tokens": retrieval.get("query_tokens", []), + "llm_query_expansion": retrieval.get("llm_query_expansion"), + "vector_db": retrieval.get("vector_db"), + }, + ) + ) + + reasoning = orchestration.reasoning + trace.append( + self._trace_step( + step="reasoning", + agent="reasoning_swarm", + observation=reasoning["reasoning"], + sources=[ + { + "title": source["title"], + "path": source["path"], + "source_type": source.get("source_type"), + "score": source.get("score"), + } + for source in reasoning["sources"] + ], + metadata={ + "confidence": reasoning.get("confidence"), + "confidence_breakdown": reasoning.get("confidence_breakdown"), + "reasoning_steps": reasoning.get("reasoning_steps", []), + "evidence_scores": reasoning.get("evidence_scores", []), + "provider": reasoning.get("provider"), + "model": reasoning.get("model"), + "action_details": reasoning.get("action_details"), + }, + ) + ) + + execution = orchestration.execution + trace.append( + self._trace_step( + step="execution", + agent="execution_swarm", + observation=f"{execution['status']}: {execution['reason']}", + sources=[], + metadata={ + "risk_level": execution.get("risk_level"), + "requires_human_approval": execution.get("requires_human_approval"), + "execution_reasoning": execution.get("execution_reasoning"), + "execution_mode": execution.get("execution_mode"), + "no_write_policy": execution.get("no_write_policy"), + "provider": execution.get("provider"), + "model": execution.get("model"), + "risk_hint": execution.get("risk_hint"), + }, + ) + ) + + trace_payload = [asdict(item) for item in trace] + self.memory.persist_transcript( + trace_id=trace_id, + steps=trace_payload, + dedup_summary=dedup_summary, + suggested_action=reasoning["suggested_action"], + action_details=reasoning.get("action_details"), + needs_approval=execution["requires_human_approval"], + execution_status=execution["status"], + execution_mode=execution.get("execution_mode"), + ) + + return ControllerResult( + answer=reasoning["answer"], + trace_id=trace_id, + needs_approval=execution["requires_human_approval"], + suggested_action=reasoning["suggested_action"], + trace=trace_payload, + dedup_summary=dedup_summary, + ) diff --git a/backend/src/gates/permission_gate.py b/backend/src/gates/permission_gate.py new file mode 100644 index 000000000..266540501 --- /dev/null +++ b/backend/src/gates/permission_gate.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass + + +@dataclass +class PermissionRequest: + trace_id: str + action: str + + +@dataclass +class PermissionDecision: + trace_id: str + action: str + requires_human_approval: bool + risk_level: str + reason: str + + +class PermissionGate: + SAFE_ACTION_KEYWORDS = { + "summarize", + "explain", + "diagnostic", + "read-only", + "collect", + } + + HIGH_RISK_KEYWORDS = { + "rollback", + "deploy", + "delete", + "scale", + "create", + "post", + "update", + "execute", + } + + def evaluate(self, request: PermissionRequest) -> dict: + normalized = request.action.lower() + + if any(word in normalized for word in self.HIGH_RISK_KEYWORDS): + decision = PermissionDecision( + trace_id=request.trace_id, + action=request.action, + requires_human_approval=True, + risk_level="high", + reason="Action 
modifies external systems and requires explicit approval.", + ) + return asdict(decision) + + if any(word in normalized for word in self.SAFE_ACTION_KEYWORDS): + decision = PermissionDecision( + trace_id=request.trace_id, + action=request.action, + requires_human_approval=False, + risk_level="low", + reason="Action is read-only or summarization oriented.", + ) + return asdict(decision) + + decision = PermissionDecision( + trace_id=request.trace_id, + action=request.action, + requires_human_approval=True, + risk_level="medium", + reason="Action classification uncertain; defaulting to safe HITL approval.", + ) + return asdict(decision) diff --git a/backend/src/memory/three_tier_memory.py b/backend/src/memory/three_tier_memory.py new file mode 100644 index 000000000..1eb9551c3 --- /dev/null +++ b/backend/src/memory/three_tier_memory.py @@ -0,0 +1,293 @@ +from __future__ import annotations + +import json +from datetime import UTC, datetime +from dataclasses import dataclass +import hashlib +import os +from pathlib import Path +import tempfile +import time +from typing import Any + + +@dataclass +class MemoryDocument: + title: str + path: str + source_type: str + content: str + + +class ThreeTierMemory: + _runtime_documents: list[MemoryDocument] = [] + + def __init__(self) -> None: + self.index_layer = "MEMORY.MD" + self.docs_layer = "markdown" + self.transcript_layer = "json" + self.repo_root = Path(__file__).resolve().parents[3] + self.data_root = self.repo_root / "data" + self.transcript_root = self.repo_root / "backend" / ".uniops" / "transcripts" + self.approval_root = self.repo_root / "backend" / ".uniops" / "approvals" + self.transcript_root.mkdir(parents=True, exist_ok=True) + self.approval_root.mkdir(parents=True, exist_ok=True) + self._documents_cache: list[MemoryDocument] | None = None + self._last_dedup_report: dict[str, Any] = { + "documents": {"scanned": 0, "duplicates": 0, "retained": [], "duplicate_map": []}, + "transcripts": {"scanned": 0, "duplicates": 0, "retained": [], "duplicate_map": []}, + "deduped_count": 0, + "last_run_at": None, + } + + def _atomic_write_json(self, target: Path, payload: dict[str, Any]) -> None: + target.parent.mkdir(parents=True, exist_ok=True) + with tempfile.NamedTemporaryFile("w", encoding="utf-8", dir=target.parent, delete=False) as temp_file: + json.dump(payload, temp_file, indent=2) + temp_file.flush() + os.fsync(temp_file.fileno()) + temp_path = Path(temp_file.name) + temp_path.replace(target) + + def _read_json_file(self, target: Path) -> dict[str, Any] | None: + if not target.exists(): + return None + try: + return json.loads(target.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return None + + def summary(self) -> dict[str, Any]: + documents = self.load_documents() + dedup_total_scanned = self._last_dedup_report["documents"]["scanned"] + self._last_dedup_report["transcripts"]["scanned"] + dedup_ratio = 0.0 + if dedup_total_scanned > 0: + dedup_ratio = round(self._last_dedup_report["deduped_count"] / dedup_total_scanned, 4) + + return { + "index": self.index_layer, + "documents": self.docs_layer, + "transcripts": self.transcript_layer, + "document_count": len(documents), + "dedup_summary": { + "documents": { + "scanned": self._last_dedup_report["documents"]["scanned"], + "duplicates": self._last_dedup_report["documents"]["duplicates"], + }, + "transcripts": { + "scanned": self._last_dedup_report["transcripts"]["scanned"], + "duplicates": self._last_dedup_report["transcripts"]["duplicates"], + }, + "deduped_count": 
self._last_dedup_report["deduped_count"], + "duplication_ratio": dedup_ratio, + "last_run_at": self._last_dedup_report["last_run_at"], + }, + } + + def load_documents(self, force_reload: bool = False) -> list[MemoryDocument]: + if self._documents_cache is not None and not force_reload: + return self._documents_cache + + collected: list[MemoryDocument] = [] + source_dirs = ["confluence", "runbooks", "incidents", "github", "slack"] + for source_dir in source_dirs: + folder = self.data_root / source_dir + if not folder.exists(): + continue + + for file_path in folder.glob("**/*"): + if file_path.is_dir() or file_path.suffix.lower() not in {".md", ".json"}: + continue + + try: + content = file_path.read_text(encoding="utf-8") + except OSError: + continue + + rel_path = file_path.relative_to(self.repo_root).as_posix() + collected.append( + MemoryDocument( + title=file_path.stem.replace("-", " ").title(), + path=rel_path, + source_type=source_dir, + content=content, + ) + ) + + collected.extend(self.__class__._runtime_documents) + + self._documents_cache = collected + return collected + + def ingest_runtime_document(self, document: MemoryDocument) -> None: + runtime_documents = self.__class__._runtime_documents + runtime_documents[:] = [ + item for item in runtime_documents if not (item.path == document.path and item.source_type == document.source_type) + ] + runtime_documents.append(document) + self._documents_cache = None + + def persist_transcript( + self, + trace_id: str, + steps: list[dict[str, Any]], + dedup_summary: dict[str, Any] | None = None, + suggested_action: str | None = None, + action_details: dict[str, Any] | None = None, + needs_approval: bool | None = None, + execution_status: str | None = None, + execution_mode: str | None = None, + ) -> None: + target = self.transcript_root / f"{trace_id}.json" + payload = { + "trace_id": trace_id, + "steps": steps, + } + if dedup_summary is not None: + payload["dedup_summary"] = dedup_summary + if suggested_action is not None: + payload["suggested_action"] = suggested_action + if action_details is not None: + payload["action_details"] = action_details + if needs_approval is not None: + payload["needs_approval"] = needs_approval + if execution_status is not None: + payload["execution_status"] = execution_status + if execution_mode is not None: + payload["execution_mode"] = execution_mode + self._atomic_write_json(target, payload) + + def persist_approval_decision( + self, + trace_id: str, + approval: dict[str, Any], + execution_result: dict[str, Any], + final_status: str, + execution_mode: str | None = None, + ) -> None: + approval_target = self.approval_root / f"{trace_id}.json" + payload = { + "trace_id": trace_id, + "approval": approval, + "execution_result": execution_result, + "final_status": final_status, + } + if execution_mode is not None: + payload["execution_mode"] = execution_mode + self._atomic_write_json(approval_target, payload) + + transcript = self.get_transcript(trace_id) or {"trace_id": trace_id, "steps": []} + transcript["approval"] = approval + transcript["execution_result"] = execution_result + transcript["final_status"] = final_status + if execution_mode is not None: + transcript["execution_mode"] = execution_mode + + approval_step = { + "step": "approval", + "agent": "approval_router", + "observation": f"{approval.get('decision', 'unknown')}: {execution_result.get('status', 'unknown')}", + "sources": [], + "timestamp": datetime.now(UTC).isoformat(), + } + steps = transcript.get("steps", []) + steps = [step for step in 
steps if step.get("step") != "approval"] + steps.append(approval_step) + transcript["steps"] = steps + + transcript_target = self.transcript_root / f"{trace_id}.json" + self._atomic_write_json(transcript_target, transcript) + + def get_approval_decision(self, trace_id: str) -> dict[str, Any] | None: + target = self.approval_root / f"{trace_id}.json" + return self._read_json_file(target) + + def get_transcript(self, trace_id: str) -> dict[str, Any] | None: + target = self.transcript_root / f"{trace_id}.json" + return self._read_json_file(target) + + def wait_for_transcript( + self, + trace_id: str, + timeout_seconds: float = 0.0, + poll_interval_seconds: float = 0.05, + ) -> dict[str, Any] | None: + if timeout_seconds <= 0: + return self.get_transcript(trace_id) + + deadline = time.monotonic() + timeout_seconds + while True: + transcript = self.get_transcript(trace_id) + if transcript is not None: + return transcript + + remaining = deadline - time.monotonic() + if remaining <= 0: + return None + + time.sleep(min(poll_interval_seconds, remaining)) + + def run_dedup_pass(self) -> dict[str, Any]: + def normalize_text(text: str) -> str: + return " ".join(text.split()).lower() + + document_signature_to_retained: dict[str, str] = {} + document_duplicates: list[dict[str, str]] = [] + retained_documents: list[str] = [] + + documents = sorted(self.load_documents(force_reload=True), key=lambda item: item.path) + for document in documents: + signature = hashlib.sha256(normalize_text(document.content).encode("utf-8")).hexdigest() + retained_path = document_signature_to_retained.get(signature) + if retained_path is None: + document_signature_to_retained[signature] = document.path + retained_documents.append(document.path) + continue + + document_duplicates.append({"duplicate": document.path, "retained": retained_path}) + + transcript_signature_to_retained: dict[str, str] = {} + transcript_duplicates: list[dict[str, str]] = [] + retained_transcripts: list[str] = [] + transcript_paths = sorted(self.transcript_root.glob("*.json"), key=lambda item: item.name) + + for transcript_path in transcript_paths: + try: + payload = json.loads(transcript_path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + continue + + trace_id = str(payload.get("trace_id", transcript_path.stem)) + steps_payload = payload.get("steps", []) + canonical_steps = json.dumps(steps_payload, sort_keys=True, separators=(",", ":")) + signature = hashlib.sha256(canonical_steps.encode("utf-8")).hexdigest() + + retained_trace_id = transcript_signature_to_retained.get(signature) + if retained_trace_id is None: + transcript_signature_to_retained[signature] = trace_id + retained_transcripts.append(trace_id) + continue + + transcript_duplicates.append({"duplicate": trace_id, "retained": retained_trace_id}) + + dedup_report = { + "documents": { + "scanned": len(documents), + "duplicates": len(document_duplicates), + "retained": retained_documents, + "duplicate_map": document_duplicates, + }, + "transcripts": { + "scanned": len(transcript_paths), + "duplicates": len(transcript_duplicates), + "retained": retained_transcripts, + "duplicate_map": transcript_duplicates, + }, + "deduped_count": len(document_duplicates) + len(transcript_duplicates), + "last_run_at": datetime.now(UTC).isoformat(), + } + self._last_dedup_report = dedup_report + return dedup_report + + def get_last_dedup_report(self) -> dict[str, Any]: + return self._last_dedup_report diff --git a/backend/src/swarms/execution_swarm.py 
b/backend/src/swarms/execution_swarm.py new file mode 100644 index 000000000..72983cd3d --- /dev/null +++ b/backend/src/swarms/execution_swarm.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import os +from typing import Any + +from src.adapters.llm_client import LLMProviderError, LLMProviderRuntimeError, ReasoningLLMClient +from src.gates.permission_gate import PermissionGate, PermissionRequest + + +class ExecutionSwarm: + def __init__( + self, + permission_gate: PermissionGate, + provider_name: str | None = None, + llm_client: ReasoningLLMClient | None = None, + ) -> None: + self.permission_gate = permission_gate + self.provider_name = (provider_name if provider_name is not None else os.getenv("LLM_PROVIDER", "")).strip().lower() + self._llm_client = llm_client + + def _get_llm_client(self) -> ReasoningLLMClient: + if self._llm_client is None: + raise LLMProviderRuntimeError("No execution provider client is configured for this request.") + return self._llm_client + + def _llm_assess_action(self, action: str, action_details: dict[str, Any] | None) -> dict[str, Any]: + client = self._get_llm_client() + try: + return client.assess_execution_action(action=action, action_details=action_details) + except LLMProviderError: + raise + except Exception as exc: + raise LLMProviderRuntimeError(f"{self.provider_name} execution assessment failed: {exc}") from exc + + def run(self, trace_id: str, action: str, action_details: dict[str, Any] | None = None) -> dict: + if not self.provider_name: + raise LLMProviderRuntimeError("Execution requires a configured LLM provider.") + + assessment = self._llm_assess_action(action=action, action_details=action_details) + normalized_action = str(assessment.get("normalized_action") or action) + execution_reasoning = str(assessment.get("reasoning") or "Execution assessment completed via provider.") + risk_hint = assessment.get("risk_hint") + model_name = getattr(self._llm_client, "model_name", "unknown") if self._llm_client else "unknown" + + decision = self.permission_gate.evaluate(PermissionRequest(trace_id=trace_id, action=normalized_action)) + if decision["requires_human_approval"]: + return { + "action": normalized_action, + "original_action": action, + "action_details": action_details or {}, + "status": "pending_approval", + "execution_mode": "planner_only", + "no_write_policy": True, + "requires_human_approval": True, + "risk_level": decision["risk_level"], + "reason": decision["reason"], + "execution_reasoning": execution_reasoning, + "provider": self.provider_name, + "model": model_name, + "risk_hint": risk_hint, + } + + return { + "action": normalized_action, + "original_action": action, + "action_details": action_details or {}, + "status": "plan_generated", + "execution_mode": "planner_only", + "no_write_policy": True, + "requires_human_approval": False, + "risk_level": decision["risk_level"], + "reason": decision["reason"], + "execution_reasoning": execution_reasoning, + "provider": self.provider_name, + "model": model_name, + "risk_hint": risk_hint, + } diff --git a/backend/src/swarms/reasoning_swarm.py b/backend/src/swarms/reasoning_swarm.py new file mode 100644 index 000000000..7a32d73b0 --- /dev/null +++ b/backend/src/swarms/reasoning_swarm.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +import os +from typing import Any + +from src.adapters.llm_client import ( + LLMProviderError, + LLMProviderRuntimeError, + ReasoningLLMClient, + create_reasoning_llm_client, +) + + +class ReasoningSwarm: + SOURCE_TYPE_WEIGHT = { + "runbooks": 
0.25, + "confluence": 0.2, + "incidents": 0.15, + "github": 0.1, + "slack": 0.05, + } + + def __init__( + self, + provider_name: str | None = None, + llm_client: ReasoningLLMClient | None = None, + ) -> None: + self.provider_name = (provider_name if provider_name is not None else os.getenv("LLM_PROVIDER", "")).strip().lower() + self._llm_client = llm_client + + def _get_llm_client(self) -> ReasoningLLMClient: + if self._llm_client is None: + self._llm_client = create_reasoning_llm_client(self.provider_name) + if self._llm_client is None: + raise LLMProviderRuntimeError("No reasoning provider is configured for this request.") + return self._llm_client + + def _source_priority(self, source: dict[str, Any]) -> float: + score = float(source.get("score", 0.0) or 0.0) + source_type = str(source.get("source_type", "")).lower() + weight = self.SOURCE_TYPE_WEIGHT.get(source_type, 0.0) + return score + weight + + def _rank_sources(self, sources: list[dict[str, Any]]) -> list[dict[str, Any]]: + return sorted( + sources, + key=lambda source: (self._source_priority(source), str(source.get("title", "")).lower()), + reverse=True, + ) + + def _confidence(self, source_count: int, dedup_summary: dict[str, Any] | None) -> float: + base_confidence = min(0.95, 0.45 + (0.1 * source_count)) + if dedup_summary is None: + return round(base_confidence, 3) + + duplication_ratio = float(dedup_summary.get("duplication_ratio", 0.0) or 0.0) + deduped_count = int(dedup_summary.get("deduped_count", 0) or 0) + quality_bonus = max(0.0, 0.08 * (1.0 - duplication_ratio)) + duplicate_penalty = min(0.12, duplication_ratio * 0.25) + clean_evidence_bonus = 0.02 if deduped_count == 0 else 0.0 + + tuned_confidence = base_confidence + quality_bonus + clean_evidence_bonus - duplicate_penalty + return round(max(0.2, min(0.97, tuned_confidence)), 3) + + def _confidence_breakdown(self, source_count: int, dedup_summary: dict[str, Any] | None) -> dict[str, Any]: + base_confidence = min(0.95, 0.45 + (0.1 * source_count)) + if dedup_summary is None: + return { + "base_confidence": round(base_confidence, 3), + "quality_bonus": 0.0, + "duplicate_penalty": 0.0, + "clean_evidence_bonus": 0.0, + "final_confidence": round(base_confidence, 3), + } + + duplication_ratio = float(dedup_summary.get("duplication_ratio", 0.0) or 0.0) + deduped_count = int(dedup_summary.get("deduped_count", 0) or 0) + quality_bonus = max(0.0, 0.08 * (1.0 - duplication_ratio)) + duplicate_penalty = min(0.12, duplication_ratio * 0.25) + clean_evidence_bonus = 0.02 if deduped_count == 0 else 0.0 + final_confidence = round(max(0.2, min(0.97, base_confidence + quality_bonus + clean_evidence_bonus - duplicate_penalty)), 3) + + return { + "base_confidence": round(base_confidence, 3), + "quality_bonus": round(quality_bonus, 3), + "duplicate_penalty": round(duplicate_penalty, 3), + "clean_evidence_bonus": round(clean_evidence_bonus, 3), + "duplication_ratio": round(duplication_ratio, 3), + "final_confidence": final_confidence, + } + + def _reasoning_steps( + self, + query: str, + confidence: float, + top_sources: list[dict[str, Any]], + dedup_summary: dict[str, Any] | None, + ) -> list[str]: + source_types = sorted({str(source.get("source_type", "unknown")).lower() for source in top_sources}) + dedup_ratio = float((dedup_summary or {}).get("duplication_ratio", 0.0) or 0.0) + steps = [ + f"Parsed incident intent from query with {len(query.split())} tokens.", + f"Ranked top evidence sources by source-type weighting: {', '.join(source_types) or 'none' }.", + f"Adjusted confidence 
using dedup signals (duplication_ratio={dedup_ratio:.3f}).", + f"Selected action policy based on confidence={confidence:.3f} and operational intent.", + ] + return steps + + def _evidence_scores(self, top_sources: list[dict[str, Any]]) -> list[dict[str, Any]]: + scores: list[dict[str, Any]] = [] + for source in top_sources: + raw_score = float(source.get("score", 0.0) or 0.0) + priority_score = round(self._source_priority(source), 3) + scores.append( + { + "title": source.get("title", ""), + "path": source.get("path", ""), + "source_type": source.get("source_type", "unknown"), + "raw_score": raw_score, + "priority_score": priority_score, + } + ) + return scores + + def _suggest_action(self, query: str, confidence: float, top_sources: list[dict[str, Any]]) -> str: + normalized = query.lower() + if any(word in normalized for word in ["rollback", "revert", "jira", "slack", "deploy", "pr"]): + return "create rollback PR and notify Slack and Jira" + + operational_intent = any(word in normalized for word in ["cpu", "latency", "incident", "redis"]) + top_source_types = {str(source.get("source_type", "")).lower() for source in top_sources} + has_runbook_evidence = any(source_type in {"runbooks", "confluence", "incidents"} for source_type in top_source_types) + + if operational_intent and confidence >= 0.55 and has_runbook_evidence: + return "run high CPU diagnostic runbook in read-only mode" + + if confidence < 0.5: + return "collect additional incident context and request approval before external actions" + + return "summarize findings and request approval for external actions" + + def _build_action_details(self, suggested_action: str) -> dict[str, Any]: + normalized = suggested_action.lower() + if "rollback" in normalized or "revert" in normalized or "pr" in normalized: + return { + "intent": "rollback_and_notify", + "tool": "planner.rollback_and_notify", + "parameters": { + "notify_channels": ["slack", "jira"], + }, + "approval_required": True, + "risk_hint": "high", + } + + if "diagnostic" in normalized or "read-only" in normalized: + return { + "intent": "run_diagnostic", + "tool": "planner.run_diagnostic", + "parameters": { + "mode": "read-only", + }, + "approval_required": False, + "risk_hint": "low", + } + + if "collect additional" in normalized: + return { + "intent": "collect_context", + "tool": "planner.collect_context", + "parameters": {}, + "approval_required": True, + "risk_hint": "medium", + } + + return { + "intent": "summarize_and_request_approval", + "tool": "planner.summarize_and_request_approval", + "parameters": {}, + "approval_required": True, + "risk_hint": "medium", + } + + def _llm_reasoning( + self, + query: str, + confidence: float, + top_sources: list[dict[str, Any]], + dedup_summary: dict[str, Any] | None, + ) -> dict[str, Any]: + client = self._get_llm_client() + try: + llm_result = client.reason( + query=query, + confidence=confidence, + top_sources=top_sources, + dedup_summary=dedup_summary, + ) + except LLMProviderError: + raise + except Exception as exc: + raise LLMProviderRuntimeError(f"{self.provider_name} provider request failed: {exc}") from exc + + return { + "reasoning": llm_result["reasoning"], + "answer": llm_result["answer"], + "suggested_action": llm_result["suggested_action"], + "action_details": llm_result.get("action_details") + or self._build_action_details(llm_result["suggested_action"]), + "confidence": confidence, + "confidence_breakdown": llm_result.get("confidence_breakdown") + or self._confidence_breakdown(source_count=len(top_sources), 
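+ # Provider-supplied breakdown wins; this locally computed one is only the fallback.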
dedup_summary=dedup_summary), + "reasoning_steps": llm_result.get("reasoning_steps") + or self._reasoning_steps( + query=query, + confidence=confidence, + top_sources=top_sources, + dedup_summary=dedup_summary, + ), + "evidence_scores": llm_result.get("evidence_scores") or self._evidence_scores(top_sources), + "sources": top_sources, + "provider": getattr(client, "provider_name", self.provider_name), + "model": getattr(client, "model_name", "unknown"), + } + + def run(self, context: dict[str, Any]) -> dict[str, Any]: + query = context.get("query", "") + sources = context.get("sources", []) + dedup_summary = context.get("dedup_summary") + + if not sources: + raise LLMProviderRuntimeError( + "No indexed evidence was found; ingest more operational context before reasoning." + ) + + ranked_sources = self._rank_sources(sources) + top_sources = ranked_sources[:3] + confidence = self._confidence(source_count=len(sources), dedup_summary=dedup_summary) + return self._llm_reasoning( + query=query, + confidence=confidence, + top_sources=top_sources, + dedup_summary=dedup_summary, + ) diff --git a/backend/src/swarms/retrieval_swarm.py b/backend/src/swarms/retrieval_swarm.py new file mode 100644 index 000000000..d6d24809b --- /dev/null +++ b/backend/src/swarms/retrieval_swarm.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import os +import re +from typing import Any + +from src.adapters.llm_client import LLMProviderError, LLMProviderRuntimeError, ReasoningLLMClient +from src.memory.three_tier_memory import MemoryDocument, ThreeTierMemory +from src.vector_store.llamaindex_hybrid import LlamaIndexHybridService + + +class RetrievalSwarm: + def __init__( + self, + memory: ThreeTierMemory, + max_sources: int = 4, + provider_name: str | None = None, + llm_client: ReasoningLLMClient | None = None, + ) -> None: + self.memory = memory + self.max_sources = max_sources + self.semantic_service = LlamaIndexHybridService() + self.provider_name = (provider_name if provider_name is not None else os.getenv("LLM_PROVIDER", "")).strip().lower() + self._llm_client = llm_client + + def _get_llm_client(self) -> ReasoningLLMClient: + if self._llm_client is None: + raise LLMProviderRuntimeError("No retrieval provider client is configured for this request.") + return self._llm_client + + def _tokenize(self, query: str) -> list[str]: + return [token for token in re.findall(r"[a-z0-9]+", query.lower()) if len(token) > 2] + + def _score(self, doc: MemoryDocument, query_tokens: list[str]) -> int: + haystack = f"{doc.title} {doc.content}".lower() + return sum(haystack.count(token) for token in query_tokens) + + def _keyword_retrieve(self, query: str, query_tokens: list[str] | None = None) -> dict: + active_query_tokens = query_tokens if query_tokens is not None else self._tokenize(query) + docs = self.memory.load_documents() + ranked: list[tuple[int, MemoryDocument]] = [] + for doc in docs: + ranked.append((self._score(doc, active_query_tokens), doc)) + + ranked.sort(key=lambda item: item[0], reverse=True) + top_ranked = ranked[: self.max_sources] + if top_ranked and top_ranked[0][0] == 0: + top_ranked = ranked[: min(len(ranked), 2)] + + sources: list[dict] = [] + for score, doc in top_ranked: + snippet = " ".join(doc.content.strip().split())[:220] + sources.append( + { + "title": doc.title, + "path": doc.path, + "source_type": doc.source_type, + "snippet": snippet, + "score": score, + } + ) + + return { + "query": query, + "sources": sources, + "source_count": len(sources), + "retrieval_method": "keyword", + 
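+ # Echo the tokens used for scoring so run() can diff them against LLM-expanded terms.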
"query_tokens": active_query_tokens, + "documents": docs, + } + + def _merge_sources(self, primary: list[dict[str, Any]], secondary: list[dict[str, Any]]) -> list[dict[str, Any]]: + merged: list[dict[str, Any]] = [] + seen_keys: set[tuple[str, str]] = set() + + for source in primary + secondary: + key = (str(source.get("path", "")), str(source.get("title", ""))) + if key in seen_keys: + continue + seen_keys.add(key) + merged.append(source) + if len(merged) >= self.max_sources: + break + + return merged + + def _llm_expand_query_terms(self, query: str, query_tokens: list[str]) -> list[str]: + client = self._get_llm_client() + try: + expanded_terms = client.expand_query_terms(query=query, query_tokens=query_tokens) + except LLMProviderError: + raise + except Exception as exc: + raise LLMProviderRuntimeError(f"{self.provider_name} retrieval expansion failed: {exc}") from exc + + combined_terms: list[str] = [] + seen_terms: set[str] = set() + for token in [*query_tokens, *expanded_terms]: + normalized = str(token).strip().lower() + if len(normalized) < 3: + continue + if normalized in seen_terms: + continue + seen_terms.add(normalized) + combined_terms.append(normalized) + + return combined_terms + + def run(self, query: str) -> dict: + if not self.provider_name: + raise LLMProviderRuntimeError("Retrieval requires a configured LLM provider.") + + keyword_result = self._keyword_retrieve(query=query) + keyword_sources = keyword_result["sources"] + query_tokens = keyword_result.get("query_tokens", []) + expanded_query_tokens = self._llm_expand_query_terms(query=query, query_tokens=query_tokens) + llm_query_expansion = { + "used": expanded_query_tokens != query_tokens, + "provider": self.provider_name, + "model": getattr(self._llm_client, "model_name", "unknown") if self._llm_client else "unknown", + "expanded_query_tokens": expanded_query_tokens, + } + + if expanded_query_tokens != query_tokens: + expanded_keyword_result = self._keyword_retrieve(query=query, query_tokens=expanded_query_tokens) + keyword_sources = self._merge_sources(keyword_sources, expanded_keyword_result["sources"]) + + semantic_result = self.semantic_service.run( + query=query, + max_sources=self.max_sources, + keyword_sources=keyword_sources, + source_documents=keyword_result.get("documents", []), + ) + + sources = semantic_result["sources"] + return { + "query": query, + "sources": sources, + "source_count": len(sources), + "retrieval_method": semantic_result["retrieval_method"], + "query_tokens": llm_query_expansion["expanded_query_tokens"], + "llm_query_expansion": llm_query_expansion, + "vector_db": semantic_result.get("vector_db", {}), + } diff --git a/backend/src/tools/confluence_tool_adapter.py b/backend/src/tools/confluence_tool_adapter.py new file mode 100644 index 000000000..cfb297eb7 --- /dev/null +++ b/backend/src/tools/confluence_tool_adapter.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from src.adapters.confluence_client import ConfluenceClient +from src.tools.registry import ToolRegistryError + + +class ConfluenceToolAdapter: + def __init__(self, *, client: ConfluenceClient | None = None) -> None: + self._client = client + + @classmethod + def from_env(cls) -> "ConfluenceToolAdapter": + return cls(client=ConfluenceClient.from_env()) + + def fetch_page(self, *, page_id: str) -> dict[str, str]: + if not page_id.strip(): + raise ToolRegistryError("page_id is required for Confluence page fetch") + + client = self._client or ConfluenceClient.from_env() + + try: + payload = 
client.fetch_page(page_id=page_id) + except Exception as exc: + raise ToolRegistryError(f"Confluence page fetch failed: {exc}") from exc + + title = str(payload.get("title", f"Confluence Page {page_id}")) + source_url = str(payload.get("source_url", "")) + return { + "status": "executed", + "output": f"Fetched Confluence page {page_id}: {title}.", + "page_id": page_id, + "title": title, + "source_url": source_url, + } diff --git a/backend/src/tools/executor.py b/backend/src/tools/executor.py new file mode 100644 index 000000000..a1fb13231 --- /dev/null +++ b/backend/src/tools/executor.py @@ -0,0 +1,444 @@ +from __future__ import annotations + +from datetime import UTC, datetime +import os +import re +from dataclasses import dataclass +from typing import Any + + +from src.adapters.iris_client import IrisClient, IrisClientError +from src.tools.confluence_tool_adapter import ConfluenceToolAdapter +from src.tools.github_adapter import GitHubAdapter +from src.tools.jira_adapter import JiraAdapter +from src.tools.registry import ToolRegistry, ToolRegistryError +from src.tools.slack_adapter import SlackAdapter + + +@dataclass(frozen=True) +class ToolInvocation: + name: str + params: dict[str, Any] + + +class ToolExecutor: + def __init__(self, registry: ToolRegistry | None = None) -> None: + self.registry = registry or self._build_registry() + + def _build_registry(self) -> ToolRegistry: + registry = ToolRegistry() + registry.register_tool( + name="github.fetch_issue", + description="Fetch GitHub issue details in read-only mode.", + read_only=True, + handler=self._execute_github_fetch_issue, + ) + registry.register_tool( + name="slack.fetch_channel_messages", + description="Fetch Slack channel context in read-only mode.", + read_only=True, + handler=self._execute_slack_fetch_channel_messages, + ) + registry.register_tool( + name="slack.fetch_thread_messages", + description="Fetch Slack thread context in read-only mode.", + read_only=True, + handler=self._execute_slack_fetch_thread_messages, + ) + registry.register_tool( + name="jira.fetch_issue", + description="Fetch Jira issue details in read-only mode.", + read_only=True, + handler=self._execute_jira_fetch_issue, + ) + registry.register_tool( + name="confluence.fetch_page", + description="Fetch a Confluence page in read-only mode.", + read_only=True, + handler=self._execute_confluence_fetch_page, + ) + registry.register_tool( + name="iris.create_incident", + description="Create an incident in IRIS after human approval.", + read_only=False, + handler=self._execute_iris_create_incident, + ) + return registry + + def _required_env(self, key: str) -> str: + value = os.getenv(key, "").strip() + if not value: + raise ToolRegistryError(f"{key} is not configured") + return value + + def _required_int_env(self, key: str) -> int: + raw_value = self._required_env(key) + try: + parsed = int(raw_value) + except ValueError as exc: + raise ToolRegistryError(f"{key} must be an integer") from exc + + if parsed <= 0: + raise ToolRegistryError(f"{key} must be a positive integer") + return parsed + + def _optional_int_env(self, key: str, default: int) -> int: + raw_value = os.getenv(key, "").strip() + if not raw_value: + return default + + try: + parsed = int(raw_value) + except ValueError as exc: + raise ToolRegistryError(f"{key} must be an integer") from exc + + if parsed <= 0: + raise ToolRegistryError(f"{key} must be a positive integer") + return parsed + + def _extract_jira_issue_key(self, action: str) -> str: + match = re.search(r"\b([A-Z][A-Z0-9]*-\d+)\b", 
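+ # Uppercasing lets keys typed as "proj-123" still match the PROJECT-123 pattern.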
action.upper()) + if match is None: + raise ToolRegistryError( + "No Jira issue key found in approved action text (expected format PROJECT-123)" + ) + return match.group(1) + + def _build_invocations(self, action: str) -> list[ToolInvocation]: + normalized = action.lower() + invocations: list[ToolInvocation] = [] + + if "incident" in normalized and any(keyword in normalized for keyword in ["create", "open", "raise"]): + invocations.append( + ToolInvocation( + name="iris.create_incident", + params=self._build_iris_create_incident_params(action), + ) + ) + + has_pull_request_token = bool(re.search(r"\bpr\b", normalized)) + if "rollback" in normalized or has_pull_request_token or "github" in normalized: + invocations.append( + ToolInvocation( + name="github.fetch_issue", + params={ + "repository": self._required_env("GITHUB_REPOSITORY"), + "issue_number": self._required_int_env("GITHUB_ISSUE_NUMBER"), + }, + ) + ) + + if any(keyword in normalized for keyword in ["slack", "post", "notify", "channel"]): + is_thread_fetch = any(keyword in normalized for keyword in ["thread", "reply", "conversation"]) + if is_thread_fetch: + invocations.append( + ToolInvocation( + name="slack.fetch_thread_messages", + params={ + "channel": self._required_env("SLACK_CHANNEL_ID"), + "thread_ts": self._required_env("SLACK_THREAD_TS"), + "limit": self._optional_int_env("SLACK_CONTEXT_LIMIT", 20), + }, + ) + ) + else: + invocations.append( + ToolInvocation( + name="slack.fetch_channel_messages", + params={ + "channel": self._required_env("SLACK_CHANNEL_ID"), + "limit": self._optional_int_env("SLACK_CONTEXT_LIMIT", 20), + }, + ) + ) + + if any(keyword in normalized for keyword in ["jira", "ticket", "issue", "update"]): + invocations.append( + ToolInvocation( + name="jira.fetch_issue", + params={ + "issue_key": self._extract_jira_issue_key(action), + }, + ) + ) + + if any(keyword in normalized for keyword in ["confluence", "runbook", "page"]): + invocations.append( + ToolInvocation( + name="confluence.fetch_page", + params={ + "page_id": self._required_env("CONFLUENCE_PAGE_ID"), + }, + ) + ) + + if not invocations: + raise ToolRegistryError("No registered tool mapping found for the approved action") + + return invocations + + def _build_iris_create_incident_params(self, action: str) -> dict[str, Any]: + normalized = action.lower() + + severity = "medium" + if any(keyword in normalized for keyword in ["critical", "sev-1", "sev1", "p0"]): + severity = "critical" + elif any(keyword in normalized for keyword in ["high", "sev-2", "sev2", "p1"]): + severity = "high" + elif any(keyword in normalized for keyword in ["low", "sev-4", "sev4", "p3"]): + severity = "low" + + tags: list[str] = [] + for tag in ["redis", "latency", "slack", "jira", "rollback", "deployment"]: + if tag in normalized and tag not in tags: + tags.append(tag) + + case_name = os.getenv("IRIS_DEFAULT_CASE_NAME", "UniOps Approved Incident") + quoted = re.findall(r'"([^"]+)"', action) + if quoted: + case_name = quoted[0].strip() + elif "for " in normalized: + candidate = action.split("for ", 1)[1].strip() + if candidate: + case_name = candidate[:120] + + return { + "case_name": case_name, + "case_description": f"Created from approved UniOps action: {action}", + "severity": severity, + "tags": tags, + "case_customer": self._optional_int_env("IRIS_CASE_CUSTOMER_ID", 1), + "case_soc_id": os.getenv("IRIS_CASE_SOC_ID", "").strip(), + } + + def _failure_payload( + self, + *, + action: str, + error: str, + details: list[dict[str, Any]], + timestamp: str, + ) -> 
dict[str, Any]: + tool_name = details[-1]["tool"] if details else "tool.registry.batch" + return { + "tool": tool_name, + "status": "failed", + "output": f"Execution failed for action '{action}': {error}", + "timestamp": timestamp, + "details": details, + } + + def execute(self, action: str, action_details: dict[str, Any] | None = None) -> dict[str, Any]: + timestamp = datetime.now(UTC).isoformat() + details: list[dict[str, Any]] = [] + effective_action = action + + if not effective_action and isinstance(action_details, dict): + effective_action = str(action_details.get("intent", "")).strip() + + if not effective_action: + return self._failure_payload( + action=action, + error="No approved action text supplied", + details=details, + timestamp=timestamp, + ) + + try: + invocations = self._build_invocations(effective_action) + except ToolRegistryError as exc: + return self._failure_payload(action=effective_action, error=str(exc), details=details, timestamp=timestamp) + + for invocation in invocations: + try: + result = self.registry.execute_tool(invocation.name, invocation.params) + except ToolRegistryError as exc: + details.append( + { + "tool": invocation.name, + "status": "failed", + "output": str(exc), + } + ) + return self._failure_payload( + action=effective_action, + error=str(exc), + details=details, + timestamp=timestamp, + ) + + details.append(result) + if str(result.get("status", "")).lower() != "executed": + return self._failure_payload( + action=effective_action, + error=str(result.get("output", "tool returned non-executed status")), + details=details, + timestamp=timestamp, + ) + + return { + "tool": "tool.registry.batch" if len(details) > 1 else details[0]["tool"], + "status": "executed", + "output": f"Executed {len(details)} tool action(s) successfully.", + "timestamp": timestamp, + "details": details, + } + + def _execute_github_fetch_issue(self, params: dict[str, Any]) -> dict[str, Any]: + adapter = GitHubAdapter.from_env() + return adapter.fetch_issue( + repository=str(params.get("repository", "")), + issue_number=int(params.get("issue_number", 0)), + ) + + def _execute_slack_fetch_channel_messages(self, params: dict[str, Any]) -> dict[str, Any]: + adapter = SlackAdapter.from_env() + return adapter.fetch_channel_messages( + channel=str(params.get("channel", "")), + limit=int(params.get("limit", 20)), + ) + + def _execute_slack_fetch_thread_messages(self, params: dict[str, Any]) -> dict[str, Any]: + adapter = SlackAdapter.from_env() + return adapter.fetch_thread_messages( + channel=str(params.get("channel", "")), + thread_ts=str(params.get("thread_ts", "")), + limit=int(params.get("limit", 20)), + ) + + def _execute_jira_fetch_issue(self, params: dict[str, Any]) -> dict[str, Any]: + adapter = JiraAdapter.from_env() + return adapter.fetch_issue( + issue_key=str(params.get("issue_key", "")), + ) + + def _execute_confluence_fetch_page(self, params: dict[str, Any]) -> dict[str, Any]: + adapter = ConfluenceToolAdapter.from_env() + return adapter.fetch_page(page_id=str(params.get("page_id", ""))) + + def _execute_iris_create_incident(self, params: dict[str, Any]) -> dict[str, Any]: + try: + adapter = IrisClient.from_env() + created = adapter.create_incident( + case_name=str(params.get("case_name", "")).strip(), + case_description=str(params.get("case_description", "")).strip(), + severity=str(params.get("severity", "medium")), + tags=list(params.get("tags", [])), + case_customer=int(params.get("case_customer", 1)), + case_soc_id=str(params.get("case_soc_id", "")), + ) + except 
(IrisClientError, ValueError) as exc: + raise ToolRegistryError(f"IRIS incident creation failed: {exc}") from exc + + return { + "status": "executed", + "output": f"Created IRIS incident {created.get('case_id', 'unknown')}", + "incident": { + "case_id": created.get("case_id"), + "report_url": created.get("report_url"), + "case_name": created.get("case_name"), + "severity": created.get("severity"), + }, + } + + +class PlanningToolExecutor: + """Builds planner-only execution artifacts without external write side effects.""" + + def _build_plan_steps(self, action: str, action_details: dict[str, Any] | None) -> list[dict[str, Any]]: + normalized = action.lower() + intent = str((action_details or {}).get("intent") or "generic_plan") + steps: list[dict[str, Any]] = [] + + if "rollback" in normalized or "pr" in normalized or intent == "rollback_and_notify": + steps.extend( + [ + { + "id": 1, + "title": "Collect rollback context", + "system": "github", + "mode": "planner_only", + "operation": "review latest deployment and candidate rollback commit", + }, + { + "id": 2, + "title": "Prepare rollback PR payload", + "system": "github", + "mode": "planner_only", + "operation": "draft PR title/body and reviewer list", + }, + { + "id": 3, + "title": "Draft communication updates", + "system": "slack+jira", + "mode": "planner_only", + "operation": "prepare incident broadcast text and Jira update template", + }, + ] + ) + + if "diagnostic" in normalized or "read-only" in normalized or intent == "run_diagnostic": + steps.extend( + [ + { + "id": len(steps) + 1, + "title": "Run diagnostic checklist", + "system": "runbook", + "mode": "planner_only", + "operation": "execute read-only runbook checks and capture findings", + } + ] + ) + + if not steps: + steps.extend( + [ + { + "id": 1, + "title": "Collect missing context", + "system": "knowledge", + "mode": "planner_only", + "operation": "gather additional evidence before external coordination", + }, + { + "id": 2, + "title": "Prepare approval-ready action plan", + "system": "approval", + "mode": "planner_only", + "operation": "summarize proposed actions with risk and rollback notes", + }, + ] + ) + + return steps + + def execute(self, action: str, action_details: dict[str, Any] | None = None) -> dict[str, Any]: + timestamp = datetime.now(UTC).isoformat() + plan_steps = self._build_plan_steps(action=action, action_details=action_details) + intent = str((action_details or {}).get("intent") or "generic_plan") + risk_hint = (action_details or {}).get("risk_hint") + + return { + "tool": "planner.external_action_plan", + "status": "plan_generated", + "output": "Generated planner-only execution plan. 
No external write operations were performed.", + "timestamp": timestamp, + "execution_mode": "planner_only", + "no_write_policy": True, + "plan": { + "intent": intent, + "summary": action, + "approval_required": bool((action_details or {}).get("approval_required", True)), + "risk_hint": risk_hint, + "prechecks": [ + "Confirm incident scope and blast radius.", + "Validate runbook and recent change history.", + "Capture approver decision and comment.", + ], + "steps": plan_steps, + "rollback": [ + "If plan is not approved, retain current state and request additional evidence.", + "If post-approval checks fail, halt and escalate to incident commander.", + ], + }, + } diff --git a/backend/src/tools/github_adapter.py b/backend/src/tools/github_adapter.py new file mode 100644 index 000000000..b1710e401 --- /dev/null +++ b/backend/src/tools/github_adapter.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import os + +import httpx + +from src.tools.registry import ToolRegistryError + + +class GitHubAdapter: + def __init__( + self, + *, + token: str, + api_base_url: str = "https://api.github.com", + timeout_seconds: float = 15.0, + ) -> None: + self.token = token + self.api_base_url = api_base_url.rstrip("/") + self.timeout_seconds = timeout_seconds + + @classmethod + def from_env(cls) -> "GitHubAdapter": + token = os.getenv("GITHUB_TOKEN", "").strip() + api_base_url = os.getenv("GITHUB_API_BASE_URL", "https://api.github.com").strip() + + if not token: + raise ToolRegistryError("GITHUB_TOKEN is not configured") + + return cls(token=token, api_base_url=api_base_url or "https://api.github.com") + + def fetch_issue(self, *, repository: str, issue_number: int) -> dict[str, object]: + if not repository.strip(): + raise ToolRegistryError("repository is required for GitHub issue fetch") + if issue_number <= 0: + raise ToolRegistryError("issue_number must be a positive integer") + + url = f"{self.api_base_url}/repos/{repository}/issues/{issue_number}" + headers = { + "Authorization": f"Bearer {self.token}", + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + try: + with httpx.Client(timeout=self.timeout_seconds) as client: + response = client.get(url, headers=headers) + except httpx.HTTPError as exc: + raise ToolRegistryError(f"GitHub request failed: {exc}") from exc + + if response.status_code >= 400: + raise ToolRegistryError(f"GitHub issue fetch failed with status {response.status_code}") + + body = response.json() + issue_url = str(body.get("html_url", "")).strip() or f"https://github.com/{repository}/issues/{issue_number}" + title = str(body.get("title", "")).strip() + state = str(body.get("state", "unknown")).strip() or "unknown" + + return { + "status": "executed", + "output": f"Fetched GitHub issue {repository}#{issue_number}.", + "issue": { + "repository": repository, + "number": issue_number, + "title": title, + "state": state, + "url": issue_url, + }, + } diff --git a/backend/src/tools/jira_adapter.py b/backend/src/tools/jira_adapter.py new file mode 100644 index 000000000..88fcf7163 --- /dev/null +++ b/backend/src/tools/jira_adapter.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import os +import re + +import httpx + +from src.tools.registry import ToolRegistryError + + +JIRA_ISSUE_KEY_PATTERN = re.compile(r"^[A-Z][A-Z0-9]*-\d+$") + + +class JiraAdapter: + def __init__( + self, + *, + base_url: str, + email: str, + api_token: str, + timeout_seconds: float = 15.0, + ) -> None: + self.base_url = base_url.rstrip("/") + self.email = email 
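+ # Consumed by httpx.BasicAuth in fetch_issue below; Jira Cloud authenticates with email + API token.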
+ self.api_token = api_token + self.timeout_seconds = timeout_seconds + + @classmethod + def from_env(cls) -> "JiraAdapter": + base_url = os.getenv("JIRA_BASE_URL", "").strip() + email = os.getenv("JIRA_EMAIL", "").strip() + api_token = os.getenv("JIRA_API_TOKEN", "").strip() + + if not base_url: + raise ToolRegistryError("JIRA_BASE_URL is not configured") + if not email: + raise ToolRegistryError("JIRA_EMAIL is not configured") + if not api_token: + raise ToolRegistryError("JIRA_API_TOKEN is not configured") + + return cls(base_url=base_url, email=email, api_token=api_token) + + def fetch_issue(self, *, issue_key: str) -> dict[str, object]: + normalized_issue_key = issue_key.strip().upper() + if not normalized_issue_key: + raise ToolRegistryError("issue_key is required for Jira issue fetch") + if not JIRA_ISSUE_KEY_PATTERN.match(normalized_issue_key): + raise ToolRegistryError( + f"issue_key '{issue_key}' does not match expected format PROJECT-123" + ) + + url = f"{self.base_url}/rest/api/3/issue/{normalized_issue_key}" + auth = httpx.BasicAuth(username=self.email, password=self.api_token) + params = { + "fields": "summary,status,priority,assignee", + } + + try: + with httpx.Client(timeout=self.timeout_seconds) as client: + response = client.get(url, params=params, auth=auth) + except httpx.HTTPError as exc: + raise ToolRegistryError(f"Jira request failed: {exc}") from exc + + if response.status_code in {401, 403}: + raise ToolRegistryError("Jira issue fetch failed: authentication or permission error") + if response.status_code == 404: + raise ToolRegistryError(f"Jira issue {normalized_issue_key} was not found") + if response.status_code >= 500: + raise ToolRegistryError(f"Jira service error while fetching {normalized_issue_key}") + if response.status_code >= 400: + raise ToolRegistryError(f"Jira issue fetch failed with status {response.status_code}") + + body = response.json() + fields = body.get("fields", {}) + if not isinstance(fields, dict): + fields = {} + + status_block = fields.get("status", {}) + priority_block = fields.get("priority", {}) + assignee_block = fields.get("assignee", {}) + + status_name = str(status_block.get("name", "")) if isinstance(status_block, dict) else "" + priority_name = str(priority_block.get("name", "")) if isinstance(priority_block, dict) else "" + assignee_name = str(assignee_block.get("displayName", "")) if isinstance(assignee_block, dict) else "" + + return { + "status": "executed", + "output": f"Fetched Jira issue {normalized_issue_key}.", + "issue": { + "key": normalized_issue_key, + "summary": str(fields.get("summary", "")), + "status": status_name, + "priority": priority_name, + "assignee": assignee_name, + }, + } diff --git a/backend/src/tools/registry.py b/backend/src/tools/registry.py new file mode 100644 index 000000000..329af0b33 --- /dev/null +++ b/backend/src/tools/registry.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Callable + + +class ToolRegistryError(RuntimeError): + pass + + +ToolHandler = Callable[[dict[str, Any]], dict[str, Any]] + + +@dataclass(frozen=True) +class RegisteredTool: + name: str + description: str + read_only: bool + handler: ToolHandler + + +class ToolRegistry: + def __init__(self) -> None: + self._tools: dict[str, RegisteredTool] = {} + + def register_tool( + self, + *, + name: str, + description: str, + read_only: bool, + handler: ToolHandler, + ) -> None: + if not name.strip(): + raise ValueError("tool name must be non-empty") + if name in 
self._tools: + raise ValueError(f"tool {name} is already registered") + + self._tools[name] = RegisteredTool( + name=name, + description=description, + read_only=read_only, + handler=handler, + ) + + def list_tools(self) -> list[str]: + return sorted(self._tools.keys()) + + def describe_tools(self) -> list[dict[str, Any]]: + return [ + { + "name": tool.name, + "description": tool.description, + "read_only": tool.read_only, + } + for tool in sorted(self._tools.values(), key=lambda item: item.name) + ] + + def execute_tool(self, name: str, params: dict[str, Any]) -> dict[str, Any]: + tool = self._tools.get(name) + if tool is None: + raise ToolRegistryError(f"tool {name} is not registered") + + try: + result = tool.handler(params) + except ToolRegistryError: + raise + except Exception as exc: # pragma: no cover - defensive fallback + raise ToolRegistryError(f"tool {name} failed: {exc}") from exc + + if not isinstance(result, dict): + raise ToolRegistryError(f"tool {name} returned an invalid result payload") + + status = str(result.get("status", "executed")) + output = str(result.get("output", "Tool executed.")) + payload: dict[str, Any] = { + "tool": name, + "status": status, + "output": output, + } + + for key, value in result.items(): + if key in payload: + continue + payload[key] = value + + return payload diff --git a/backend/src/tools/slack_adapter.py b/backend/src/tools/slack_adapter.py new file mode 100644 index 000000000..1d60eb266 --- /dev/null +++ b/backend/src/tools/slack_adapter.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import os + +import httpx + +from src.tools.registry import ToolRegistryError + + +class SlackAdapter: + def __init__( + self, + *, + bot_token: str, + api_base_url: str = "https://slack.com/api", + timeout_seconds: float = 15.0, + ) -> None: + self.bot_token = bot_token + self.api_base_url = api_base_url.rstrip("/") + self.timeout_seconds = timeout_seconds + + @classmethod + def from_env(cls) -> "SlackAdapter": + bot_token = os.getenv("SLACK_BOT_TOKEN", "").strip() + api_base_url = os.getenv("SLACK_API_BASE_URL", "https://slack.com/api").strip() + + if not bot_token: + raise ToolRegistryError("SLACK_BOT_TOKEN is not configured") + + return cls(bot_token=bot_token, api_base_url=api_base_url or "https://slack.com/api") + + def fetch_channel_messages(self, *, channel: str, limit: int = 20) -> dict[str, object]: + if not channel.strip(): + raise ToolRegistryError("channel is required for Slack message fetch") + if limit <= 0: + raise ToolRegistryError("limit must be a positive integer") + + url = f"{self.api_base_url}/conversations.history" + headers = { + "Authorization": f"Bearer {self.bot_token}", + "Content-Type": "application/json", + } + params = { + "channel": channel, + "limit": limit, + } + + try: + with httpx.Client(timeout=self.timeout_seconds) as client: + response = client.get(url, params=params, headers=headers) + except httpx.HTTPError as exc: + raise ToolRegistryError(f"Slack request failed: {exc}") from exc + + if response.status_code >= 400: + raise ToolRegistryError(f"Slack conversations.history failed with status {response.status_code}") + + body = response.json() + if not bool(body.get("ok", False)): + error_message = str(body.get("error", "unknown_error")) + raise ToolRegistryError(f"Slack conversations.history failed: {error_message}") + + normalized_messages = self._normalize_messages(body.get("messages", []), limit) + + return { + "status": "executed", + "output": f"Fetched {len(normalized_messages)} Slack messages from 
channel {channel}.", + "channel": channel, + "message_count": len(normalized_messages), + "has_more": bool(body.get("has_more", False)), + "messages": normalized_messages, + } + + def fetch_thread_messages(self, *, channel: str, thread_ts: str, limit: int = 20) -> dict[str, object]: + if not channel.strip(): + raise ToolRegistryError("channel is required for Slack thread fetch") + if not thread_ts.strip(): + raise ToolRegistryError("thread_ts is required for Slack thread fetch") + if limit <= 0: + raise ToolRegistryError("limit must be a positive integer") + + url = f"{self.api_base_url}/conversations.replies" + headers = { + "Authorization": f"Bearer {self.bot_token}", + "Content-Type": "application/json", + } + params = { + "channel": channel, + "ts": thread_ts, + "limit": limit, + } + + try: + with httpx.Client(timeout=self.timeout_seconds) as client: + response = client.get(url, params=params, headers=headers) + except httpx.HTTPError as exc: + raise ToolRegistryError(f"Slack request failed: {exc}") from exc + + if response.status_code >= 400: + raise ToolRegistryError(f"Slack conversations.replies failed with status {response.status_code}") + + body = response.json() + if not bool(body.get("ok", False)): + error_message = str(body.get("error", "unknown_error")) + raise ToolRegistryError(f"Slack conversations.replies failed: {error_message}") + + normalized_messages = self._normalize_messages(body.get("messages", []), limit) + + return { + "status": "executed", + "output": f"Fetched {len(normalized_messages)} Slack thread messages from channel {channel}.", + "channel": channel, + "thread_ts": thread_ts, + "message_count": len(normalized_messages), + "has_more": bool(body.get("has_more", False)), + "messages": normalized_messages, + } + + def _normalize_messages(self, raw_messages: object, limit: int) -> list[dict[str, str]]: + normalized_messages: list[dict[str, str]] = [] + if isinstance(raw_messages, list): + for entry in raw_messages[:limit]: + if not isinstance(entry, dict): + continue + normalized_messages.append( + { + "ts": str(entry.get("ts", "")), + "user": str(entry.get("user", entry.get("username", ""))), + "text": str(entry.get("text", "")), + } + ) + return normalized_messages diff --git a/backend/src/vector_store/llamaindex_hybrid.py b/backend/src/vector_store/llamaindex_hybrid.py new file mode 100644 index 000000000..611e8c1d9 --- /dev/null +++ b/backend/src/vector_store/llamaindex_hybrid.py @@ -0,0 +1,355 @@ +from __future__ import annotations + +import hashlib +import os +from typing import Any + +from llama_index.core.base.embeddings.base import BaseEmbedding + + +class DeterministicHashEmbedding(BaseEmbedding): + """Local embedding model that deterministically maps text to normalized vectors.""" + + embed_dim: int + + def __init__(self, embed_dim: int = 768, **kwargs: Any) -> None: + super().__init__(embed_dim=embed_dim, **kwargs) + + @classmethod + def class_name(cls) -> str: + return "DeterministicHashEmbedding" + + def _vector_from_text(self, text: str) -> list[float]: + seed = hashlib.sha256(text.encode("utf-8")).digest() + values: list[float] = [] + counter = 0 + + while len(values) < self.embed_dim: + block = hashlib.sha256(seed + counter.to_bytes(4, "little")).digest() + counter += 1 + for idx in range(0, len(block), 4): + chunk = block[idx : idx + 4] + if len(chunk) < 4: + continue + raw = int.from_bytes(chunk, "little") + values.append((raw / 4294967295.0) * 2.0 - 1.0) + if len(values) >= self.embed_dim: + break + + norm = sum(value * value for value in 
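+ # L2-normalize so dot products between embeddings behave like cosine similarity.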
values) ** 0.5 + if norm == 0.0: + return values + return [value / norm for value in values] + + async def _aget_query_embedding(self, query: str) -> list[float]: + return self._vector_from_text(query) + + async def _aget_text_embedding(self, text: str) -> list[float]: + return self._vector_from_text(text) + + def _get_query_embedding(self, query: str) -> list[float]: + return self._vector_from_text(query) + + def _get_text_embedding(self, text: str) -> list[float]: + return self._vector_from_text(text) + + +class LlamaIndexHybridService: + """Provides semantic retrieval through LlamaIndex + Milvus with keyword fallback.""" + + def __init__(self, mode: str | None = None) -> None: + self.mode = (mode or os.getenv("RETRIEVAL_MODE", "keyword")).strip().lower() + self.collection_name = os.getenv("MILVUS_COLLECTION_NAME", "uniops_documents") + self.milvus_host = os.getenv("MILVUS_HOST", "127.0.0.1") + self.milvus_port = int(os.getenv("MILVUS_PORT", "19530")) + self.embedding_provider = os.getenv("EMBEDDING_PROVIDER", "deterministic").strip().lower() + self.embedding_model = os.getenv("EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5").strip() + self._index: Any | None = None + self._indexed_signature: str | None = None + self._index_doc_count = 0 + self._last_error: str | None = None + self._active_embedding_provider: str | None = None + + def _normalize_documents(self, source_documents: list[Any]) -> list[dict[str, str]]: + normalized: list[dict[str, str]] = [] + seen: set[str] = set() + for item in source_documents: + title = str(getattr(item, "title", "") or (item.get("title", "") if isinstance(item, dict) else "")).strip() + path = str(getattr(item, "path", "") or (item.get("path", "") if isinstance(item, dict) else "")).strip() + source_type = str( + getattr(item, "source_type", "") or (item.get("source_type", "") if isinstance(item, dict) else "") + ).strip() + content = str( + getattr(item, "content", "") + or (item.get("content", "") if isinstance(item, dict) else "") + or (item.get("snippet", "") if isinstance(item, dict) else "") + ).strip() + if not content: + continue + doc_key = path or title or hashlib.sha256(content.encode("utf-8")).hexdigest()[:16] + if doc_key in seen: + continue + seen.add(doc_key) + normalized.append( + { + "title": title or "Untitled", + "path": path or f"runtime://{doc_key}", + "source_type": source_type or "unknown", + "content": content, + } + ) + return normalized + + def _documents_signature(self, source_documents: list[dict[str, str]]) -> str: + h = hashlib.sha256() + for doc in sorted(source_documents, key=lambda entry: entry["path"]): + h.update(doc["path"].encode("utf-8")) + h.update(b"\n") + h.update(doc["content"].encode("utf-8")) + h.update(b"\n") + return h.hexdigest() + + def _resolve_embedding_model(self) -> tuple[Any | None, str, str | None]: + provider = self.embedding_provider + if provider in {"huggingface", "hf"}: + try: + from llama_index.embeddings.huggingface import HuggingFaceEmbedding + + return HuggingFaceEmbedding(model_name=self.embedding_model), "huggingface", None + except Exception as exc: + return None, "huggingface", f"HuggingFace embedding unavailable: {exc}" + + if provider == "openai": + api_key = os.getenv("OPENAI_API_KEY", "").strip() + if not api_key: + return None, "openai", "OPENAI_API_KEY is not configured for EMBEDDING_PROVIDER=openai" + try: + from llama_index.embeddings.openai import OpenAIEmbedding + + model_name = self.embedding_model or "text-embedding-3-small" + return OpenAIEmbedding(api_key=api_key, 
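+ # model_name already falls back to text-embedding-3-small when EMBEDDING_MODEL is unset.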
model=model_name), "openai", None + except Exception as exc: + return None, "openai", f"OpenAI embedding unavailable: {exc}" + + if provider in {"deterministic", "local", "hash"}: + return DeterministicHashEmbedding(embed_dim=768), "deterministic", None + + return None, provider, ( + "Unsupported EMBEDDING_PROVIDER. Use one of: deterministic, huggingface, openai" + ) + + def sync_documents(self, source_documents: list[Any]) -> dict[str, Any]: + """Indexes documents in Milvus when semantic/hybrid retrieval is enabled.""" + if self.mode == "keyword": + return { + "indexed": False, + "mode": self.mode, + "reason": "semantic indexing skipped because RETRIEVAL_MODE=keyword", + "collection": self.collection_name, + } + + normalized_documents = self._normalize_documents(source_documents) + if not normalized_documents: + return { + "indexed": False, + "mode": self.mode, + "reason": "no indexable documents were provided", + "collection": self.collection_name, + } + + signature = self._documents_signature(normalized_documents) + if signature == self._indexed_signature and self._index is not None: + return { + "indexed": True, + "mode": self.mode, + "reused": True, + "doc_count": self._index_doc_count, + "collection": self.collection_name, + "embedding_provider": self._active_embedding_provider, + } + + try: + from llama_index.core import Document, StorageContext, VectorStoreIndex + from llama_index.vector_stores.milvus import MilvusVectorStore + except Exception as exc: + self._last_error = f"LlamaIndex Milvus dependencies unavailable: {exc}" + return { + "indexed": False, + "mode": self.mode, + "collection": self.collection_name, + "reason": self._last_error, + } + + embed_model, provider_used, embed_error = self._resolve_embedding_model() + if embed_model is None: + self._last_error = embed_error or "No embedding model available" + return { + "indexed": False, + "mode": self.mode, + "collection": self.collection_name, + "reason": self._last_error, + } + + try: + vector_store = MilvusVectorStore( + uri=f"http://{self.milvus_host}:{self.milvus_port}", + collection_name=self.collection_name, + dim=768, + overwrite=True, + ) + storage_context = StorageContext.from_defaults(vector_store=vector_store) + llama_documents = [ + Document( + text=doc["content"], + metadata={ + "title": doc["title"], + "path": doc["path"], + "source_type": doc["source_type"], + }, + ) + for doc in normalized_documents + ] + self._index = VectorStoreIndex.from_documents( + llama_documents, + storage_context=storage_context, + embed_model=embed_model, + show_progress=False, + ) + self._indexed_signature = signature + self._index_doc_count = len(normalized_documents) + self._last_error = None + self._active_embedding_provider = provider_used + return { + "indexed": True, + "mode": self.mode, + "reused": False, + "doc_count": self._index_doc_count, + "collection": self.collection_name, + "embedding_provider": provider_used, + } + except Exception as exc: + self._index = None + self._indexed_signature = None + self._index_doc_count = 0 + self._last_error = f"Milvus indexing failed: {exc}" + return { + "indexed": False, + "mode": self.mode, + "collection": self.collection_name, + "reason": self._last_error, + } + + def health(self) -> dict[str, Any]: + return { + "mode": self.mode, + "collection": self.collection_name, + "milvus_host": self.milvus_host, + "milvus_port": self.milvus_port, + "embedding_provider": self._active_embedding_provider or self.embedding_provider, + "indexed": self._index is not None, + "doc_count": 
self._index_doc_count, + "last_error": self._last_error, + } + + def _semantic_retrieve(self, query: str, max_sources: int) -> list[dict[str, Any]]: + if self._index is None: + return [] + + try: + retriever = self._index.as_retriever(similarity_top_k=max_sources) + nodes = retriever.retrieve(query) + except Exception as exc: + self._last_error = f"Milvus semantic retrieval failed: {exc}" + return [] + + sources: list[dict[str, Any]] = [] + for node in nodes: + node_payload = getattr(node, "node", node) + metadata = dict(getattr(node_payload, "metadata", {}) or getattr(node, "metadata", {}) or {}) + content = "" + if hasattr(node_payload, "get_content"): + content = str(node_payload.get_content() or "") + + score = float(getattr(node, "score", 0.0) or getattr(node_payload, "score", 0.0) or 0.0) + + sources.append( + { + "title": str(metadata.get("title") or metadata.get("doc_title") or "Vector Source"), + "path": str(metadata.get("path") or metadata.get("source_path") or "vector://milvus"), + "source_type": str(metadata.get("source_type") or "vector"), + "snippet": " ".join(content.split())[:220], + "score": score, + } + ) + + return sources + + def _merge_sources( + self, + semantic_sources: list[dict[str, Any]], + keyword_sources: list[dict[str, Any]], + max_sources: int, + ) -> list[dict[str, Any]]: + merged: list[dict[str, Any]] = [] + seen_keys: set[tuple[str, str]] = set() + + for source in semantic_sources + keyword_sources: + key = (str(source.get("path", "")), str(source.get("title", ""))) + if key in seen_keys: + continue + seen_keys.add(key) + merged.append(source) + if len(merged) >= max_sources: + break + + return merged + + def run( + self, + query: str, + max_sources: int, + keyword_sources: list[dict[str, Any]], + source_documents: list[Any] | None = None, + ) -> dict[str, Any]: + if self.mode == "keyword": + return { + "sources": keyword_sources[:max_sources], + "retrieval_method": "keyword", + "vector_db": self.health(), + } + + index_state = self.sync_documents(source_documents or []) + + semantic_sources = self._semantic_retrieve(query=query, max_sources=max_sources) + vector_db = self.health() + vector_db.update({ + "index_state": index_state, + }) + + if self.mode == "semantic": + if semantic_sources: + return { + "sources": semantic_sources[:max_sources], + "retrieval_method": "semantic", + "vector_db": vector_db, + } + return { + "sources": keyword_sources[:max_sources], + "retrieval_method": "keyword_fallback", + "vector_db": vector_db, + } + + # Hybrid mode: semantic first, then keyword fill. 
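+ # _merge_sources dedupes on (path, title) and caps the list at max_sources.
+ # Illustrative: semantic=[A, B], keyword=[B, C], max_sources=3 -> [A, B, C].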
+ if semantic_sources: + merged = self._merge_sources(semantic_sources, keyword_sources, max_sources=max_sources) + return { + "sources": merged, + "retrieval_method": "hybrid", + "vector_db": vector_db, + } + + return { + "sources": keyword_sources[:max_sources], + "retrieval_method": "keyword_fallback", + "vector_db": vector_db, + } diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py new file mode 100644 index 000000000..93f6558e6 --- /dev/null +++ b/backend/tests/conftest.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from typing import Any + +import pytest + +from app.api.routes import chat as chat_route +from src.controller.controller import ControllerKernel + + +class _FakeGroqLLMClient: + provider_name = "groq" + model_name = "groq-test" + + def reason( + self, + query: str, + confidence: float, + top_sources: list[dict[str, Any]], + dedup_summary: dict[str, Any] | None, + ) -> dict[str, Any]: + normalized_query = query.lower() + high_risk = any(keyword in normalized_query for keyword in ["rollback", "revert", "pr", "slack", "jira", "deploy"]) + suggested_action = ( + "create rollback PR and notify Slack and Jira" + if high_risk + else "summarize findings and request approval for external actions" + ) + evidence_scores = [ + { + "title": source.get("title", ""), + "path": source.get("path", ""), + "source_type": source.get("source_type", "unknown"), + "raw_score": float(source.get("score", 0.0) or 0.0), + "priority_score": float(source.get("score", 0.0) or 0.0), + } + for source in top_sources + ] + return { + "reasoning": "Provider-generated reasoning based on indexed operational evidence.", + "answer": "Use runbook-guided mitigation and require approval for external coordination.", + "suggested_action": suggested_action, + "action_details": { + "intent": "rollback_and_notify" if high_risk else "summarize_and_request_approval", + "tool": "planner.rollback_and_notify" if high_risk else "planner.summarize_and_request_approval", + "parameters": {}, + "approval_required": True, + "risk_hint": "high" if high_risk else "medium", + }, + "reasoning_steps": [ + "Parsed operational context.", + "Ranked evidence sources.", + "Selected policy-compliant action.", + ], + "confidence_breakdown": { + "base_confidence": round(confidence, 3), + "quality_bonus": 0.0, + "duplicate_penalty": 0.0, + "clean_evidence_bonus": 0.0, + "final_confidence": round(confidence, 3), + }, + "evidence_scores": evidence_scores, + } + + def expand_query_terms(self, query: str, query_tokens: list[str]) -> list[str]: + expanded = [*query_tokens] + for token in ["incident", "runbook"]: + if token not in expanded: + expanded.append(token) + return expanded + + def assess_execution_action(self, action: str, action_details: dict[str, Any] | None) -> dict[str, Any]: + normalized = action.lower() + high_risk = "rollback" in normalized or "pr" in normalized or "rollback" in str((action_details or {}).get("intent", "")) + return { + "normalized_action": action, + "reasoning": "Provider execution assessment completed before permission-gate evaluation.", + "risk_hint": "high" if high_risk else "low", + } + + +@pytest.fixture(autouse=True) +def _inject_test_groq_kernel(monkeypatch: pytest.MonkeyPatch) -> None: + kernel = ControllerKernel(provider_name="groq", reasoning_llm_client=_FakeGroqLLMClient()) + monkeypatch.setattr(chat_route, "kernel", kernel) diff --git a/backend/tests/test_approvals.py b/backend/tests/test_approvals.py new file mode 100644 index 000000000..19263f3fe --- /dev/null +++ 
b/backend/tests/test_approvals.py @@ -0,0 +1,96 @@ +import json + +from fastapi.testclient import TestClient + +from app.main import app + + +def _create_high_risk_trace(client: TestClient) -> str: + with client.stream( + "POST", + "/api/chat", + json={ + "message": "Create rollback PR and notify Slack and Jira", + "session_id": "sess-approval", + }, + ) as response: + assert response.status_code == 200 + lines = list(response.iter_lines()) + + events: list[dict] = [] + pending_event = "message" + pending_data: list[str] = [] + for raw_line in lines: + line = str(raw_line).strip() + if line == "": + if pending_data: + events.append({"event": pending_event, "payload": json.loads("\n".join(pending_data))}) + pending_event = "message" + pending_data = [] + continue + if line.startswith("event:"): + pending_event = line.split("event:", 1)[1].strip() + continue + if line.startswith("data:"): + pending_data.append(line.split("data:", 1)[1].strip()) + + completion = next(item["payload"] for item in events if item["event"] == "trace_complete") + assert completion["needs_approval"] is True + return completion["trace_id"] + + +def test_approve_trace_generates_execution_plan_and_persists_audit() -> None: + client = TestClient(app) + trace_id = _create_high_risk_trace(client) + + response = client.post( + f"/api/approvals/{trace_id}", + json={ + "decision": "approve", + "approver_id": "sre-lead", + "comment": "Approved for execution.", + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["trace_id"] == trace_id + assert payload["final_status"] == "plan_approved" + assert payload["execution_mode"] == "planner_only" + assert payload["approval"]["decision"] == "approve" + assert payload["execution_result"]["status"] == "plan_generated" + assert payload["execution_result"]["execution_mode"] == "planner_only" + assert payload["execution_result"]["no_write_policy"] is True + + transcript = client.get(f"/api/chat/transcript/{trace_id}") + assert transcript.status_code == 200 + transcript_payload = transcript.json() + assert transcript_payload["final_status"] == "plan_approved" + assert transcript_payload["execution_mode"] == "planner_only" + assert transcript_payload["approval"]["approver_id"] == "sre-lead" + + +def test_reject_trace_does_not_execute_tool_and_marks_rejected() -> None: + client = TestClient(app) + trace_id = _create_high_risk_trace(client) + + response = client.post( + f"/api/approvals/{trace_id}", + json={ + "decision": "reject", + "approver_id": "incident-commander", + "comment": "Rejecting until more evidence.", + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["final_status"] == "plan_rejected" + assert payload["execution_mode"] == "planner_only" + assert payload["execution_result"]["status"] == "plan_rejected" + + transcript = client.get(f"/api/chat/transcript/{trace_id}") + assert transcript.status_code == 200 + transcript_payload = transcript.json() + assert transcript_payload["final_status"] == "plan_rejected" + assert transcript_payload["approval"]["decision"] == "reject" diff --git a/backend/tests/test_chat_iris_input.py b/backend/tests/test_chat_iris_input.py new file mode 100644 index 000000000..762f50452 --- /dev/null +++ b/backend/tests/test_chat_iris_input.py @@ -0,0 +1,143 @@ +import json +from unittest.mock import patch + +from fastapi.testclient import TestClient + +from app.api.routes import chat as chat_route +from app.main import app + + +def _incident_report_payload() -> dict: + return { + 
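+ # Representative IRIS case payload; all values are test fixtures, not live data.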
"source_system": "iris", + "case_id": "2847", + "report_id": "rep-42", + "report_url": "https://iris.local/cases/2847", + "ingested_at": "2026-04-16T10:00:00Z", + "case_name": "Redis Latency Spike", + "short_description": "P95 latency increased sharply after deployment.", + "severity": "high", + "tags": ["redis", "latency", "production"], + "iocs": [ + "redis://cache-prod", + {"type": "ip", "value": "10.2.3.44"}, + ], + "timeline": [ + {"time": "09:35", "event": "alert triggered"}, + {"time": "09:41", "event": "case escalated"}, + ], + } + + +def _collect_sse_events(client: TestClient, payload: dict) -> list[dict]: + with client.stream("POST", "/api/chat", json=payload) as response: + assert response.status_code == 200 + lines = list(response.iter_lines()) + + events: list[dict] = [] + pending_event = "message" + pending_data: list[str] = [] + for raw_line in lines: + line = str(raw_line).strip() + if line == "": + if pending_data: + events.append({"event": pending_event, "payload": json.loads("\n".join(pending_data))}) + pending_event = "message" + pending_data = [] + continue + if line.startswith("event:"): + pending_event = line.split("event:", 1)[1].strip() + continue + if line.startswith("data:"): + pending_data.append(line.split("data:", 1)[1].strip()) + + return events + + +def test_chat_accepts_incident_report_only() -> None: + client = TestClient(app) + events = _collect_sse_events( + client, + { + "session_id": "sess-iris-only", + "incident_report": _incident_report_payload(), + }, + ) + + completion = next(item["payload"] for item in events if item["event"] == "trace_complete") + assert completion["trace_id"].startswith("trace-") + assert isinstance(completion["answer"], str) + assert isinstance(completion["needs_approval"], bool) + assert isinstance(completion.get("metadata", {}).get("dedup_summary", {}), dict) + + +def test_chat_requires_message_or_incident_report() -> None: + client = TestClient(app) + response = client.post( + "/api/chat", + json={ + "session_id": "sess-iris-invalid", + }, + ) + + assert response.status_code == 422 + + +def test_chat_prefers_incident_report_over_message() -> None: + client = TestClient(app) + captured: dict[str, str] = {} + + def fake_stream_query_events(query: str, session_id: str): + captured["query"] = query + captured["session_id"] = session_id + yield { + "event_type": "trace_started", + "trace_id": "trace-iris-precedence", + "status": "started", + "metadata": { + "session_id": session_id, + "dedup_summary": {"deduped_count": 0}, + }, + } + yield { + "event_type": "trace_complete", + "trace_id": "trace-iris-precedence", + "status": "completed", + "answer": "ok", + "needs_approval": False, + "suggested_action": "summarize findings", + "metadata": { + "dedup_summary": {"deduped_count": 0}, + "step_count": 0, + }, + } + + with patch.object(chat_route.kernel, "stream_query_events", side_effect=fake_stream_query_events): + events = _collect_sse_events( + client, + { + "message": "ignore this free-text query", + "session_id": "sess-iris-both", + "incident_report": _incident_report_payload(), + }, + ) + + assert any(item["event"] == "trace_complete" for item in events) + assert "Case Name: Redis Latency Spike" in captured["query"] + assert "ignore this free-text query" not in captured["query"] + assert captured["session_id"] == "sess-iris-both" + + +def test_chat_message_mode_still_supported() -> None: + client = TestClient(app) + events = _collect_sse_events( + client, + { + "message": "Explain Redis latency from last week", + "session_id": 
"sess-message-mode", + }, + ) + + completion = next(item["payload"] for item in events if item["event"] == "trace_complete") + assert completion["trace_id"].startswith("trace-") + assert isinstance(completion.get("metadata", {}).get("dedup_summary", {}), dict) diff --git a/backend/tests/test_chat_orchestration.py b/backend/tests/test_chat_orchestration.py new file mode 100644 index 000000000..97c83f151 --- /dev/null +++ b/backend/tests/test_chat_orchestration.py @@ -0,0 +1,132 @@ +import json + +from fastapi.testclient import TestClient + +from app.api.routes import chat as chat_route +from app.main import app +from src.controller.controller import ControllerKernel + + +class _FakeGroqLLMClient: + provider_name = "groq" + model_name = "groq-test" + + def reason(self, query: str, confidence: float, top_sources: list[dict], dedup_summary: dict | None) -> dict[str, object]: + return { + "reasoning": "Provider-generated reasoning.", + "answer": "Provider-generated answer.", + "suggested_action": "create rollback PR and notify Slack and Jira", + "action_details": { + "intent": "rollback_and_notify", + "tool": "planner.rollback_and_notify", + "parameters": {}, + "approval_required": True, + "risk_hint": "high", + }, + "reasoning_steps": ["Analyze evidence", "Select action"], + "confidence_breakdown": {"base_confidence": confidence, "final_confidence": confidence}, + "evidence_scores": [], + } + + def expand_query_terms(self, query: str, query_tokens: list[str]) -> list[str]: + return query_tokens + + def assess_execution_action(self, action: str, action_details: dict | None) -> dict[str, str]: + return { + "normalized_action": action, + "reasoning": "Provider execution assessment completed.", + "risk_hint": "high", + } + + +def _collect_sse_events(client: TestClient, payload: dict) -> list[dict]: + with client.stream("POST", "/api/chat", json=payload) as response: + assert response.status_code == 200 + lines = list(response.iter_lines()) + + events: list[dict] = [] + pending_event = "message" + pending_data: list[str] = [] + + for raw_line in lines: + line = str(raw_line).strip() + if line == "": + if pending_data: + events.append({"event": pending_event, "payload": json.loads("\n".join(pending_data))}) + pending_event = "message" + pending_data = [] + continue + + if line.startswith("event:"): + pending_event = line.split("event:", 1)[1].strip() + continue + + if line.startswith("data:"): + pending_data.append(line.split("data:", 1)[1].strip()) + + return events + + +def test_chat_endpoint_uses_controller_kernel() -> None: + client = TestClient(app) + events = _collect_sse_events( + client, + {"message": "Explain Redis latency from last week", "session_id": "sess-1"}, + ) + + event_names = [item["event"] for item in events] + assert "trace_started" in event_names + assert "trace_complete" in event_names + + complete_payload = next(item["payload"] for item in events if item["event"] == "trace_complete") + assert complete_payload["trace_id"].startswith("trace-") + assert isinstance(complete_payload["answer"], str) + assert isinstance(complete_payload["needs_approval"], bool) + dedup_summary = complete_payload.get("metadata", {}).get("dedup_summary", {}) + assert isinstance(dedup_summary, dict) + assert "deduped_count" in dedup_summary + + transcript = client.get(f"/api/chat/transcript/{complete_payload['trace_id']}") + assert transcript.status_code == 200 + transcript_payload = transcript.json() + assert isinstance(transcript_payload.get("action_details"), dict) + assert 
transcript_payload["action_details"].get("intent") + + +def test_controller_marks_high_risk_actions_for_approval() -> None: + kernel = ControllerKernel(provider_name="groq", reasoning_llm_client=_FakeGroqLLMClient()) + result = kernel.handle_query("Create rollback PR and update Jira", session_id="sess-2") + + assert result.needs_approval is True + assert result.suggested_action + assert len(result.trace) == 3 + assert isinstance(result.dedup_summary, dict) + + +def test_chat_endpoint_emits_trace_error_for_invalid_provider(monkeypatch) -> None: + client = TestClient(app) + monkeypatch.setattr(chat_route, "kernel", ControllerKernel(provider_name="invalid-provider")) + + events = _collect_sse_events( + client, + {"message": "Explain Redis latency from last week", "session_id": "sess-invalid-provider"}, + ) + + error_payload = next(item["payload"] for item in events if item["event"] == "trace_error") + assert error_payload.get("error_code") == "provider_error" + assert "LLM_PROVIDER" in str(error_payload.get("error", "")) + + +def test_chat_endpoint_emits_trace_error_when_selected_provider_is_misconfigured(monkeypatch) -> None: + client = TestClient(app) + monkeypatch.delenv("GROQ_API_KEY", raising=False) + monkeypatch.setattr(chat_route, "kernel", ControllerKernel(provider_name="groq")) + + events = _collect_sse_events( + client, + {"message": "Explain Redis latency from last week", "session_id": "sess-missing-groq-key"}, + ) + + error_payload = next(item["payload"] for item in events if item["event"] == "trace_error") + assert error_payload.get("error_code") == "provider_error" + assert "GROQ_API_KEY" in str(error_payload.get("error", "")) diff --git a/backend/tests/test_chat_stream.py b/backend/tests/test_chat_stream.py new file mode 100644 index 000000000..bf71b5c70 --- /dev/null +++ b/backend/tests/test_chat_stream.py @@ -0,0 +1,216 @@ +import json +import time + +from fastapi.testclient import TestClient + +from app.api.routes import chat as chat_route +from app.main import app +from src.controller.controller import ControllerKernel + + +def _collect_sse_events(client: TestClient, message: str, session_id: str) -> list[dict]: + with client.stream( + "POST", + "/api/chat", + json={"message": message, "session_id": session_id}, + ) as response: + assert response.status_code == 200 + lines = list(response.iter_lines()) + + events: list[dict] = [] + pending_event: dict[str, str] = {} + pending_data: list[str] = [] + + for raw_line in lines: + line = str(raw_line).strip() + if line == "": + if pending_data: + payload_text = "\n".join(pending_data) + payload = json.loads(payload_text) + events.append( + { + "event": pending_event.get("event", "message"), + "id": pending_event.get("id", ""), + "payload": payload, + } + ) + pending_event = {} + pending_data = [] + continue + + if line.startswith("event:"): + pending_event["event"] = line.split("event:", 1)[1].strip() + continue + if line.startswith("id:"): + pending_event["id"] = line.split("id:", 1)[1].strip() + continue + if line.startswith("data:"): + pending_data.append(line.split("data:", 1)[1].strip()) + + return events + + +def _create_trace(client: TestClient, session_id: str) -> str: + events = _collect_sse_events(client, "Explain Redis latency from last week", session_id) + completion = next(item for item in events if item["event"] == "trace_complete") + trace_id = completion["payload"].get("trace_id", "") + assert isinstance(trace_id, str) + assert trace_id.startswith("trace-") + return trace_id + + +def 
test_transcript_endpoint_returns_persisted_trace() -> None: + client = TestClient(app) + trace_id = _create_trace(client, "sess-stream-1") + + response = client.get(f"/api/chat/transcript/{trace_id}") + assert response.status_code == 200 + + payload = response.json() + assert payload["trace_id"] == trace_id + assert isinstance(payload["steps"], list) + assert len(payload["steps"]) == 3 + assert isinstance(payload["dedup_summary"], dict) + assert "deduped_count" in payload["dedup_summary"] + + +def test_stream_endpoint_emits_contract_payload_shape() -> None: + client = TestClient(app) + events = _collect_sse_events(client, "Explain Redis latency from last week", "sess-stream-2") + + trace_events = [item for item in events if item["event"] == "trace_step"] + assert len(trace_events) == 3 + + expected_steps = ["retrieval", "reasoning", "execution"] + assert [item["payload"]["step"] for item in trace_events] == expected_steps + + lifecycle_events = [item["event"] for item in events] + assert "trace_started" in lifecycle_events + assert "trace_complete" in lifecycle_events + + for item in events: + payload = item["payload"] + assert isinstance(payload.get("event_id"), str) + assert isinstance(payload.get("trace_id"), str) + assert isinstance(payload.get("sequence"), int) + assert isinstance(payload.get("status"), str) + + for item in trace_events: + payload = item["payload"] + required_keys = {"step", "agent", "observation", "sources"} + assert required_keys.issubset(payload.keys()) + assert isinstance(payload["step"], str) + assert isinstance(payload["agent"], str) + assert isinstance(payload["observation"], str) + assert isinstance(payload["sources"], list) + assert isinstance(payload.get("metadata", {}), dict) + assert isinstance(payload.get("timestamp"), str) + metadata = payload.get("metadata", {}) + assert isinstance(metadata.get("duration_ms"), float) + assert isinstance(metadata.get("started_at"), str) + assert isinstance(metadata.get("finished_at"), str) + + reasoning_payload = next(item["payload"] for item in trace_events if item["payload"]["step"] == "reasoning") + metadata = reasoning_payload.get("metadata", {}) + assert isinstance(metadata.get("reasoning_steps"), list) + assert isinstance(metadata.get("evidence_scores"), list) + assert isinstance(metadata.get("confidence_breakdown"), dict) + assert metadata.get("provider") != "deterministic" + assert metadata.get("model") != "heuristic" + + execution_payload = next(item["payload"] for item in trace_events if item["payload"]["step"] == "execution") + execution_metadata = execution_payload.get("metadata", {}) + assert execution_metadata.get("provider") != "deterministic" + assert execution_metadata.get("model") != "heuristic" + + completion_payload = next(item["payload"] for item in events if item["event"] == "trace_complete") + assert isinstance(completion_payload.get("answer"), str) + assert isinstance(completion_payload.get("needs_approval"), bool) + + +def test_chat_stream_emits_error_for_invalid_provider(monkeypatch) -> None: + client = TestClient(app) + monkeypatch.setattr(chat_route, "kernel", ControllerKernel(provider_name="invalid-provider")) + events = _collect_sse_events(client, "Explain Redis latency from last week", "sess-invalid-provider") + error_payload = next(item["payload"] for item in events if item["event"] == "trace_error") + assert error_payload.get("error_code") == "provider_error" + assert "LLM_PROVIDER" in str(error_payload.get("error", "")) + + +def test_transcript_endpoint_returns_404_for_unknown_trace() -> 
None: + client = TestClient(app) + response = client.get("/api/chat/transcript/trace-not-found") + assert response.status_code == 404 + + +def test_transcript_endpoint_can_wait_for_ready_transcript(monkeypatch) -> None: + client = TestClient(app) + + def _wait_for_transcript(trace_id: str, timeout_seconds: float): + assert trace_id == "trace-delayed" + assert timeout_seconds == 0.2 + return {"trace_id": trace_id, "steps": [{"step": "retrieval"}]} + + monkeypatch.setattr(chat_route.memory, "wait_for_transcript", _wait_for_transcript) + + response = client.get("/api/chat/transcript/trace-delayed", params={"wait_timeout_seconds": 0.2}) + assert response.status_code == 200 + payload = response.json() + assert payload["trace_id"] == "trace-delayed" + + +def test_transcript_endpoint_wait_timeout_still_returns_404(monkeypatch) -> None: + client = TestClient(app) + + def _wait_for_transcript(trace_id: str, timeout_seconds: float): + assert trace_id == "trace-missing" + assert timeout_seconds == 0.1 + return None + + monkeypatch.setattr(chat_route.memory, "wait_for_transcript", _wait_for_transcript) + + response = client.get("/api/chat/transcript/trace-missing", params={"wait_timeout_seconds": 0.1}) + assert response.status_code == 404 + + +def test_stream_response_sets_sse_headers() -> None: + client = TestClient(app) + with client.stream( + "POST", + "/api/chat", + json={"message": "quick health check", "session_id": "sess-stream-headers"}, + ) as response: + assert response.status_code == 200 + assert response.headers.get("content-type", "").startswith("text/event-stream") + assert response.headers.get("cache-control") == "no-cache" + assert response.headers.get("connection") == "keep-alive" + + +def test_chat_stream_times_out_when_controller_stalls(monkeypatch) -> None: + class _StalledKernel: + def stream_query_events(self, query: str, session_id: str): + yield { + "event_type": "trace_started", + "trace_id": "trace-stalled", + "status": "started", + "metadata": {}, + } + time.sleep(0.2) + while True: + time.sleep(0.2) + + client = TestClient(app) + monkeypatch.setattr(chat_route, "kernel", _StalledKernel()) + monkeypatch.setattr(chat_route, "STREAM_HEARTBEAT_SECONDS", 0.02) + monkeypatch.setattr(chat_route, "STREAM_IDLE_TIMEOUT_SECONDS", 0.08) + + events = _collect_sse_events(client, "simulate stalled stream", "sess-stream-timeout") + lifecycle_events = [item["event"] for item in events] + + assert "trace_started" in lifecycle_events + assert "trace_heartbeat" in lifecycle_events + assert "trace_error" in lifecycle_events + + timeout_payload = next(item["payload"] for item in events if item["event"] == "trace_error") + assert timeout_payload.get("error_code") == "stream_timeout" + assert timeout_payload.get("status") == "failed" diff --git a/backend/tests/test_e2e_ingest_chat_approve.py b/backend/tests/test_e2e_ingest_chat_approve.py new file mode 100644 index 000000000..2e77336e3 --- /dev/null +++ b/backend/tests/test_e2e_ingest_chat_approve.py @@ -0,0 +1,149 @@ +import json +from unittest.mock import patch + +from fastapi.testclient import TestClient + +from app.api.routes.chat import kernel +from app.main import app +from src.memory.three_tier_memory import ThreeTierMemory + + +class _FakeConfluenceClient: + def fetch_page(self, page_id: str) -> dict[str, str]: + return { + "page_id": page_id, + "title": f"Redis Latency Runbook {page_id}", + "body": "Redis latency remediation steps include checking deploys and rollback readiness.", + "source_url": 
f"https://confluence.example.internal/wiki/{page_id}", + } + + +class _FakeExecutor: + def execute(self, action: str) -> dict[str, object]: + return { + "tool": "planner.external_action_plan", + "status": "plan_generated", + "output": f"Generated planner-only execution plan for action: {action}", + "timestamp": "2026-04-16T00:00:00+00:00", + "execution_mode": "planner_only", + "no_write_policy": True, + "plan": { + "intent": "rollback_and_notify", + "summary": action, + "steps": [ + { + "id": 1, + "title": "Collect rollback context", + "system": "github", + "mode": "planner_only", + "operation": "review latest deployment and candidate rollback commit", + } + ], + }, + } + + +def _clear_runtime_documents() -> None: + ThreeTierMemory._runtime_documents = [] + kernel.memory._documents_cache = None + + +def _read_stream_events(client: TestClient, payload: dict[str, str]) -> list[dict]: + with client.stream("POST", "/api/chat", json=payload) as response: + assert response.status_code == 200 + lines = list(response.iter_lines()) + + events: list[dict] = [] + pending_event = "message" + pending_data: list[str] = [] + + for raw_line in lines: + line = str(raw_line).strip() + if line == "": + if pending_data: + events.append( + { + "event": pending_event, + "payload": json.loads("\n".join(pending_data)), + } + ) + pending_event = "message" + pending_data = [] + continue + + if line.startswith("event:"): + pending_event = line.split("event:", 1)[1].strip() + continue + if line.startswith("data:"): + pending_data.append(line.split("data:", 1)[1].strip()) + + return events + + +def test_e2e_batch_ingest_chat_stream_approve_transcript() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.ConfluenceClient.from_env", return_value=_FakeConfluenceClient()): + ingest_response = client.post( + "/api/ingest/confluence", + json={"page_ids": ["12345", "67890"]}, + ) + + assert ingest_response.status_code == 200 + ingest_payload = ingest_response.json() + assert ingest_payload["source"] == "confluence" + assert ingest_payload["ingested_count"] == 2 + assert ingest_payload["failed_count"] == 0 + + stream_events = _read_stream_events( + client, + { + "message": "Create rollback PR and notify Slack and Jira for redis latency incident", + "session_id": "sess-e2e-golden-flow", + }, + ) + + event_names = [item["event"] for item in stream_events] + assert "trace_started" in event_names + assert "trace_complete" in event_names + + step_payloads = [item["payload"] for item in stream_events if item["event"] == "trace_step"] + assert [item["step"] for item in step_payloads] == ["retrieval", "reasoning", "execution"] + + completion_payload = next(item["payload"] for item in stream_events if item["event"] == "trace_complete") + assert completion_payload["needs_approval"] is True + trace_id = completion_payload["trace_id"] + + with patch("app.api.routes.approvals.executor", _FakeExecutor()): + approve_response = client.post( + f"/api/approvals/{trace_id}", + json={ + "decision": "approve", + "approver_id": "sre-lead", + "comment": "Approved in E2E test flow.", + }, + ) + + assert approve_response.status_code == 200 + approve_payload = approve_response.json() + assert approve_payload["final_status"] == "plan_approved" + assert approve_payload["execution_mode"] == "planner_only" + assert approve_payload["execution_result"]["status"] == "plan_generated" + + transcript_response = client.get(f"/api/chat/transcript/{trace_id}") + assert transcript_response.status_code == 200 + 
transcript_payload = transcript_response.json() + + assert transcript_payload["trace_id"] == trace_id + assert transcript_payload["final_status"] == "plan_approved" + assert transcript_payload["execution_mode"] == "planner_only" + assert transcript_payload["approval"]["decision"] == "approve" + assert transcript_payload["execution_result"]["status"] == "plan_generated" + assert [step["step"] for step in transcript_payload["steps"]] == [ + "retrieval", + "reasoning", + "execution", + "approval", + ] + _clear_runtime_documents() diff --git a/backend/tests/test_health.py b/backend/tests/test_health.py new file mode 100644 index 000000000..39d635eb2 --- /dev/null +++ b/backend/tests/test_health.py @@ -0,0 +1,10 @@ +from fastapi.testclient import TestClient + +from app.main import app + + +def test_health() -> None: + client = TestClient(app) + response = client.get("/health") + assert response.status_code == 200 + assert response.json() == {"status": "ok"} diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py new file mode 100644 index 000000000..9e5a977b6 --- /dev/null +++ b/backend/tests/test_ingestion.py @@ -0,0 +1,680 @@ +from unittest.mock import patch + +from fastapi.testclient import TestClient + +from app.api.routes.chat import kernel +from app.main import app +from src.adapters.iris_client import IrisClientError +from src.memory.three_tier_memory import ThreeTierMemory + + +class _FakeIrisClient: + def fetch_case(self, case_id: str) -> dict: + return { + "source_system": "iris", + "case_id": case_id, + "report_id": f"rep-{case_id}", + "report_url": f"https://localhost/case/{case_id}", + "ingested_at": "2026-04-16T00:00:00Z", + "case_name": "Redis Latency Spike", + "short_description": "Latency increased after deployment", + "severity": "high", + "tags": ["redis", "latency"], + "iocs": [{"type": "host", "value": "cache-01"}], + "timeline": [{"time": "10:10", "event": "Alert fired"}], + } + + def create_incident( + self, + *, + case_name: str, + case_description: str, + severity: str, + tags: list[str], + case_customer: int, + case_soc_id: str, + classification_id: int | None, + case_template_id: str | None, + custom_attributes: dict[str, object] | None, + ) -> dict: + return { + "source_system": "iris", + "case_id": "9001", + "report_id": "rep-9001", + "report_url": "https://localhost/case/9001", + "ingested_at": "2026-04-16T00:00:00Z", + "case_name": case_name, + "short_description": case_description, + "severity": severity, + "tags": tags, + "iocs": [], + "timeline": [], + } + + +class _FakeConfluenceClient: + def fetch_page(self, page_id: str) -> dict[str, str]: + return { + "page_id": page_id, + "title": "Redis Latency Runbook", + "body": "Check recent deploys and cache hit ratio.", + "source_url": f"https://confluence.example.internal/wiki/{page_id}", + } + + +class _MixedConfluenceClient: + def fetch_page(self, page_id: str) -> dict[str, str]: + if page_id == "broken": + raise RuntimeError("simulated confluence fetch failure") + return { + "page_id": page_id, + "title": f"Runbook {page_id}", + "body": "Check recent deploys and cache hit ratio.", + "source_url": f"https://confluence.example.internal/wiki/{page_id}", + } + + +class _FakeGitHubClient: + def fetch_issue(self, *, repository: str, issue_number: int) -> dict: + return { + "repository": repository, + "number": issue_number, + "title": f"Issue {issue_number}", + "state": "open", + "url": f"https://github.com/{repository}/issues/{issue_number}", + "body": "Sample GitHub issue body", + } + + +class 
_MixedGitHubClient: + def fetch_issue(self, *, repository: str, issue_number: int) -> dict: + if issue_number == 404: + raise RuntimeError("simulated github fetch failure") + return { + "repository": repository, + "number": issue_number, + "title": f"Issue {issue_number}", + "state": "open", + "url": f"https://github.com/{repository}/issues/{issue_number}", + "body": "Sample GitHub issue body", + } + + +class _FakeJiraClient: + def fetch_issue(self, *, issue_key: str) -> dict: + return { + "key": issue_key, + "summary": f"Summary for {issue_key}", + "status": "To Do", + "priority": "High", + "assignee": "Demo User", + "description": "Sample Jira issue description", + "url": f"https://example.atlassian.net/browse/{issue_key}", + } + + +class _MixedJiraClient: + def fetch_issue(self, *, issue_key: str) -> dict: + if issue_key == "OPS-404": + raise RuntimeError("simulated jira fetch failure") + return { + "key": issue_key, + "summary": f"Summary for {issue_key}", + "status": "To Do", + "priority": "High", + "assignee": "Demo User", + "description": "Sample Jira issue description", + "url": f"https://example.atlassian.net/browse/{issue_key}", + } + + +class _FakeSlackClient: + def fetch_channel_messages(self, *, channel_id: str, limit: int = 20) -> dict: + messages = [ + {"ts": "1712345678.100001", "thread_ts": "1712345678.100001", "user": "U123", "text": "Investigating"}, + {"ts": "1712345679.100002", "thread_ts": "1712345678.100001", "user": "U124", "text": "Rollback started"}, + ] + return { + "channel_id": channel_id, + "message_count": min(len(messages), limit), + "has_more": False, + "messages": messages[:limit], + } + + def fetch_thread_messages(self, *, channel_id: str, thread_ts: str, limit: int = 20) -> dict: + messages = [ + {"ts": thread_ts, "thread_ts": thread_ts, "user": "U123", "text": "Primary alert thread"}, + {"ts": "1712345680.100003", "thread_ts": thread_ts, "user": "U124", "text": "Mitigation confirmed"}, + ] + return { + "channel_id": channel_id, + "thread_ts": thread_ts, + "message_count": min(len(messages), limit), + "has_more": False, + "messages": messages[:limit], + } + + +class _MixedSlackClient: + def fetch_channel_messages(self, *, channel_id: str, limit: int = 20) -> dict: + if channel_id == "C-BROKEN": + raise RuntimeError("simulated slack channel fetch failure") + return { + "channel_id": channel_id, + "message_count": 1, + "has_more": False, + "messages": [{"ts": "1712345678.100001", "thread_ts": "1712345678.100001", "user": "U123", "text": "Investigating"}], + } + + def fetch_thread_messages(self, *, channel_id: str, thread_ts: str, limit: int = 20) -> dict: + if thread_ts == "1712345999.999999": + raise RuntimeError("simulated slack thread fetch failure") + return { + "channel_id": channel_id, + "thread_ts": thread_ts, + "message_count": 1, + "has_more": False, + "messages": [{"ts": thread_ts, "thread_ts": thread_ts, "user": "U123", "text": "Thread message"}], + } + + +class _FakeGrafanaClient: + def fetch_public_dashboard(self, *, public_dashboard_url: str) -> dict: + token = public_dashboard_url.rstrip("/").split("/")[-1] + return { + "public_dashboard_token": token, + "source_url": public_dashboard_url, + "grafana_base_url": "https://f4tal1t.grafana.net", + "title": "Poxil Dashboard", + "uid": "ftk9vtl", + "version": 7, + "timezone": "browser", + "refresh": "30s", + "time_range": { + "from": "2026-04-16T19:31:58.104Z", + "to": "2026-04-16T19:32:01.591Z", + }, + "meta": { + "slug": "poxil-dashboard", + "created": "2026-04-16T20:07:35Z", + "updated": 
"2026-04-16T23:38:50Z", + "public_dashboard_enabled": True, + }, + "panel_count": 3, + "panels": [ + { + "id": 1, + "title": "% increase in req", + "type": "timeseries", + "datasource_type": "prometheus", + "datasource_uid": "grafanacloud-prom", + "grid_pos": {"h": 10, "w": 8, "x": 0, "y": 0}, + "transparent": True, + "plugin_version": "13.1.0-24455754975", + "targets": [ + { + "ref_id": "B", + "query": "sum(increase(http_requests_total[15m]))", + "editor_mode": "code", + "datasource_type": "prometheus", + "datasource_uid": "grafanacloud-prom", + "raw": {"refId": "B"}, + } + ], + "options": {}, + "field_config": {}, + } + ], + } + + +class _MixedGrafanaClient: + def fetch_public_dashboard(self, *, public_dashboard_url: str) -> dict: + if "broken" in public_dashboard_url: + raise RuntimeError("simulated grafana fetch failure") + return _FakeGrafanaClient().fetch_public_dashboard(public_dashboard_url=public_dashboard_url) + + +def _clear_runtime_documents() -> None: + ThreeTierMemory._runtime_documents = [] + kernel.memory._documents_cache = None + + +def test_ingest_iris_adds_runtime_incident_document() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.IrisClient.from_env", return_value=_FakeIrisClient()): + response = client.post("/api/ingest/iris", params={"case_id": "2847"}) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "iris" + assert payload["case_id"] == "2847" + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/iris/2847.json" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_confluence_batch_adds_runtime_documents() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.ConfluenceClient.from_env", return_value=_FakeConfluenceClient()): + response = client.post( + "/api/ingest/confluence", + json={"page_ids": ["12345", "98765"]}, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "confluence" + assert payload["ingested_count"] == 2 + assert payload["failed_count"] == 0 + assert len(payload["results"]) == 2 + assert all(item["status"] == "ingested" for item in payload["results"]) + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/confluence/12345.md" for doc in docs) + assert any(doc.path == "runtime/confluence/98765.md" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_confluence_batch_reports_partial_failures() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.ConfluenceClient.from_env", return_value=_MixedConfluenceClient()): + response = client.post( + "/api/ingest/confluence", + json={"page_ids": ["12345", "broken"]}, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "confluence" + assert payload["ingested_count"] == 1 + assert payload["failed_count"] == 1 + + success = next(item for item in payload["results"] if item["page_id"] == "12345") + failure = next(item for item in payload["results"] if item["page_id"] == "broken") + assert success["status"] == "ingested" + assert success["title"] == "Runbook 12345" + assert failure["status"] == "failed" + assert "simulated confluence fetch failure" in failure["error"] + assert failure["error_detail"]["code"] == "ingestion_adapter_error" + assert failure["error_detail"]["source"] == "confluence" + assert 
failure["error_detail"]["stage"] == "fetch" + assert failure["error_detail"]["target"] == "broken" + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/confluence/12345.md" for doc in docs) + assert not any(doc.path == "runtime/confluence/broken.md" for doc in docs) + _clear_runtime_documents() + + +def test_create_iris_incident_adds_runtime_document() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.IrisClient.from_env", return_value=_FakeIrisClient()): + response = client.post( + "/api/incidents/create", + json={ + "case_name": "Redis latency in production", + "case_description": "P95 latency increased after deploy", + "severity": "high", + "tags": ["redis", "latency", "redis"], + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "iris" + assert payload["case_id"] == "9001" + assert payload["incident_report"]["case_name"] == "Redis latency in production" + assert payload["incident_report"]["tags"] == ["redis", "latency"] + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/iris/9001.json" for doc in docs) + _clear_runtime_documents() + + +def test_create_iris_incident_returns_502_on_upstream_failure() -> None: + client = TestClient(app) + + with patch( + "app.api.routes.ingestion.IrisClient.from_env", + side_effect=IrisClientError("IRIS API unavailable"), + ): + response = client.post( + "/api/incidents/create", + json={ + "case_name": "Redis latency in production", + "case_description": "P95 latency increased after deploy", + }, + ) + + assert response.status_code == 502 + detail = response.json()["detail"] + assert detail["code"] == "ingestion_adapter_unavailable" + assert detail["source"] == "iris" + assert detail["stage"] == "init" + assert "IRIS API unavailable" in detail["message"] + + +def test_ingest_github_batch_adds_runtime_documents() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.GitHubClient.from_env", return_value=_FakeGitHubClient()): + response = client.post( + "/api/ingest/github", + json={ + "issue_refs": [ + {"repository": "org/repo", "issue_number": 101}, + {"repository": "org/repo", "issue_number": 202}, + ] + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "github" + assert payload["ingested_count"] == 2 + assert payload["failed_count"] == 0 + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/github/org__repo-101.md" for doc in docs) + assert any(doc.path == "runtime/github/org__repo-202.md" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_github_batch_reports_partial_failures() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.GitHubClient.from_env", return_value=_MixedGitHubClient()): + response = client.post( + "/api/ingest/github", + json={ + "issue_refs": [ + {"repository": "org/repo", "issue_number": 101}, + {"repository": "org/repo", "issue_number": 404}, + ] + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "github" + assert payload["ingested_count"] == 1 + assert payload["failed_count"] == 1 + + success = next(item for item in payload["results"] if item["issue_number"] == 101) + failure = next(item for item in payload["results"] if item["issue_number"] == 404) + assert success["status"] == 
"ingested" + assert failure["status"] == "failed" + assert "simulated github fetch failure" in failure["error"] + assert failure["error_detail"]["source"] == "github" + assert failure["error_detail"]["stage"] == "fetch" + assert failure["error_detail"]["target"] == "org/repo#404" + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/github/org__repo-101.md" for doc in docs) + assert not any(doc.path == "runtime/github/org__repo-404.md" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_grafana_batch_adds_runtime_documents() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.GrafanaClient.from_env", return_value=_FakeGrafanaClient()): + response = client.post( + "/api/ingest/grafana", + json={ + "dashboards": [ + {"public_dashboard_url": "https://f4tal1t.grafana.net/public-dashboards/a56bc0dfd37746ac8d32f2bb23519ac9"}, + {"public_dashboard_url": "https://f4tal1t.grafana.net/public-dashboards/xyz987654321"}, + ] + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "grafana" + assert payload["ingested_count"] == 2 + assert payload["failed_count"] == 0 + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/grafana/a56bc0dfd37746ac8d32f2bb23519ac9.json" for doc in docs) + assert any(doc.path == "runtime/grafana/xyz987654321.json" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_grafana_batch_reports_partial_failures() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.GrafanaClient.from_env", return_value=_MixedGrafanaClient()): + response = client.post( + "/api/ingest/grafana", + json={ + "dashboards": [ + {"public_dashboard_url": "https://f4tal1t.grafana.net/public-dashboards/a56bc0dfd37746ac8d32f2bb23519ac9"}, + {"public_dashboard_url": "https://f4tal1t.grafana.net/public-dashboards/broken-token"}, + ] + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "grafana" + assert payload["ingested_count"] == 1 + assert payload["failed_count"] == 1 + + success = next(item for item in payload["results"] if item["status"] == "ingested") + failure = next(item for item in payload["results"] if item["status"] == "failed") + assert success["title"] == "Poxil Dashboard" + assert success["panel_count"] == 3 + assert "simulated grafana fetch failure" in failure["error"] + assert failure["error_detail"]["source"] == "grafana" + assert failure["error_detail"]["stage"] == "fetch" + assert failure["error_detail"]["target"] == "https://f4tal1t.grafana.net/public-dashboards/broken-token" + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/grafana/a56bc0dfd37746ac8d32f2bb23519ac9.json" for doc in docs) + assert not any(doc.path == "runtime/grafana/broken-token.json" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_jira_batch_adds_runtime_documents() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.JiraClient.from_env", return_value=_FakeJiraClient()): + response = client.post( + "/api/ingest/jira", + json={"issue_keys": ["OPS-101", "OPS-202"]}, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "jira" + assert payload["ingested_count"] == 2 + assert payload["failed_count"] == 0 + + docs = 
kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/jira/OPS-101.md" for doc in docs) + assert any(doc.path == "runtime/jira/OPS-202.md" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_jira_batch_reports_partial_failures() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.JiraClient.from_env", return_value=_MixedJiraClient()): + response = client.post( + "/api/ingest/jira", + json={"issue_keys": ["OPS-101", "OPS-404"]}, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "jira" + assert payload["ingested_count"] == 1 + assert payload["failed_count"] == 1 + + success = next(item for item in payload["results"] if item["issue_key"] == "OPS-101") + failure = next(item for item in payload["results"] if item["issue_key"] == "OPS-404") + assert success["status"] == "ingested" + assert failure["status"] == "failed" + assert "simulated jira fetch failure" in failure["error"] + assert failure["error_detail"]["source"] == "jira" + assert failure["error_detail"]["stage"] == "fetch" + assert failure["error_detail"]["target"] == "OPS-404" + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/jira/OPS-101.md" for doc in docs) + assert not any(doc.path == "runtime/jira/OPS-404.md" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_slack_channels_adds_runtime_documents() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.SlackClient.from_env", return_value=_FakeSlackClient()): + response = client.post( + "/api/ingest/slack/channels", + json={"channels": [{"channel_id": "C12345", "limit": 10}]}, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "slack" + assert payload["ingested_count"] == 1 + assert payload["failed_count"] == 0 + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/slack/channel-C12345.md" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_slack_channels_reports_partial_failures() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.SlackClient.from_env", return_value=_MixedSlackClient()): + response = client.post( + "/api/ingest/slack/channels", + json={ + "channels": [ + {"channel_id": "C12345", "limit": 10}, + {"channel_id": "C-BROKEN", "limit": 10}, + ] + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "slack" + assert payload["ingested_count"] == 1 + assert payload["failed_count"] == 1 + + success = next(item for item in payload["results"] if item["channel_id"] == "C12345") + failure = next(item for item in payload["results"] if item["channel_id"] == "C-BROKEN") + assert success["status"] == "ingested" + assert failure["status"] == "failed" + assert "simulated slack channel fetch failure" in failure["error"] + assert failure["error_detail"]["source"] == "slack" + assert failure["error_detail"]["stage"] == "fetch" + assert failure["error_detail"]["target"] == "C-BROKEN" + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/slack/channel-C12345.md" for doc in docs) + assert not any(doc.path == "runtime/slack/channel-C-BROKEN.md" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_slack_threads_adds_runtime_documents() -> None: + _clear_runtime_documents() + client 
= TestClient(app) + + with patch("app.api.routes.ingestion.SlackClient.from_env", return_value=_FakeSlackClient()): + response = client.post( + "/api/ingest/slack/threads", + json={"threads": [{"channel_id": "C12345", "thread_ts": "1712345678.123456", "limit": 10}]}, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "slack" + assert payload["ingested_count"] == 1 + assert payload["failed_count"] == 0 + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/slack/thread-C12345-1712345678_123456.md" for doc in docs) + _clear_runtime_documents() + + +def test_ingest_slack_threads_reports_partial_failures() -> None: + _clear_runtime_documents() + client = TestClient(app) + + with patch("app.api.routes.ingestion.SlackClient.from_env", return_value=_MixedSlackClient()): + response = client.post( + "/api/ingest/slack/threads", + json={ + "threads": [ + {"channel_id": "C12345", "thread_ts": "1712345678.123456", "limit": 10}, + {"channel_id": "C12345", "thread_ts": "1712345999.999999", "limit": 10}, + ] + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "slack" + assert payload["ingested_count"] == 1 + assert payload["failed_count"] == 1 + + success = next(item for item in payload["results"] if item["thread_ts"] == "1712345678.123456") + failure = next(item for item in payload["results"] if item["thread_ts"] == "1712345999.999999") + assert success["status"] == "ingested" + assert failure["status"] == "failed" + assert "simulated slack thread fetch failure" in failure["error"] + assert failure["error_detail"]["source"] == "slack" + assert failure["error_detail"]["stage"] == "fetch" + assert failure["error_detail"]["target"] == "C12345:1712345999.999999" + + docs = kernel.memory.load_documents(force_reload=True) + assert any(doc.path == "runtime/slack/thread-C12345-1712345678_123456.md" for doc in docs) + assert not any(doc.path == "runtime/slack/thread-C12345-1712345999_999999.md" for doc in docs) + _clear_runtime_documents() + + +def test_vector_status_endpoint_returns_vector_payload() -> None: + client = TestClient(app) + response = client.get("/api/vector/status") + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "vector" + assert isinstance(payload["status"], dict) + assert "mode" in payload["status"] + + +def test_vector_rebuild_endpoint_returns_rebuild_status() -> None: + _clear_runtime_documents() + client = TestClient(app) + response = client.post("/api/vector/rebuild") + + assert response.status_code == 200 + payload = response.json() + assert payload["source"] == "vector" + assert isinstance(payload["status"], dict) + assert "indexed" in payload["status"] diff --git a/backend/tests/test_memory_dedup.py b/backend/tests/test_memory_dedup.py new file mode 100644 index 000000000..19650a3d4 --- /dev/null +++ b/backend/tests/test_memory_dedup.py @@ -0,0 +1,107 @@ +import json +from pathlib import Path +import threading +import time + +from src.memory.three_tier_memory import ThreeTierMemory + + +def _build_memory(tmp_path: Path) -> ThreeTierMemory: + memory = ThreeTierMemory() + memory.repo_root = tmp_path + memory.data_root = tmp_path / "data" + memory.transcript_root = tmp_path / "backend" / ".uniops" / "transcripts" + memory.transcript_root.mkdir(parents=True, exist_ok=True) + return memory + + +def _write_file(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + 
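+    # parent directories are created first so nested fixture paths can be written in one call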
+    path.write_text(content, encoding="utf-8")
+
+
+def _write_transcript(transcript_root: Path, trace_id: str, steps: list[dict]) -> None:
+    payload = {"trace_id": trace_id, "steps": steps}
+    (transcript_root / f"{trace_id}.json").write_text(json.dumps(payload), encoding="utf-8")
+
+
+def _strip_timestamp(report: dict) -> dict:
+    clone = json.loads(json.dumps(report))
+    clone["last_run_at"] = ""
+    return clone
+
+
+def test_run_dedup_pass_identifies_document_and_transcript_duplicates(tmp_path: Path) -> None:
+    memory = _build_memory(tmp_path)
+
+    _write_file(tmp_path / "data" / "confluence" / "redis-a.md", "Redis latency runbook")
+    _write_file(tmp_path / "data" / "runbooks" / "redis-b.md", " redis latency RUNBOOK ")
+    _write_file(tmp_path / "data" / "incidents" / "inc-1.json", '{"kind":"incident"}')
+
+    _write_transcript(memory.transcript_root, "trace-01", [{"step": "retrieval", "observation": "same"}])
+    _write_transcript(memory.transcript_root, "trace-02", [{"step": "retrieval", "observation": "same"}])
+    _write_transcript(memory.transcript_root, "trace-03", [{"step": "execution", "observation": "different"}])
+
+    report = memory.run_dedup_pass()
+
+    assert report["documents"]["scanned"] == 3
+    assert report["documents"]["duplicates"] == 1
+    assert report["documents"]["duplicate_map"] == [
+        {"duplicate": "data/runbooks/redis-b.md", "retained": "data/confluence/redis-a.md"}
+    ]
+
+    assert report["transcripts"]["scanned"] == 3
+    assert report["transcripts"]["duplicates"] == 1
+    assert report["transcripts"]["duplicate_map"] == [{"duplicate": "trace-02", "retained": "trace-01"}]
+
+    assert report["deduped_count"] == 2
+
+    summary = memory.summary()["dedup_summary"]
+    assert summary["documents"]["duplicates"] == 1
+    assert summary["transcripts"]["duplicates"] == 1
+    assert summary["duplication_ratio"] == 0.3333
+
+
+def test_run_dedup_pass_is_idempotent_and_deterministic(tmp_path: Path) -> None:
+    memory = _build_memory(tmp_path)
+
+    _write_file(tmp_path / "data" / "confluence" / "same.md", "CPU runbook")
+    _write_file(tmp_path / "data" / "github" / "same-copy.md", "cpu RUNBOOK")
+
+    _write_transcript(memory.transcript_root, "trace-10", [{"step": "a", "sources": [{"p": "x"}]}])
+    _write_transcript(memory.transcript_root, "trace-11", [{"step": "a", "sources": [{"p": "x"}]}])
+
+    first = memory.run_dedup_pass()
+    second = memory.run_dedup_pass()
+
+    assert _strip_timestamp(first) == _strip_timestamp(second)
+
+    assert first["documents"]["retained"] == ["data/confluence/same.md"]
+    assert first["transcripts"]["retained"] == ["trace-10"]
+
+    summary = memory.summary()["dedup_summary"]
+    assert summary["deduped_count"] == 2
+    assert summary["documents"]["scanned"] == 2
+    assert summary["transcripts"]["scanned"] == 2
+    assert summary["duplication_ratio"] == 0.5
+
+
+def test_wait_for_transcript_returns_when_transcript_appears(tmp_path: Path) -> None:
+    memory = _build_memory(tmp_path)
+
+    def _persist_later() -> None:
+        time.sleep(0.03)
+        memory.persist_transcript(trace_id="trace-delayed", steps=[{"step": "retrieval"}])
+
+    worker = threading.Thread(target=_persist_later, daemon=True)
+    worker.start()
+
+    transcript = memory.wait_for_transcript(
+        trace_id="trace-delayed",
+        timeout_seconds=0.3,
+        poll_interval_seconds=0.01,
+    )
+
+    worker.join(timeout=0.3)
+    assert transcript is not None
+    assert transcript["trace_id"] == "trace-delayed"
diff --git a/backend/tests/test_reasoning_tuning.py b/backend/tests/test_reasoning_tuning.py
new file mode 100644
index 000000000..983079147
--- /dev/null
+++ b/backend/tests/test_reasoning_tuning.py
@@ -0,0 +1,212 @@
+import pytest
+
+from src.adapters.llm_client import LLMProviderConfigurationError, LLMProviderRuntimeError
+from src.controller.controller import ControllerKernel
+from src.swarms.reasoning_swarm import ReasoningSwarm
+
+
+class _FakeSuccessLLMClient:
+    provider_name = "apfel"
+    model_name = "apfel-test"
+
+    def reason(self, query: str, confidence: float, top_sources: list[dict], dedup_summary: dict | None) -> dict[str, object]:
+        return {
+            "reasoning": f"LLM reasoning for query: {query}",
+            "answer": "Use runbook first and request approval for external updates.",
+            "suggested_action": "summarize findings and request approval for external actions",
+            "reasoning_steps": ["Ranked evidence", "Prepared safe action"],
+            "confidence_breakdown": {"base_confidence": 0.65, "final_confidence": 0.72},
+            "evidence_scores": [
+                {
+                    "title": "Runbook",
+                    "path": "data/runbooks/high-cpu-service-x.md",
+                    "source_type": "runbooks",
+                    "raw_score": 2,
+                    "priority_score": 2.25,
+                }
+            ],
+        }
+
+    def expand_query_terms(self, query: str, query_tokens: list[str]) -> list[str]:
+        return query_tokens
+
+    def assess_execution_action(self, action: str, action_details: dict | None) -> dict[str, str | None]:
+        return {
+            "normalized_action": action,
+            "reasoning": "Execution assessment completed.",
+            "risk_hint": "medium",
+        }
+
+
+class _FakeFailingLLMClient:
+    provider_name = "groq"
+    model_name = "groq-test"
+
+    def reason(self, query: str, confidence: float, top_sources: list[dict], dedup_summary: dict | None) -> dict[str, object]:
+        raise RuntimeError("provider unavailable")
+
+
+def _sample_context() -> dict:
+    return {
+        "query": "redis latency incident",
+        "sources": [
+            {
+                "title": "Runbook",
+                "path": "data/runbooks/high-cpu-service-x.md",
+                "source_type": "runbooks",
+                "score": 2,
+            },
+            {
+                "title": "Incident",
+                "path": "data/incidents/incident-2026-04-08.json",
+                "source_type": "incidents",
+                "score": 1,
+            },
+        ],
+        "dedup_summary": {
+            "documents": {"scanned": 5, "duplicates": 0},
+            "transcripts": {"scanned": 10, "duplicates": 0},
+            "deduped_count": 0,
+            "duplication_ratio": 0.0,
+            "last_run_at": "2026-04-16T00:00:00Z",
+        },
+    }
+
+
+def test_reasoning_reranks_sources_by_quality_weight() -> None:
+    swarm = ReasoningSwarm(provider_name="apfel", llm_client=_FakeSuccessLLMClient())
+    result = swarm.run(
+        {
+            "query": "redis latency incident",
+            "sources": [
+                {
+                    "title": "Slack Thread",
+                    "path": "data/slack/customer-xyz-thread.md",
+                    "source_type": "slack",
+                    "score": 2,
+                },
+                {
+                    "title": "Runbook",
+                    "path": "data/runbooks/high-cpu-service-x.md",
+                    "source_type": "runbooks",
+                    "score": 2,
+                },
+            ],
+            "dedup_summary": {
+                "documents": {"scanned": 5, "duplicates": 0},
+                "transcripts": {"scanned": 10, "duplicates": 0},
+                "deduped_count": 0,
+                "duplication_ratio": 0.0,
+                "last_run_at": "2026-04-16T00:00:00Z",
+            },
+        }
+    )
+
+    assert result["sources"][0]["source_type"] == "runbooks"
+    assert result["confidence"] >= 0.65
+    assert isinstance(result["reasoning_steps"], list)
+    assert isinstance(result["confidence_breakdown"], dict)
+    assert isinstance(result["evidence_scores"], list)
+
+
+def test_reasoning_confidence_drops_with_high_duplication_ratio() -> None:
+    swarm = ReasoningSwarm(provider_name="apfel", llm_client=_FakeSuccessLLMClient())
+    base_context = {
+        "query": "investigate incident",
+        "sources": [
+            {"title": "Incident", "path": "data/incidents/x.json", "source_type": "incidents", "score": 3},
+            {"title": "Runbook", "path":
"data/runbooks/x.md", "source_type": "runbooks", "score": 2}, + ], + } + + high_quality = swarm.run( + { + **base_context, + "dedup_summary": { + "documents": {"scanned": 5, "duplicates": 0}, + "transcripts": {"scanned": 10, "duplicates": 0}, + "deduped_count": 0, + "duplication_ratio": 0.0, + "last_run_at": "2026-04-16T00:00:00Z", + }, + } + ) + low_quality = swarm.run( + { + **base_context, + "dedup_summary": { + "documents": {"scanned": 5, "duplicates": 4}, + "transcripts": {"scanned": 10, "duplicates": 8}, + "deduped_count": 12, + "duplication_ratio": 0.8, + "last_run_at": "2026-04-16T00:00:00Z", + }, + } + ) + + assert high_quality["confidence"] > low_quality["confidence"] + + +def test_controller_passes_dedup_summary_to_reasoning_swarm() -> None: + kernel = ControllerKernel(provider_name="groq", reasoning_llm_client=_FakeSuccessLLMClient()) + captured_context: dict = {} + + def fake_reasoning_run(context: dict) -> dict: + captured_context.update(context) + return { + "reasoning": "ok", + "answer": "ok", + "suggested_action": "summarize findings and request approval for external actions", + "action_details": { + "intent": "summarize_and_request_approval", + "tool": "planner.summarize_and_request_approval", + "parameters": {}, + "approval_required": True, + "risk_hint": "medium", + }, + "confidence": 0.7, + "sources": context["sources"][:3], + } + + kernel.reasoning_swarm.run = fake_reasoning_run # type: ignore[assignment] + result = kernel.handle_query("redis incident", session_id="sess-reasoning-dedup") + + assert "dedup_summary" in captured_context + assert "deduped_count" in captured_context["dedup_summary"] + assert isinstance(result.trace, list) + assert len(result.trace) == 3 + + +def test_reasoning_uses_provider_client_when_selected() -> None: + swarm = ReasoningSwarm(provider_name="apfel", llm_client=_FakeSuccessLLMClient()) + result = swarm.run(_sample_context()) + + assert result["provider"] == "apfel" + assert result["model"] == "apfel-test" + assert result["suggested_action"] == "summarize findings and request approval for external actions" + assert isinstance(result["action_details"], dict) + assert result["action_details"]["intent"] + assert isinstance(result["reasoning_steps"], list) + assert isinstance(result["confidence_breakdown"], dict) + assert isinstance(result["evidence_scores"], list) + + +def test_reasoning_strict_mode_has_no_deterministic_fallback_on_provider_failure() -> None: + swarm = ReasoningSwarm(provider_name="groq", llm_client=_FakeFailingLLMClient()) + + with pytest.raises(LLMProviderRuntimeError): + swarm.run(_sample_context()) + + +def test_reasoning_invalid_provider_raises_configuration_error() -> None: + swarm = ReasoningSwarm(provider_name="invalid-provider") + + with pytest.raises(LLMProviderConfigurationError): + swarm.run(_sample_context()) + + +def test_reasoning_no_sources_fails_in_strict_mode() -> None: + swarm = ReasoningSwarm(provider_name="groq", llm_client=_FakeSuccessLLMClient()) + + with pytest.raises(LLMProviderRuntimeError): + swarm.run({"query": "redis incident", "sources": [], "dedup_summary": {}}) diff --git a/backend/tests/test_retrieval_execution_groq.py b/backend/tests/test_retrieval_execution_groq.py new file mode 100644 index 000000000..492ebaee1 --- /dev/null +++ b/backend/tests/test_retrieval_execution_groq.py @@ -0,0 +1,155 @@ +import pytest + +from src.adapters.llm_client import LLMProviderRuntimeError +from src.gates.permission_gate import PermissionGate +from src.memory.three_tier_memory import MemoryDocument +from 
src.swarms.execution_swarm import ExecutionSwarm +from src.swarms.retrieval_swarm import RetrievalSwarm + + +class _StubMemory: + def __init__(self) -> None: + self._documents = [ + MemoryDocument( + title="Redis Latency Runbook", + path="data/runbooks/high-cpu-service-x.md", + source_type="runbooks", + content="Redis latency runbook covers diagnostic and rollback readiness.", + ), + MemoryDocument( + title="Incident Timeline", + path="data/incidents/incident-2026-04-08.json", + source_type="incidents", + content="Incident notes mention cache saturation and redis latency.", + ), + ] + + def load_documents(self) -> list[MemoryDocument]: + return self._documents + + +class _FakeRetrievalLLMClient: + provider_name = "groq" + model_name = "groq-retrieval-test" + + def expand_query_terms(self, query: str, query_tokens: list[str]) -> list[str]: + return [*query_tokens, "cache", "throughput"] + + def reason(self, query, confidence, top_sources, dedup_summary): # pragma: no cover - not used in this test + raise NotImplementedError + + def assess_execution_action(self, action, action_details): # pragma: no cover - not used in this test + raise NotImplementedError + + +class _FakeFailingRetrievalLLMClient(_FakeRetrievalLLMClient): + def expand_query_terms(self, query: str, query_tokens: list[str]) -> list[str]: + raise RuntimeError("provider unavailable") + + +class _FakeExecutionLLMClient: + provider_name = "groq" + model_name = "groq-execution-test" + + def reason(self, query, confidence, top_sources, dedup_summary): # pragma: no cover - not used in this test + raise NotImplementedError + + def expand_query_terms(self, query, query_tokens): # pragma: no cover - not used in this test + raise NotImplementedError + + def assess_execution_action(self, action: str, action_details: dict | None) -> dict: + return { + "normalized_action": "create rollback PR and notify Slack and Jira", + "reasoning": "Action modifies external systems; keep approval required.", + "risk_hint": "high", + } + + +class _FakeFailingExecutionLLMClient(_FakeExecutionLLMClient): + def assess_execution_action(self, action: str, action_details: dict | None) -> dict: + raise RuntimeError("provider unavailable") + + +def test_retrieval_uses_llm_query_expansion_when_provider_selected() -> None: + swarm = RetrievalSwarm( + memory=_StubMemory(), + provider_name="groq", + llm_client=_FakeRetrievalLLMClient(), + ) + + result = swarm.run("redis latency") + + assert result["llm_query_expansion"]["used"] is True + assert "cache" in result["query_tokens"] + assert result["llm_query_expansion"]["provider"] == "groq" + + +def test_retrieval_strict_mode_raises_on_provider_failure() -> None: + swarm = RetrievalSwarm( + memory=_StubMemory(), + provider_name="groq", + llm_client=_FakeFailingRetrievalLLMClient(), + ) + + with pytest.raises(LLMProviderRuntimeError): + swarm.run("redis latency") + + +def test_retrieval_strict_mode_raises_when_provider_is_missing() -> None: + swarm = RetrievalSwarm( + memory=_StubMemory(), + provider_name="", + llm_client=_FakeRetrievalLLMClient(), + ) + + with pytest.raises(LLMProviderRuntimeError): + swarm.run("redis latency") + + +def test_execution_uses_llm_assessment_when_provider_selected() -> None: + swarm = ExecutionSwarm( + permission_gate=PermissionGate(), + provider_name="groq", + llm_client=_FakeExecutionLLMClient(), + ) + + result = swarm.run( + trace_id="trace-groq-execution", + action="rollback maybe", + action_details={"intent": "rollback_and_notify"}, + ) + + assert result["provider"] == "groq" + assert 
result["model"] == "groq-execution-test" + assert result["action"] == "create rollback PR and notify Slack and Jira" + assert result["execution_reasoning"] + + +def test_execution_strict_mode_raises_on_provider_failure() -> None: + swarm = ExecutionSwarm( + permission_gate=PermissionGate(), + provider_name="groq", + llm_client=_FakeFailingExecutionLLMClient(), + ) + + with pytest.raises(LLMProviderRuntimeError): + swarm.run( + trace_id="trace-groq-execution-fail", + action="rollback maybe", + action_details={"intent": "rollback_and_notify"}, + ) + + +def test_execution_strict_mode_raises_when_provider_is_missing() -> None: + swarm = ExecutionSwarm( + permission_gate=PermissionGate(), + provider_name="", + llm_client=_FakeExecutionLLMClient(), + ) + + with pytest.raises(LLMProviderRuntimeError): + swarm.run( + trace_id="trace-groq-execution-missing-provider", + action="rollback maybe", + action_details={"intent": "rollback_and_notify"}, + ) diff --git a/backend/tests/test_tool_registry_adapters.py b/backend/tests/test_tool_registry_adapters.py new file mode 100644 index 000000000..abcbb5ceb --- /dev/null +++ b/backend/tests/test_tool_registry_adapters.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +from src.tools.executor import ToolExecutor +from src.tools.registry import ToolRegistry, ToolRegistryError + + +def test_tool_registry_register_and_execute() -> None: + registry = ToolRegistry() + registry.register_tool( + name="example.tool", + description="Example tool for registry tests.", + read_only=True, + handler=lambda params: {"status": "executed", "output": f"processed:{params['value']}"}, + ) + + assert registry.list_tools() == ["example.tool"] + result = registry.execute_tool("example.tool", {"value": "ok"}) + + assert result["tool"] == "example.tool" + assert result["status"] == "executed" + assert result["output"] == "processed:ok" + + +def test_tool_registry_rejects_unknown_tool() -> None: + registry = ToolRegistry() + + try: + registry.execute_tool("unknown.tool", {}) + assert False, "Expected ToolRegistryError for unknown tool" + except ToolRegistryError as exc: + assert "not registered" in str(exc) + + +def test_executor_maps_multi_tool_action_and_executes_in_order(monkeypatch) -> None: + monkeypatch.setenv("GITHUB_REPOSITORY", "org/repo") + monkeypatch.setenv("GITHUB_ISSUE_NUMBER", "123") + monkeypatch.setenv("SLACK_CHANNEL_ID", "C001") + monkeypatch.setenv("SLACK_THREAD_TS", "1712345678.123456") + monkeypatch.setenv("SLACK_CONTEXT_LIMIT", "5") + + call_order: list[str] = [] + registry = ToolRegistry() + + def _github_handler(params: dict[str, object]) -> dict[str, object]: + call_order.append("github.fetch_issue") + assert params["repository"] == "org/repo" + assert params["issue_number"] == 123 + return {"status": "executed", "output": "github ok"} + + def _slack_handler(params: dict[str, object]) -> dict[str, object]: + call_order.append("slack.fetch_thread_messages") + assert params["channel"] == "C001" + assert params["thread_ts"] == "1712345678.123456" + assert params["limit"] == 5 + return {"status": "executed", "output": "slack ok"} + + def _jira_handler(params: dict[str, object]) -> dict[str, object]: + call_order.append("jira.fetch_issue") + assert params["issue_key"] == "OPS-101" + return {"status": "executed", "output": "jira ok"} + + registry.register_tool( + name="github.fetch_issue", + description="GitHub issue fetch", + read_only=True, + handler=_github_handler, + ) + registry.register_tool( + name="slack.fetch_thread_messages", + description="Slack 
thread fetch", + read_only=True, + handler=_slack_handler, + ) + registry.register_tool( + name="jira.fetch_issue", + description="Jira issue fetch", + read_only=True, + handler=_jira_handler, + ) + + executor = ToolExecutor(registry=registry) + result = executor.execute("Create rollback PR, fetch Slack thread, and update Jira OPS-101") + + assert result["status"] == "executed" + assert len(result["details"]) == 3 + assert [item["tool"] for item in result["details"]] == [ + "github.fetch_issue", + "slack.fetch_thread_messages", + "jira.fetch_issue", + ] + assert call_order == [ + "github.fetch_issue", + "slack.fetch_thread_messages", + "jira.fetch_issue", + ] + + +def test_executor_fails_when_jira_keyword_has_no_issue_key(monkeypatch) -> None: + monkeypatch.setenv("SLACK_CHANNEL_ID", "C001") + + registry = ToolRegistry() + registry.register_tool( + name="jira.fetch_issue", + description="Jira issue fetch", + read_only=True, + handler=lambda params: {"status": "executed", "output": "ok"}, + ) + + executor = ToolExecutor(registry=registry) + result = executor.execute("Notify Jira about incident update") + + assert result["status"] == "failed" + assert "No Jira issue key found" in result["output"] + + +def test_executor_fails_when_slack_thread_target_missing(monkeypatch) -> None: + monkeypatch.setenv("SLACK_CHANNEL_ID", "C001") + monkeypatch.delenv("SLACK_THREAD_TS", raising=False) + + registry = ToolRegistry() + registry.register_tool( + name="slack.fetch_thread_messages", + description="Slack thread fetch", + read_only=True, + handler=lambda params: {"status": "executed", "output": "ok"}, + ) + + executor = ToolExecutor(registry=registry) + result = executor.execute("Fetch Slack thread for incident response") + + assert result["status"] == "failed" + assert "SLACK_THREAD_TS" in result["output"] + + +def test_executor_fails_fast_when_required_env_missing(monkeypatch) -> None: + monkeypatch.delenv("GITHUB_REPOSITORY", raising=False) + monkeypatch.delenv("GITHUB_ISSUE_NUMBER", raising=False) + + registry = ToolRegistry() + registry.register_tool( + name="github.fetch_issue", + description="GitHub issue fetch", + read_only=True, + handler=lambda params: {"status": "executed", "output": "ok"}, + ) + + executor = ToolExecutor(registry=registry) + result = executor.execute("Create rollback PR") + + assert result["status"] == "failed" + assert "GITHUB_REPOSITORY" in result["output"] + + +def test_executor_supports_confluence_read_only_dispatch(monkeypatch) -> None: + monkeypatch.setenv("CONFLUENCE_PAGE_ID", "65868") + + registry = ToolRegistry() + registry.register_tool( + name="confluence.fetch_page", + description="Confluence read-only page fetch", + read_only=True, + handler=lambda params: { + "status": "executed", + "output": f"Fetched {params['page_id']}", + }, + ) + + executor = ToolExecutor(registry=registry) + result = executor.execute("Fetch confluence runbook page") + + assert result["status"] == "executed" + assert len(result["details"]) == 1 + assert result["details"][0]["tool"] == "confluence.fetch_page" + + +def test_executor_maps_iris_create_incident(monkeypatch) -> None: + monkeypatch.setenv("IRIS_CASE_CUSTOMER_ID", "1") + monkeypatch.setenv("IRIS_CASE_SOC_ID", "") + + registry = ToolRegistry() + captured: dict[str, object] = {} + + def _iris_handler(params: dict[str, object]) -> dict[str, object]: + captured.update(params) + return { + "status": "executed", + "output": "Created IRIS incident 9001", + "incident": {"case_id": "9001"}, + } + + registry.register_tool( + 
name="iris.create_incident", + description="IRIS mutation", + read_only=False, + handler=_iris_handler, + ) + + executor = ToolExecutor(registry=registry) + result = executor.execute('Create incident for redis latency in prod "Redis Latency Spike"') + + assert result["status"] == "executed" + assert len(result["details"]) == 1 + assert result["details"][0]["tool"] == "iris.create_incident" + assert captured["case_name"] == "Redis Latency Spike" + assert captured["severity"] == "medium" diff --git a/data/confluence/redis-latency-runbook.md b/data/confluence/redis-latency-runbook.md new file mode 100644 index 000000000..03c120d73 --- /dev/null +++ b/data/confluence/redis-latency-runbook.md @@ -0,0 +1,5 @@ +# Redis Latency Runbook (Sample) + +1. Check recent deployment and config changes. +2. Check memory pressure and key eviction metrics. +3. Apply safe rollback only after approval. diff --git a/data/github/pr-rollback-example.md b/data/github/pr-rollback-example.md new file mode 100644 index 000000000..335d28dee --- /dev/null +++ b/data/github/pr-rollback-example.md @@ -0,0 +1,5 @@ +# PR Rollback Example (Sample) + +- PR: #142 +- Reason: Increased latency and error rate +- Action: Revert commit and trigger deployment diff --git a/data/incidents/incident-2026-04-08.json b/data/incidents/incident-2026-04-08.json new file mode 100644 index 000000000..e0a566bc9 --- /dev/null +++ b/data/incidents/incident-2026-04-08.json @@ -0,0 +1,7 @@ +{ + "id": "INC-2026-04-08-001", + "service": "service-x", + "severity": "SEV-2", + "summary": "Redis latency spike after deployment", + "status": "resolved" +} diff --git a/data/incidents/incident-2026-04-17-locust-capacity-saturation.json b/data/incidents/incident-2026-04-17-locust-capacity-saturation.json new file mode 100644 index 000000000..d3bc4d928 --- /dev/null +++ b/data/incidents/incident-2026-04-17-locust-capacity-saturation.json @@ -0,0 +1,58 @@ +{ + "id": "INC-2026-04-17-LOCUST-003", + "source_system": "locust", + "case_id": "LT-2026-04-17-003", + "service": "platform-edge", + "severity": "SEV-2", + "summary": "Platform saturation risk detected at 80 simulated users", + "short_description": "Aggregated load test metrics show high tail latency and high failure throughput while user count remained at 80.", + "status": "open", + "tags": [ + "load-test", + "capacity", + "saturation", + "performance-regression" + ], + "metrics": { + "total_request_count": 24775, + "total_failure_count": 19295, + "overall_failure_rate_percent": 77.9, + "total_avg_response_ms": 314.81, + "total_p95_response_ms": 460, + "total_p99_response_ms": 740, + "total_max_response_ms": 6422.06, + "peak_requests_per_second": 148.6, + "peak_failures_per_second": 117.0 + }, + "evidence": { + "stats_file": "scripts/artifacts/locust_incident_demo_stats.csv", + "history_file": "scripts/artifacts/locust_incident_demo_stats_history.csv", + "exceptions_file": "scripts/artifacts/locust_incident_demo_exceptions.csv", + "user_count_plateau": 80 + }, + "timeline": [ + { + "timestamp_epoch": 1776382064, + "event": "Traffic ramps and first non-zero failures appear", + "requests_per_second": 26.888889, + "failures_per_second": 0.0 + }, + { + "timestamp_epoch": 1776382076, + "event": "Failure rate increases rapidly during sustained load", + "requests_per_second": 142.1, + "failures_per_second": 99.6 + }, + { + "timestamp_epoch": 1776382161, + "event": "System remains near throughput ceiling with persistent errors", + "requests_per_second": 147.0, + "failures_per_second": 116.1 + } + ], + 
"recommended_actions": [ + "Run endpoint-specific soak tests after introducing backpressure and queue limits.", + "Validate autoscaling thresholds against 80+ concurrent user scenarios.", + "Add per-endpoint budget alerts for 429 rate and p99 latency simultaneously." + ] +} diff --git a/data/incidents/incident-2026-04-17-locust-latency-regression.json b/data/incidents/incident-2026-04-17-locust-latency-regression.json new file mode 100644 index 000000000..e866c32b0 --- /dev/null +++ b/data/incidents/incident-2026-04-17-locust-latency-regression.json @@ -0,0 +1,53 @@ +{ + "id": "INC-2026-04-17-LOCUST-002", + "source_system": "locust", + "case_id": "LT-2026-04-17-002", + "service": "health-monitoring-path", + "severity": "SEV-2", + "summary": "Health endpoint latency regression under concurrent user ramp", + "short_description": "GET /health remained successful but showed tail latency spikes indicating capacity stress.", + "status": "open", + "tags": [ + "load-test", + "latency", + "tail-latency", + "health-endpoint" + ], + "metrics": { + "request_count": 4980, + "failure_count": 0, + "failure_rate_percent": 0.0, + "median_response_ms": 280, + "avg_response_ms": 312.96, + "p95_response_ms": 440, + "p99_response_ms": 800, + "max_response_ms": 6304.33 + }, + "evidence": { + "stats_file": "scripts/artifacts/locust_incident_demo_stats.csv", + "endpoint": "GET /health", + "observation": "No hard failures but high tail latency and >6s max response time under load." + }, + "timeline": [ + { + "timestamp_epoch": 1776382060, + "event": "First extreme latency spike observed during ramp", + "p95_ms": 6300 + }, + { + "timestamp_epoch": 1776382071, + "event": "Tail latency remains elevated while traffic increases", + "p95_ms": 4000 + }, + { + "timestamp_epoch": 1776382120, + "event": "Latency partially recovers but remains above low-load baseline", + "p95_ms": 570 + } + ], + "recommended_actions": [ + "Profile health handler dependencies and reduce synchronous blocking calls.", + "Add connection pool and thread/worker telemetry to correlate spikes.", + "Define and enforce a p95 health latency SLO during load tests." 
+ ] +} diff --git a/data/incidents/incident-2026-04-17-locust-rate-limit.json b/data/incidents/incident-2026-04-17-locust-rate-limit.json new file mode 100644 index 000000000..678de8238 --- /dev/null +++ b/data/incidents/incident-2026-04-17-locust-rate-limit.json @@ -0,0 +1,55 @@ +{ + "id": "INC-2026-04-17-LOCUST-001", + "source_system": "locust", + "case_id": "LT-2026-04-17-001", + "service": "api-gateway", + "severity": "SEV-1", + "summary": "Severe rate limiting on GET /api/projects during load test", + "short_description": "Load test generated sustained HTTP 429 responses on the unauthorized projects endpoint.", + "status": "open", + "tags": [ + "load-test", + "rate-limit", + "http-429", + "api-gateway" + ], + "metrics": { + "request_count": 19795, + "failure_count": 19295, + "failure_rate_percent": 97.47, + "median_response_ms": 280, + "avg_response_ms": 315.27, + "p95_response_ms": 460, + "p99_response_ms": 730, + "max_response_ms": 6422.06 + }, + "evidence": { + "stats_file": "scripts/artifacts/locust_incident_demo_stats.csv", + "failures_file": "scripts/artifacts/locust_incident_demo_failures.csv", + "error_signature": "Expected 401 but got 429: Too many requests, please try again later.", + "failure_occurrences": 19295, + "endpoint": "GET /api/projects (unauthorized)" + }, + "timeline": [ + { + "timestamp_epoch": 1776382068, + "event": "Failure rate starts rising above baseline", + "failures_per_second": 8.6 + }, + { + "timestamp_epoch": 1776382075, + "event": "High failure rate sustained", + "failures_per_second": 88.1 + }, + { + "timestamp_epoch": 1776382081, + "event": "Failure rate peaks in early saturation window", + "failures_per_second": 117.0 + } + ], + "recommended_actions": [ + "Review API gateway/auth middleware rate-limit policy for unauthorized requests.", + "Split 401 and 429 handling paths to preserve expected security semantics.", + "Add dedicated alert on sudden 429 spike for auth-protected endpoints." 
+ ] +} diff --git a/data/iris/import_bundle/iris-import-manifest.json b/data/iris/import_bundle/iris-import-manifest.json new file mode 100644 index 000000000..0e494f2c6 --- /dev/null +++ b/data/iris/import_bundle/iris-import-manifest.json @@ -0,0 +1,18 @@ +{ + "bundle_name": "iris-incident-resolution-bundle", + "project_key": "SERVICE-X", + "service": "service-x", + "generated_at": "2026-04-16T00:55:57.903404+00:00", + "source_files": [ + "data/confluence/redis-latency-runbook.md", + "data/runbooks/high-cpu-service-x.md", + "data/incidents/incident-2026-04-08.json", + "data/github/pr-rollback-example.md", + "data/slack/customer-xyz-thread.md" + ], + "output_files": [ + "iris-incident-seed.json", + "iris-resolution-plan.json", + "iris-runbook-mapping.json" + ] +} \ No newline at end of file diff --git a/data/iris/import_bundle/iris-incident-seed.json b/data/iris/import_bundle/iris-incident-seed.json new file mode 100644 index 000000000..aaf95aaa2 --- /dev/null +++ b/data/iris/import_bundle/iris-incident-seed.json @@ -0,0 +1,15 @@ +{ + "project_key": "SERVICE-X", + "source": "uniops-data-bundle", + "external_incident_id": "INC-2026-04-08-001", + "service": "service-x", + "severity": "SEV-2", + "summary": "Redis latency spike after deployment", + "status": "resolved", + "tags": [ + "redis", + "latency", + "production" + ], + "created_at": "2026-04-16T00:55:57.903404+00:00" +} \ No newline at end of file diff --git a/data/iris/import_bundle/iris-resolution-plan.json b/data/iris/import_bundle/iris-resolution-plan.json new file mode 100644 index 000000000..ad7a3f73d --- /dev/null +++ b/data/iris/import_bundle/iris-resolution-plan.json @@ -0,0 +1,59 @@ +{ + "project_key": "SERVICE-X", + "service": "service-x", + "incident_summary": "Redis latency spike after deployment", + "workflow": [ + "detect", + "triage", + "diagnose", + "propose_action", + "approval", + "execute", + "resolve", + "postmortem" + ], + "runbooks": [ + { + "name": "Redis Latency Runbook", + "source_path": "data/confluence/redis-latency-runbook.md", + "steps": [ + "Check recent deployment and config changes.", + "Check memory pressure and key eviction metrics.", + "Apply safe rollback only after approval." + ] + }, + { + "name": "High CPU Runbook for Service X", + "source_path": "data/runbooks/high-cpu-service-x.md", + "steps": [ + "Confirm alert threshold and duration.", + "Check latest PRs and feature flags.", + "Scale up only after explicit approval." + ] + } + ], + "approval_policy": { + "required_for_actions": [ + "rollback", + "deploy", + "update", + "scale", + "create" + ], + "approver_role": "sre_lead", + "note": "Rollback and scale actions require explicit SRE approval before execution." + }, + "operational_evidence": { + "github": { + "source_path": "data/github/pr-rollback-example.md", + "pr": "#142", + "reason": "Increased latency and error rate", + "action": "Revert commit and trigger deployment" + }, + "slack": { + "source_path": "data/slack/customer-xyz-thread.md", + "summary": "# Slack Thread Summary (Sample)\n\nCustomer XYZ experienced elevated latency after enabling feature flag `beta_cache_path`.\nKey takeaway: rollback decision required explicit SRE approval." 
+ } + }, + "generated_at": "2026-04-16T00:55:57.903404+00:00" +} \ No newline at end of file diff --git a/data/iris/import_bundle/iris-runbook-mapping.json b/data/iris/import_bundle/iris-runbook-mapping.json new file mode 100644 index 000000000..960e63755 --- /dev/null +++ b/data/iris/import_bundle/iris-runbook-mapping.json @@ -0,0 +1,28 @@ +{ + "project_key": "SERVICE-X", + "incident_type": "redis_latency_spike_after_deployment", + "service": "service-x", + "severity_map": { + "SEV-1": "critical", + "SEV-2": "high", + "SEV-3": "medium", + "SEV-4": "low" + }, + "runbook_links": [ + { + "title": "Redis Latency Runbook", + "category": "confluence", + "path": "data/confluence/redis-latency-runbook.md" + }, + { + "title": "High CPU Runbook for Service X", + "category": "runbooks", + "path": "data/runbooks/high-cpu-service-x.md" + } + ], + "required_context_sources": [ + "data/incidents/incident-2026-04-08.json", + "data/github/pr-rollback-example.md", + "data/slack/customer-xyz-thread.md" + ] +} \ No newline at end of file diff --git a/data/runbooks/high-cpu-service-x.md b/data/runbooks/high-cpu-service-x.md new file mode 100644 index 000000000..00359f79b --- /dev/null +++ b/data/runbooks/high-cpu-service-x.md @@ -0,0 +1,5 @@ +# High CPU Runbook for Service X (Sample) + +1. Confirm alert threshold and duration. +2. Check latest PRs and feature flags. +3. Scale up only after explicit approval. diff --git a/data/slack/customer-xyz-thread.md b/data/slack/customer-xyz-thread.md new file mode 100644 index 000000000..7684c0e89 --- /dev/null +++ b/data/slack/customer-xyz-thread.md @@ -0,0 +1,4 @@ +# Slack Thread Summary (Sample) + +Customer XYZ experienced elevated latency after enabling feature flag `beta_cache_path`. +Key takeaway: rollback decision required explicit SRE approval. diff --git a/docs/FEATURE_GAP_AUDIT.md b/docs/FEATURE_GAP_AUDIT.md new file mode 100644 index 000000000..6230f530b --- /dev/null +++ b/docs/FEATURE_GAP_AUDIT.md @@ -0,0 +1,187 @@ +# UniOps Feature GAP AUDIT + +Date: 2026-04-17 +Source of scope: TO-DO.md + +## 1. Baseline Policy and Scope + +This audit follows the current product policy: + +- External write operations are intentionally not executed from UniOps runtime. +- Execution behavior is planner-only for external actions (GitHub, Slack, Jira writes remain no-write by design). +- Human approval is still required for high-risk actions. +- The execution agent must return a clear action plan and audit trail, not a mutation claim. + +This document converts TO-DO feature items into: + +- current implementation percentage, +- implementation gaps, +- required implementation to close each gap, +- vertical-slice build order. + +## 2. Scoring Method + +Percentages are based on four checks per feature area: + +1. API or runtime path exists. +2. Contract and metadata are complete and unambiguous. +3. Tests cover happy path and failure path. +4. Demo/operator readiness is verified and documented. + +## 3. Feature-Wise Implementation Status and Required Work + +| Feature Area | Current % | Current State | Implementation Required | Acceptance Criteria | +|---|---:|---|---|---| +| Controller pipeline with swarm chaining | 100% | Implemented and verified on main; retrieval -> reasoning -> execution chain stable. | No functional gap; keep regression coverage for pipeline sequencing and trace ordering. | Existing orchestration tests continue to pass with trace order unchanged. 
| +| Retrieval + reasoning output schema and citation handoff | 95% | Implemented with confidence metadata, reasoning steps, evidence scores, source citations. | Normalize action detail semantics for planner mode (remove outdated tool naming from reasoning metadata). | Reasoning step metadata exposes planner intent only and remains schema compatible. | +| Permission policy rules for HITL decisions | 90% | Native permission gate works; high-risk actions route to approval. | Tighten policy output text so approval means approve plan execution workflow, not external mutation execution. | Approval gating messages are explicit about planner-only mode in backend response metadata. | +| Memory summary and Kairos-lite dedup pass API | 93% | Dedup pass and dedup summary are implemented and tested. | Add optional scheduled dedup trigger policy and persistence strategy note for runtime-ingested docs. | Dedup can run on schedule or explicit trigger; status visible in audit metadata. | +| Core MVP golden flow: IRIS + Confluence end-to-end on main baseline | 92% | Flow is operational and validated in logs (ingest -> chat -> stream -> approval -> transcript), with planner-only completion semantics now implemented. | Add stronger failure-path assertions and policy-focused negative tests. | Golden flow transcript clearly marks planner mode and produces deterministic final plan status. | +| HITL completion path: pending approval -> approve/reject -> audit trace | 90% | Path stores approval decisions, planner-only execution mode, and plan approval/rejection statuses in transcript and response payloads. | Add enriched decision rationale fields and lifecycle event consistency checks. | Approval response includes execution_mode=planner_only and no misleading mutation semantics. | +| Live demo runbook finalization with validated IDs | 76% | Valid IDs are known and used in validation logs. | Consolidate defaults in one canonical config path and document fallback behavior for missing env values. | One canonical runbook for demo defaults; frontend and script flows use same IDs by default. | +| SSE live trace stream and transcript lifecycle | 86% | Trace events, heartbeat, complete/error events are implemented. | Add reconnect-safe and timeout-hardening behavior and include planner-mode annotation in terminal events. | SSE resilience tests pass for reconnect/timeout paths; trace_complete metadata includes planner mode. | +| Ingestion APIs (Confluence, IRIS, GitHub, Jira, Slack) | 88% | Endpoints implemented and benchmarked; partial-failure reporting exists. | Add unified adapter error envelope and retry strategy guidelines for flaky upstreams. | Ingestion responses show consistent error structure; retry behavior documented and tested for one source. | +| Frontend live demo wiring (chat, trace, approval, transcript) | 86% | Core UX implemented and validated in browser, with execution mode surfaced in system and transcript cards. | Render structured execution plan steps and add stronger policy copy in chat/approval UX. | UI never implies external write execution; plan details are visible after approval. | +| Shared contracts for chat, stream, ingestion, approvals, transcript | 95% | Planner-only fields and status enums are now added for approval/transcript/trace metadata. | Add explicit plan object shape guidance for stream payload examples. | Contract and frontend typings compile with new planner fields and all tests pass. 
| +| Tool execution adapter layer (planner-safe) | 89% | Planner executor now generates plan artifacts with no-write metadata and rollback/precheck steps. | Add deeper intent-specific plan templates and policy tests for mutation-claim prevention. | Any approved action returns plan artifact with explicit no-write metadata and audit step. | +| Verification and benchmark assets | 90% | Benchmarks and E2E API test exist; broad test suite passing in logs. | Add policy tests that fail on mutation-claim wording and fail if execution_mode is missing. | Policy tests pass; no runtime payload claims external write completion. | +| Documentation consistency (TO-DO and handoff docs) | 65% | Status evidence exists but some sections remain stale or contradictory. | Reconcile In Progress vs Verification Log and align docs to planner-only semantics. | TO-DO and working docs contain no contradictions on completion and policy. | + +## 4. Summary Completion + +- Weighted overall completion: 89% +- Primary remaining work: reliability hardening, richer plan templates in UI, and documentation consistency updates. + +## 5. Vertical Slice Build Plan + +## Slice 1: Planner-Only Execution Semantics (Highest Priority) + +Goal: make runtime truthfully planner-only for external actions. + +Implementation: + +1. Introduce execution_mode in execution, approval, and transcript payloads with value planner_only. +2. Replace ambiguous statuses (executed for external writes) with plan_generated, plan_approved, plan_rejected. +3. Ensure execution output is a plan artifact (steps, dependencies, risks, rollback notes). + +Target files: + +- backend/src/swarms/execution_swarm.py +- backend/app/api/routes/approvals.py +- backend/src/tools/executor.py +- shared/contracts/chat.contract.json +- frontend/lib/chat-api.ts + +Definition of done: + +- No external-action response implies external mutation occurred. +- Approval response and transcript clearly represent plan state only. + +## Slice 2: HITL and Audit Clarity + +Goal: make approval decisions auditable and operator-friendly. + +Implementation: + +1. Persist decision rationale and approver comment as first-class transcript fields. +2. Add explicit audit markers for plan approval lifecycle transitions. +3. Standardize approval event metadata between stream and transcript. + +Target files: + +- backend/src/memory/three_tier_memory.py +- backend/app/api/routes/chat.py +- backend/app/api/routes/approvals.py + +Definition of done: + +- Audit trail can reconstruct full decision timeline without ambiguity. + +## Slice 3: Frontend Transparency and Operator UX + +Goal: remove any confusion between plan and execution. + +Implementation: + +1. Add planner-only banner and execution policy note in UI. +2. Render generated plan steps after reasoning and after approval. +3. Show plan_status and execution_mode in transcript panel. + +Target files: + +- frontend/app/page.tsx +- frontend/lib/chat-api.ts + +Definition of done: + +- Demo user can clearly tell that approved actions generate an execution plan only. + +## Slice 4: Reliability and Integration Hardening + +Goal: improve runtime resilience for demo and development. + +Implementation: + +1. Add SSE reconnect and timeout behavior tests. +2. Add unified adapter error envelope for ingestion endpoints. +3. Reduce read-after-write race risk for transcript fetch with explicit readiness metadata. 
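For item 2, the envelope already has a documented shape: the ingestion contract's `error_detail` fields (`code`, `message`, `source`, `stage`, `retriable`, `target`). A minimal sketch of that shape follows; the dataclass and helper names are illustrative, not the actual adapter code.

```python
# Illustrative shape for the unified adapter error envelope (item 2 above).
# Field names follow the error_detail envelope in the ingestion contract;
# class and helper names are hypothetical.
from dataclasses import asdict, dataclass


@dataclass
class AdapterErrorDetail:
    code: str        # stable machine-readable code, e.g. "upstream_timeout"
    message: str     # operator-facing summary
    source: str      # adapter name: confluence | iris | github | jira | slack
    stage: str       # failure point, e.g. fetch | normalize | index
    retriable: bool  # whether a retry policy should re-attempt this item
    target: str      # the page id / case id / ref that failed


def per_item_failure(detail: AdapterErrorDetail) -> dict:
    """Wrap one failed ingestion item in the shared envelope shape."""
    return {"status": "failed", "error": detail.message, "error_detail": asdict(detail)}
```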
+ +Target files: + +- backend/app/api/routes/chat.py +- backend/app/api/routes/ingestion.py +- backend/tests/test_chat_stream.py +- backend/tests/test_e2e_ingest_chat_approve.py + +Definition of done: + +- Stream and transcript lifecycle remains stable under interruption and delayed writes. + +## Slice 5: Documentation and Verification Closure + +Goal: align operational documents to actual behavior. + +Implementation: + +1. Update TO-DO In Progress section to match verified completion state. +2. Align implementation status docs with planner-only terminology. +3. Add a policy verification checklist to release/demo runbook. + +Target files: + +- TO-DO.md +- docs/ways-of-working/IMPLEMENTATION_STATUS_2026-04-16.md +- docs/ways-of-working/HANDOFF_2026-04-16.md + +Definition of done: + +- No status contradiction remains across tracking docs. + +## 6. Implementation Backlog Derived from TO-DO Features + +Priority P0: + +1. Planner-only status and contract normalization (Slice 1). +2. HITL audit semantic hardening (Slice 2). +3. Frontend planner-only transparency (Slice 3). + +Priority P1: + +1. SSE and adapter reliability hardening (Slice 4). +2. Demo runbook/default ID canonicalization and docs sync (Slice 5). + +Priority P2: + +1. Optional dedup scheduling policy. +2. Extended policy conformance tests and benchmark scenarios. + +## 7. Build Start Checklist + +Before implementation starts: + +1. Confirm status vocabulary for planner-only mode. +2. Confirm contract fields to be added in shared schema. +3. Confirm frontend UX copy for no-write policy. +4. Confirm P0 slice order and owners. + +When all four are confirmed, implementation can begin with Slice 1. diff --git a/docs/Uni-Ops Sequence Diagram.png b/docs/Uni-Ops Sequence Diagram.png new file mode 100644 index 000000000..39f74cc7a Binary files /dev/null and b/docs/Uni-Ops Sequence Diagram.png differ diff --git a/docs/UniOps PRD.md b/docs/UniOps PRD.md new file mode 100644 index 000000000..e921eacae --- /dev/null +++ b/docs/UniOps PRD.md @@ -0,0 +1,206 @@ +# UniOps PRD + +* **Product Requirements Document** +* **Version:** 1.1 (28-Hour Hackathon Edition – Post-Mentorship) +* **Date:** April 15, 2026 +* **Project Name:** UniOps – Small OS for Operations +* **Hackathon:** DevOps Hackathon (Reva University – Team 07\) + +**Hackathon Duration:** 28 hours (compressed from original 36-hour plan) + +--- + +### **1\. Executive Summary** + +UniOps is an **agentic “Small OS for Operations”** purpose-built for DevOps/SRE teams to eliminate toil and drastically reduce cognitive load. It unifies fragmented operational knowledge — primarily Confluence/runbooks (single source of truth), GitHub PRs/changes, simulated incident data (ServiceNow-style), Slack threads, and tribal knowledge — into one transparent, auditable, human-controlled intelligent system. + +* **Core Architecture (updated post-mentorship):** +* Controller Kernel \+ Dynamic Swarms \+ **Native** Permission Gates (HITL-first) \+ Three-Tier Memory \+ Kairos + +(Strong emphasis on **transparency** — full chain-of-thought \+ data-point citations — and **plug-and-play data sources** as advised by mentors.) 
+ +* **Vision (directly from mentor feedback):** + +SREs/Platform Engineers ask natural-language questions (“Why did Redis latency spike last week?” or “Run the standard high-CPU runbook on service X safely”) → UniOps retrieves from Confluence/runbooks \+ GitHub history \+ simulated incidents → shows **live reasoning trace with exact sources** → proposes safe actions → **executes only after explicit human approval**. + +* **Hackathon Goal (28-hour MVP):** + +Deliver a fully functional, demo-ready product that clearly beats partial solutions (HolmesGPT, OpenSRE, Port.io, Rootly) in **knowledge unification, radical transparency, native human-in-the-loop safety**, and **realistic DevOps toil reduction**. Stack redesigned for lower cost and MVP stability (Milvus instead of Chroma; GraphRAG kept lightweight). + +**Key Mentor-Driven Changes Incorporated:** + +* Prioritize Confluence/runbooks as single source of truth \+ GitHub change history over pure Slack. +* Simulate ServiceNow-style incidents for realism (not just GitHub Actions). +* Strong focus on **transparency** (chain-of-thought \+ data points used). +* **Native HITL** Permission Gate (no non-native implementation). +* Vector store switched to **Milvus** (enterprise-ready; Chroma risk noted by Mentor 2). +* Plug-and-play data source philosophy. +* Acknowledge tribal knowledge exists (\~10% of enterprises) but is secondary. + --- + + ### **2\. Problem Statement (Source of Truth: Mentorship Transcript)** + +Engineering teams (especially SREs/DevOps) lose hours daily because: + +* Operational knowledge lives in Confluence (single source), runbooks/playbooks, GitHub PRs, Slack threads, and occasional tribal knowledge (forgotten fixes). +* Context switching between tools is painful even when documentation exists. +* Incident response often requires hunting history of changes, feature flags, or customer-specific customizations. +* Existing tools are either telemetry-heavy or lack deep multi-agent reasoning \+ strong human-in-the-loop. +* GitHub Actions alone is too ephemeral/limited; real value is in production incidents and change history. + +Mentor insight: In well-documented teams, knowledge hunting is “very quick” for known errors, but still painful for new issues or customer-specific customizations. Tribal knowledge exists in \~10% of enterprises. + +--- + +### **3\. Solution Overview** + +**Core Metaphor:** UniOps is a lightweight “Small OS” running inside your engineering workspace. + +* **Controller (Kernel)** → single entry point, spawns swarms, enforces native safety. +* **Dynamic Swarms** → Retrieval, Reasoning, Execution agents working in parallel. +* **Native Permission Gate** → every external action requires explicit human approval \+ full audit trail \+ chain-of-thought visibility. +* **Three-Tier Memory** → MEMORY.MD index → Markdown runbooks/Confluence → JSON transcripts. +* **Kairos (autoDream)** → background agent that maintains memory hygiene and deduplication. +* **Live Reasoning Trace** → real-time SSE stream showing every thought → tool → observation \+ exact data sources cited. +* **Frontend** → clean Next.js chat \+ trace panel \+ approval modal. + +**Post-Mentorship Focus:** Plug-and-play data ingestion (Confluence \+ GitHub \+ simulated incidents first) and radical transparency so engineers can trust (and audit) every conclusion. + +--- + +### **4\. Key Features (MVP Scope for 28h)** + +**Must-have (Demo-ready)** + +1. Natural language query interface +2. 
**Live reasoning trace (SSE)** with full chain-of-thought \+ source citations +3. Retrieval from ingested Confluence-style runbooks \+ GitHub PRs \+ sample Slack \+ simulated ServiceNow incidents +4. Multi-agent swarm orchestration (Controller → Swarms) +5. **Native** Permission Gate \+ Human approval modal for every action +6. Safe planner-only tool execution planning (GitHub PR comment/create rollback, Slack post, Jira update) +7. Three-Tier Memory with basic Kairos (deduplication) +8. Audit log of every agent step \+ data sources used + +**Nice-to-have (if time)** + +* Lightweight Neo4j graph for service/feature-flag dependencies (Phase 2\) +* One-click “ingest new runbook/Confluence page” button +* Plug-and-play data source demo (show how to add Grafana/ServiceNow) + --- + + ### **5\. Target Users & Use Cases (Mentor-Validated Demo Flows)** + +**Primary User:** SRE / Platform Engineer / On-call Developer + +**Key Use Cases (Demo Flows – directly inspired by transcript):** + +1. “Explain the high Redis latency incident from last week” (pulls from Confluence runbook \+ GitHub changes \+ simulated incident) +2. “Run the standard high-CPU runbook on service X” (retrieves runbook, shows steps, asks for approval before any planner-only execution) +3. “Create a rollback PR for the last deployment, post to Slack, and update Jira” (full human approval flow \+ audit) +4. “Summarise tribal knowledge \+ Confluence notes from recent Slack thread about customer XYZ feature flag” + --- + + ### **6\. Technical Architecture (High-Level – Redesigned per Mentor 2\)** + +* **Frontend:** Next.js 15 (App Router) \+ Tailwind \+ shadcn/ui \+ EventSource (SSE) +* **Backend:** FastAPI \+ Python 3.12 +* **Orchestration:** Custom Small OS (Controller \+ Swarms) on LangGraph patterns +* **LLM:** Groq (primary) \+ Apfel local fallback (M1) +* **Knowledge Layer:** LlamaIndex \+ **Milvus** (vector store – replaced Chroma per mentor advice) \+ SimpleDirectoryReader for markdown/Confluence export +* **Memory:** Three-tier system \+ Common Swarm Memory +* **Safety:** **Native** PermissionGate \+ approval queue (HITL-first) +* **Observability:** SSE live trace \+ structured audit logs (every data point cited) + +**Cost & Stability Note:** Hybrid GraphRAG kept lightweight. Milvus chosen for enterprise-readiness and to avoid Chroma \+ GraphRAG breakage risk highlighted by Mentor 2\. + +--- + +### **7\. 
Phase-Wise Implementation Plan (28-Hour Hackathon)** + +**Team Roles (finalised post-mentorship):** + +* **Chirag DS** – Overall \+ Backend \+ Agents \+ Memory \+ Orchestration +* **Dhruva** – Frontend \+ SSE \+ UI polish \+ Approval Modal +* **Srinidhi** – Data ingestion \+ Testing \+ PPT/Demo video \+ Milvus setup + + #### **Phase 0: Setup & Monorepo (0–2 hours)** + +* Monorepo: uniops/ (frontend/, backend/, data/, src/) +* Next.js 15 \+ Tailwind \+ shadcn/ui +* FastAPI \+ uv/venv \+ requirements.txt +* .env.example, docker-compose.yml +* **Milvus** persistent storage \+ sample data (data/runbooks/, data/incidents/, data/confluence/) +* Groq \+ Apfel config +* **Deliverable:** docker-compose up shows empty chat UI + + #### **Phase 1: Core Small OS Foundation (2–6 hours)** + +* src/controller/controller.py (Kernel) +* src/memory/three\_tier\_memory.py \+ memory\_index.py +* src/gates/permission\_gate.py (**native HITL**) +* Basic LangGraph state machine \+ step\_callback for live tracing +* **Deliverable:** Kernel accepts query → returns structured plan with transparency + + #### **Phase 2: Swarms \+ Knowledge Layer (6–13 hours)** + +* Three swarms: retrieval\_swarm.py, reasoning\_swarm.py, execution\_swarm.py +* LlamaIndex \+ **Milvus** Hybrid retrieval (focus on Confluence \+ GitHub \+ simulated incidents) +* Ingest sample data (5–6 markdown runbooks \+ GitHub examples \+ Slack \+ simulated ServiceNow incidents) +* **Deliverable:** End-to-end query → retrieval → transparent reasoning (planner-only execution) + + #### **Phase 3: Tools, Safety & Live Trace (13–20 hours)** + +* Tool registry (src/tools/) – GitHub, Slack, Jira planner-safe adapters +* Full **native** Permission Gate \+ approval queue +* FastAPI SSE endpoint (/chat/stream) +* Frontend: Chat \+ ReasoningTrace (with citations) \+ ApprovalModal +* Human-in-the-loop flow complete +* **Deliverable:** Live trace visible \+ safe planner action approval + + #### **Phase 4: Polish \+ Kairos \+ Dashboard (20–25 hours)** + +* Basic Kairos background agent (deduplication) +* Enhanced dashboard with audit log \+ source citations +* Error handling \+ local LLM fallback +* Responsive UI \+ loading states +* **Deliverable:** Polished, production-like UI + + #### **Phase 5: Testing, Demo & Submission (25–28 hours)** + +* Run 4 mentor-validated demo flows end-to-end +* Record 2-minute demo video +* Finalise PPT (8–10 slides using htf.pptx template) +* README with architecture diagram \+ one-click run instructions +* Docker packaging +* **Deliverable:** Ready-to-submit repo \+ demo video \+ PPT \+ differentiation table vs competitors + --- + + ### **8\. Non-Functional Requirements** + +* **Safety:** Every external action **MUST** go through native Permission Gate \+ explicit human approval (mentor emphasis) +* **Observability:** 100% of agent steps \+ data sources \+ chain-of-thought visible in live trace +* **Performance:** \< 8 seconds to first reasoning step (Groq) +* **Cost Awareness:** Stack redesigned per Mentor 2 feedback (Milvus \+ lightweight GraphRAG) +* **Local-first:** Fully works with Apfel LLM (no internet required for demo) +* **Plug-and-play:** Designed so new data sources (Grafana, ServiceNow, etc.) can be added easily + --- + + ### **9\. 
Success Criteria (Judges will love these)** + +* Live demo shows natural language → visible chain-of-thought \+ source citations → human approval → planner action approved +* Clear differentiation table vs HolmesGPT / OpenSRE / Port.io / Rootly (transparency \+ native HITL \+ Milvus-based unification) +* **Transparency** is the star: engineers can see exactly which Confluence page, GitHub PR, or incident was used +* Memory hygiene via Kairos (even if basic) +* Clean, professional UI with real-time trace +* Plug-and-play philosophy visibly demonstrated + +--- + +### **10\. As-Built Status (2026-04-16)** + +For the latest implementation reality (endpoints, flows, tests, and pending gaps), use: + +* `docs/ways-of-working/IMPLEMENTATION_STATUS_2026-04-16.md` + +This section is intentionally pointer-based to keep the PRD stable while the implementation evolves quickly. + diff --git a/docs/ways-of-working/BACKEND_SPLIT_24H.md b/docs/ways-of-working/BACKEND_SPLIT_24H.md new file mode 100644 index 000000000..d03256226 --- /dev/null +++ b/docs/ways-of-working/BACKEND_SPLIT_24H.md @@ -0,0 +1,109 @@ +# Backend Split for 2 Engineers (Skill-Aligned and Equal) + +Goal: split backend execution into two equal ownership tracks with minimal overlap. + +Branch mapping enforced by CI: +- Engineer 1: `feat/backend-core-` +- Engineer 2: `feat/backend-systems-` +- Shared/contract edits: `chore/shared-` + +## Engineer 1 (You: deep technical understanding, agent architecture) +Primary fit: core intelligence and safety semantics. + +Ownership track: +- Controller kernel and orchestration flow +- Retrieval, reasoning, and execution swarm logic +- Three-tier memory behavior and Kairos dedup logic +- Permission policy logic and approval decision semantics +- Contract decisions for reasoning trace and tool-action payloads + +Files owned first: +- backend/src/controller/** +- backend/src/swarms/** +- backend/src/memory/** +- backend/src/gates/permission_gate.py (decision logic) +- shared/contracts/chat.contract.json (only via shared branch) + +## Engineer 2 (Teammate: systems, production, deployment) +Primary fit: reliability, runtime, integrations, and delivery. + +Ownership track: +- FastAPI routes and API lifecycle +- SSE stream endpoint and connection stability +- Approval queue persistence path and action execution wiring +- Tool registry adapters (GitHub/Slack/Jira planner-safe adapters) +- Audit logging, health checks, failure handling +- Docker, runtime env, local deployment workflow, Milvus operations + +Files owned first: +- backend/app/** +- backend/src/tools/** +- backend/src/gates/** (queue and API-side integration) +- backend/tests/** +- infra/** +- scripts/** + +## Equal Workload Definition +Use feature points to keep load balanced (target 50/50): +- P0 feature = 3 points +- P1 feature = 2 points +- P2 feature = 1 point + +Both engineers should carry 8 to 10 points in first 18 hours. + +## Feature Allocation (POC-Compliant) +Engineer 1: +1. [x] Controller pipeline with swarm chaining (P0, 3) +2. [x] Retrieval + reasoning output schema and citation model (P0, 3) +3. [x] Permission decision policy rules for HITL (P1, 2) +4. [x] Memory summary and dedup pass API for Kairos-lite (P1, 2) +Total: 10 + +Engineer 2: +1. [x] FastAPI chat + stream endpoints and response contracts (P0, 3) +2. [x] SSE event delivery, reconnect-safe behavior, timeout handling (P0, 3) +3. [x] Approval queue execution path + planner-only tool invocation hooks (P1, 2) +4. 
[x] Structured audit logs + health checks + docker runtime hardening (P1, 2) +Total: 10 + +Progress note (2026-04-16): +- Implemented backend endpoints now include chat, transcript, stream, ingestion, and approval APIs. +- Contract updates for ingestion and approval are complete in `shared/contracts/chat.contract.json`. +- Groq-first integration slice completed across retrieval, reasoning, and execution swarms. +- Trace payloads now expose richer metadata (`confidence_breakdown`, `reasoning_steps`, `evidence_scores`, structured action details). +- Backend regression baseline currently: `31 passed` in backend test suite. +- SSE reliability hardening is complete: reconnect-safe behavior, heartbeat events, idle timeout handling, and malformed stream event guard. +- API reliability hardening now includes consistent ingestion error envelopes and transcript race mitigation (atomic writes + wait-based reads). + +## Integration Contract Between Both +- Engineer 1 outputs trace events in canonical shape. +- Engineer 2 streams those events over SSE without shape mutation. +- Any schema change goes through shared contract PR first. + +## 24-Hour Backend Timeline +Hour 0-2: +- [x] Engineer 1: controller flow skeleton + swarm interfaces +- [x] Engineer 2: API skeleton + health + chat route baseline + +Hour 2-8: +- [x] Engineer 1: retrieval and reasoning composition + source citation model +- [x] Engineer 2: SSE stream endpoint reliability and API error envelopes + +Hour 8-14: +- [x] Engineer 1: permission policy rules + memory hooks +- [x] Engineer 2: approval queue API + tool registry wiring + +Hour 14-20: +- [x] Engineer 1: Kairos-lite dedup and reasoning quality improvements +- [x] Engineer 2: audit logging, reliability checks, docker and Milvus validation + +Hour 20-24: +- [~] Both: bug fixing, smoke tests, demo hardening, no schema-breaking changes + +## Conflict Prevention Rules for Backend Pair +1. Engineer 1 should not edit backend/app except interface signatures. +2. Engineer 2 should not edit backend/src/swarms logic except integration adapters. +3. Shared files may be locked for at most 20 minutes: +- backend/src/gates/permission_gate.py +- shared/contracts/chat.contract.json +4. Merge backend branches every 90 minutes. diff --git a/docs/ways-of-working/BRANCHING.md b/docs/ways-of-working/BRANCHING.md new file mode 100644 index 000000000..7822540ad --- /dev/null +++ b/docs/ways-of-working/BRANCHING.md @@ -0,0 +1,32 @@ +# Branching Strategy (No-Conflict Fast Flow) + +## Branch naming +- `feat/frontend-*` +- `feat/backend-core-*` +- `feat/backend-systems-*` +- `chore/shared-*` + +## Rules +1. No direct commits to `main`. +2. Engineer A only opens `feat/frontend-*` unless working on a shared lock. +3. Backend Engineer 1 (core intelligence) uses `feat/backend-core-*`. +4. Backend Engineer 2 (systems and production) uses `feat/backend-systems-*`. +5. Use the explicit backend lanes (`feat/backend-core-*` and `feat/backend-systems-*`) for ownership clarity. +6. Shared changes must be isolated in `chore/shared-*`. +7. Keep PRs small: target under 250 lines changed where possible. + +## CI enforcement +- Workflow: `.github/workflows/ownership-boundary-check.yml` +- Script: `scripts/check-boundaries.sh` +- The check fails PRs when branch changes violate ownership lane rules.
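For reference, the lane rule the CI check enforces can be sketched in a few lines. This is an illustrative Python equivalent, not the contents of `scripts/check-boundaries.sh`, and the lane-to-path mapping is assumed from the ownership lists above:

```python
# Illustrative lane check; real enforcement lives in scripts/check-boundaries.sh.
# The lane-to-path mapping below is an assumption based on the ownership lists
# above, not the script's actual configuration.
import sys

LANES: dict[str, list[str]] = {
    "feat/frontend-": ["frontend/"],
    "feat/backend-core-": ["backend/src/"],
    "feat/backend-systems-": ["backend/app/", "backend/tests/", "infra/", "scripts/"],
    "chore/shared-": ["shared/"],
}


def boundary_violations(branch: str, changed_files: list[str]) -> list[str]:
    """Return changed files that fall outside the branch's ownership lane."""
    for prefix, allowed in LANES.items():
        if branch.startswith(prefix):
            return [f for f in changed_files if not any(f.startswith(a) for a in allowed)]
    return []  # unknown lane: leave it to human review


if __name__ == "__main__":
    offending = boundary_violations(sys.argv[1], sys.argv[2:])
    if offending:
        print("ownership boundary violation:")
        print("\n".join(f"  {path}" for path in offending))
        sys.exit(1)
```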
+ +## Merge cadence for 24h sprint +- Sync checkpoint every 2 hours: + - Rebase active branch on `main` + - Resolve conflicts immediately + - Merge green PRs quickly + +## Commit convention +- `feat(frontend): add reasoning trace panel shell` +- `feat(backend): add permission gate queue model` +- `chore(shared): update chat contract v0` diff --git a/docs/ways-of-working/HANDOFF_2026-04-16.md b/docs/ways-of-working/HANDOFF_2026-04-16.md new file mode 100644 index 000000000..6f241fdc6 --- /dev/null +++ b/docs/ways-of-working/HANDOFF_2026-04-16.md @@ -0,0 +1,158 @@ +# UniOps Handoff - 2026-04-16 + +## Handoff Status (Updated 2026-04-16) + +Legend: +- [x] Completed for now +- [~] In progress / partial +- [ ] Pending + +Current status: +- [x] Backend core APIs and HITL approval flow +- [x] Frontend integration flow (ingestion + approval lifecycle implemented) +- [x] SSE completion (reconnect/heartbeat/timeout behavior implemented) +- [x] Reliability hardening (ingestion error envelope + transcript race mitigation) +- [x] Groq-first backend integration path (retrieval + reasoning + execution) with strict provider failure behavior + +## Scope Completed + +This handoff captures the implementation completed for the UniOps demo-ready slice, including: + +- Backend API expansion for ingestion and approvals +- Chat streaming baseline endpoint and contract payload flow +- Frontend end-to-end interactive control flow +- Contract and test coverage updates +- Local DFIR-IRIS setup and incident parity assets +- Demo runbook and execution evidence updates + +## Key Outcomes + +- Added batch Confluence ingestion endpoint with per-page success/failure results. +- Added IRIS ingestion endpoint returning normalized incident report payload. +- Added approval endpoint for planner-only execution plans with transcript persistence (`plan_approved` / `plan_rejected`). +- Updated chat stream endpoint to use FastAPI StreamingResponse with hardened SSE behavior (heartbeat, idle timeout, malformed event guard, anti-buffering headers). +- Unified Groq provider path across retrieval, reasoning, and execution swarms via shared LLM client wiring. +- Added strict provider error handling (`/api/chat` emits terminal `trace_error` SSE event for provider misconfiguration/runtime failure). +- Added structured `action_details` persistence in transcript payload. +- Added rich reasoning metadata in trace payloads (`confidence_breakdown`, `reasoning_steps`, `evidence_scores`). +- Added unified ingestion error envelope for both endpoint-level and per-item failures. +- Added transcript race hardening with atomic JSON writes and wait-based transcript reads. +- Added backend startup `.env` loading for local credential workflow. +- Implemented frontend controls for ingest, chat session, approval, and transcript refresh. +- Updated shared chat contract for ingestion and approval flow compatibility. +- Added API-level end-to-end test for ingest -> chat -> approve workflow. +- Added scriptable demo flow: `scripts/e2e_confluence_flow.sh`. 
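The transcript race hardening listed above follows a conventional atomic-write / wait-based-read pattern. A minimal sketch, assuming illustrative function names rather than the actual `three_tier_memory.py` API:

```python
# Sketch of atomic-write / wait-based-read transcript persistence.
# Function names are illustrative; the real implementation lives in
# backend/src/memory/three_tier_memory.py.
import json
import os
import time
from pathlib import Path


def write_transcript_atomic(path: Path, payload: dict) -> None:
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(json.dumps(payload, indent=2))
    os.replace(tmp, path)  # atomic rename: readers never see a partial file


def read_transcript_waiting(path: Path, timeout_s: float = 2.0) -> dict:
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if path.exists():
            return json.loads(path.read_text())
        time.sleep(0.05)  # tolerate read-after-write races from fast clients
    raise TimeoutError(f"transcript not ready: {path}")
```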
+ +## Files Added (Highlights) + +- `backend/app/api/routes/ingestion.py` +- `backend/app/api/routes/approvals.py` +- `backend/src/adapters/confluence_client.py` +- `backend/src/adapters/iris_client.py` +- `backend/src/tools/executor.py` +- `backend/src/adapters/llm_client.py` +- `backend/src/agents/orchestrator.py` +- `backend/src/vector_store/llamaindex_hybrid.py` +- `backend/tests/test_ingestion.py` +- `backend/tests/test_approvals.py` +- `backend/tests/test_e2e_ingest_chat_approve.py` +- `backend/tests/test_retrieval_execution_groq.py` +- `scripts/e2e_confluence_flow.sh` +- `scripts/iris_setup_from_data.py` +- `scripts/iris/install_iris_web.sh` +- `docs/ways-of-working/IRIS_INCIDENT_SETUP.md` +- `docs/ways-of-working/LOCAL_DFIR_IRIS_SETUP_MACOS.md` +- `docs/ways-of-working/IMPLEMENTATION_STATUS_2026-04-16.md` + +## Updated Files (Highlights) + +- `shared/contracts/chat.contract.json` +- `backend/app/api/routes/chat.py` +- `backend/app/main.py` +- `backend/src/controller/controller.py` +- `backend/src/swarms/retrieval_swarm.py` +- `backend/src/swarms/reasoning_swarm.py` +- `backend/src/swarms/execution_swarm.py` +- `backend/src/memory/three_tier_memory.py` +- `frontend/lib/chat-api.ts` +- `frontend/app/page.tsx` +- `README.md` +- `TO-DO.md` + +## Demo-Ready Defaults + +- Confluence page IDs: `65868,65898` +- IRIS case ID: `1` +- IRIS UI: `https://localhost` +- IRIS username: `administrator` + +## How To Run Locally + +1. Start backend: + +```bash +cd /Volumes/LocalDrive/hacktofuture4-D07/backend +.venv/bin/python -m uvicorn app.main:app --app-dir /Volumes/LocalDrive/hacktofuture4-D07/backend --host 0.0.0.0 --port 8000 +``` + +2. Start frontend: + +```bash +cd /Volumes/LocalDrive/hacktofuture4-D07/frontend +npm run dev +``` + +3. Open demo UI: + +- `http://localhost:3000` + +4. Execute workflow in order: + +- Ingest Confluence Runbooks +- Ingest IRIS Incident +- Start Incident Session +- Approve Action +- Verify transcript final status is `plan_approved` + +## Swagger Quick Test Payloads + +- `POST /api/ingest/confluence` + +```json +{ + "page_ids": ["65868", "65898"] +} +``` + +- `POST /api/ingest/iris?case_id=1` (query param only, no body) + +- `POST /api/chat` + +```json +{ + "message": "Create rollback PR and notify Slack and Jira for redis latency incident", + "session_id": "demo-session-001" +} +``` + +- `POST /api/approvals/{trace_id}` + +```json +{ + "decision": "approve", + "approver_id": "demo-approver", + "comment": "Approved from handoff flow." +} +``` + +## Open Follow-Ups + +- Keep external execution planner-only until explicit policy change permits write operations. +- Implement live external execution adapters (GitHub/Slack/Jira) in dedicated workstream if write policy changes. +- Rotate IRIS admin credentials/API key used during local setup. +- Consider creating a dedicated non-admin service account token for integration runs. +- Add CI automation for the new end-to-end API flow. + +## Notes + +- This document is created as part of the same commit that includes all currently uncommitted repository changes. diff --git a/docs/ways-of-working/IMPLEMENTATION_STATUS_2026-04-16.md b/docs/ways-of-working/IMPLEMENTATION_STATUS_2026-04-16.md new file mode 100644 index 000000000..60e220ea2 --- /dev/null +++ b/docs/ways-of-working/IMPLEMENTATION_STATUS_2026-04-16.md @@ -0,0 +1,200 @@ +# UniOps Implementation Status (Updated 2026-04-17) + +This document captures the as-built state after Slice 4 reliability hardening and Slice 5 documentation cleanup. 
+ +## 0) Current Project Status Snapshot + +Legend: +- [x] Completed for now +- [~] In progress / partial +- [ ] Pending + +Status: +- [x] Backend APIs and approval flow +- [x] Frontend integration completeness for chat, trace timeline, ingestion, and approval actions +- [x] SSE completion with reconnect-safe behavior, heartbeat events, and idle timeout termination +- [x] Ingestion error envelope consistency across endpoint-level and per-item failures +- [x] Transcript readiness hardening (atomic writes + wait-based reads) +- [x] Groq-first backend rollout across retrieval, reasoning, and execution swarms + +## 1) What Is Implemented End-to-End + +### Backend API +- Health endpoint: + - `GET /health` +- Chat and trace endpoints: + - `POST /api/chat` (live SSE execution stream) + - `GET /api/chat/transcript/{trace_id}` +- Ingestion endpoints: + - `POST /api/ingest/iris?case_id=` + - `POST /api/ingest/confluence` with body `{ "page_ids": ["..."] }` + - `POST /api/ingest/github` + - `POST /api/ingest/jira` + - `POST /api/ingest/slack/channels` + - `POST /api/ingest/slack/threads` +- Vector endpoints: + - `GET /api/vector/status` + - `POST /api/vector/rebuild` +- Approval endpoint: + - `POST /api/approvals/{trace_id}` + +### Core Runtime Flow +1. Query enters Controller Kernel. +2. Retrieval swarm performs keyword plus optional hybrid retrieval, with Groq-assisted query expansion when provider is enabled. +3. Reasoning swarm prioritizes evidence, computes confidence and reasoning breakdown metadata, and proposes structured action details. +4. Execution swarm performs Groq-based action normalization/risk rationale (when enabled), then classifies risk with native permission gate. +5. If high or uncertain risk, status is `pending_approval`. +6. Approval API applies `approve` or `reject` decision. +7. Transcript and audit artifacts are updated with final outcome (`plan_approved` or `plan_rejected`) under planner-only execution policy. + +### Memory and Audit +- Three-tier memory currently supports: + - Static source loading from `data/{confluence,runbooks,incidents,github,slack}` + - Runtime ingestion merge for IRIS/Confluence/GitHub/Jira/Slack docs + - Dedup pass and summary metadata + - Transcript persistence with action details and approval status fields + - Atomic JSON persistence for transcript and approval artifacts + - Wait-based transcript reads to reduce read-after-write races + - Approval audit persistence under `backend/.uniops/approvals/` + +### SSE Status +- SSE now runs as a live execution stream on `POST /api/chat`. +- Stream lifecycle events implemented: `trace_started`, `trace_step`, `trace_heartbeat`, `trace_complete`, and `trace_error`. +- Event envelopes include ordered sequencing and observability context: `event_id`, `trace_id`, `sequence`, and `status`. +- Step metadata now includes timing fields (`started_at`, `finished_at`, `duration_ms`) in addition to reasoning/execution metadata. +- Stream hardening includes bounded queue handling, disconnect stop signaling, idle timeout termination (`stream_timeout`), malformed-event guard (`invalid_stream_event`), and explicit SSE anti-buffering headers. 
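A consumer-side sketch of this envelope contract makes the ordering guarantee concrete. It assumes the event name travels on the SSE `event:` field and the JSON envelope on `data:` — an assumption about the wire format; the real parser is `frontend/lib/chat-api.ts`:

```python
# Consumer-side sketch of the SSE envelope contract described above.
# Assumes event names on "event:" lines and JSON envelopes on "data:" lines;
# the actual wire handling lives in frontend/lib/chat-api.ts.
import json
from typing import Iterable, Iterator

TERMINAL_EVENTS = {"trace_complete", "trace_error"}


def iter_envelopes(raw_lines: Iterable[str]) -> Iterator[tuple[str, dict]]:
    """Yield (event_type, envelope) pairs while enforcing ordered sequencing."""
    event_type = "message"
    expected_sequence = 0
    for line in raw_lines:
        if line.startswith("event:"):
            event_type = line.split(":", 1)[1].strip()
        elif line.startswith("data:"):
            envelope = json.loads(line.split(":", 1)[1])
            if envelope.get("sequence") != expected_sequence:
                raise ValueError(f"stream gap at event {envelope.get('event_id')}")
            expected_sequence += 1
            yield event_type, envelope
            if event_type in TERMINAL_EVENTS:
                return
```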
+ +## 2) Key Files Added/Updated + +### API routes +- `backend/app/api/routes/chat.py` +- `backend/app/api/routes/ingestion.py` +- `backend/app/api/routes/approvals.py` +- `backend/app/main.py` + +### Core orchestration and memory +- `backend/src/agents/orchestrator.py` +- `backend/src/controller/controller.py` +- `backend/src/swarms/retrieval_swarm.py` +- `backend/src/swarms/reasoning_swarm.py` +- `backend/src/swarms/execution_swarm.py` +- `backend/src/memory/three_tier_memory.py` +- `backend/src/gates/permission_gate.py` +- `backend/src/vector_store/llamaindex_hybrid.py` +- `backend/src/adapters/llm_client.py` + +### Integrations and tools +- `backend/src/adapters/iris_client.py` +- `backend/src/adapters/confluence_client.py` +- `backend/src/tools/executor.py` +- `backend/src/tools/registry.py` + +### Contract +- `shared/contracts/chat.contract.json` + +### Local DFIR-IRIS setup +- `Makefile` targets for IRIS lifecycle: + - `iris-install`, `iris-up`, `iris-down`, `iris-logs`, `iris-admin-password` +- `scripts/iris/install_iris_web.sh` +- `docs/ways-of-working/LOCAL_DFIR_IRIS_SETUP_MACOS.md` +- `docs/ways-of-working/IRIS_INCIDENT_SETUP.md` + +## 3) Current Contract Highlights + +### Chat request modes +- `message_only` +- `incident_report_only` +- `message_and_incident_report` + +Rule: when `incident_report` is present, backend derives canonical query context from that report. + +### Chat response +- Live SSE event stream from `POST /api/chat` +- Terminal `trace_complete` event carries `answer`, `needs_approval`, `trace_id`, and dedup summary in metadata + +### Provider error behavior +- `POST /api/chat` emits terminal `trace_error` SSE events for selected provider misconfiguration/runtime failure. + +### Transcript metadata (implemented) +- `suggested_action` +- `action_details` +- `needs_approval` +- `execution_status` +- Optional after approval: + - `approval` + - `execution_result` + - `final_status` + +### Trace and stream metadata (implemented) +- `retrieval_method`, `query_tokens`, `llm_query_expansion` +- `confidence`, `confidence_breakdown`, `reasoning_steps`, `evidence_scores` +- `risk_level`, `requires_human_approval`, `execution_reasoning`, `risk_hint` + +### Approval API response (implemented) +- `trace_id` +- `final_status` (`plan_approved` or `plan_rejected`) +- `approval` object +- `execution_result` object + +### Ingestion contract (implemented) +- Request body: + - source-specific IDs/refs (deduplicated, non-empty) +- Response: + - `ingested_count` + - `failed_count` + - `source` + - `results[]` with per-item `status`, optional `error`, and structured `error_detail` envelope (`code`, `message`, `source`, `stage`, `retriable`, `target`) + +## 4) Test and Verification Evidence + +### Confirmed passing in current environment +- Reliability-focused regression suite: + - `tests/test_chat_orchestration.py` + - `tests/test_chat_iris_input.py` + - `tests/test_chat_stream.py` + - `tests/test_approvals.py` + - `tests/test_ingestion.py` + - `tests/test_memory_dedup.py` + - Result: `36 passed` +- Additional post-merge full backend run on 2026-04-17: + - `backend/.venv/bin/python -m pytest -q` + - Result: `52 passed` + +### Frontend validation +- `frontend npm run build` + - Result: success (Next.js production build complete) + +### Additional implemented test files +- `backend/tests/test_chat_iris_input.py` +- `backend/tests/test_chat_orchestration.py` +- `backend/tests/test_chat_stream.py` +- `backend/tests/test_ingestion.py` +- `backend/tests/test_memory_dedup.py` +- 
`backend/tests/test_reasoning_tuning.py` +- `backend/tests/test_retrieval_execution_groq.py` +- `backend/tests/test_e2e_ingest_chat_approve.py` + +### Manual E2E verification script +- `scripts/e2e_confluence_flow.sh` + - Sequence: ingest Confluence pages -> chat -> stream -> approval -> transcript + - Required env: `CONFLUENCE_PAGE_IDS` + +### Docs and schema availability +- Swagger UI active at `http://127.0.0.1:8000/docs` +- OpenAPI JSON active at `http://127.0.0.1:8000/openapi.json` + +## 5) Frontend Status + +### Implemented +- Next.js interactive UI in `frontend/app/page.tsx` and `frontend/app/globals.css` +- Backend API utility and SSE event parsing in `frontend/lib/chat-api.ts` +- Trace timeline rendering for real streamed events and metadata +- Ingestion controls for supported sources and transcript refresh flow +- Approval interaction wiring for planner-only decision flow + +## 6) Known Gaps (Next Work Items) + +1. Keep external execution policy planner-only until explicit approval for real write operations is granted. +2. Add adapter retry/backoff policy tuning for flaky provider network conditions. +3. Add optional scheduled sync mode (currently manual ingestion trigger only). +4. Runtime-ingested documents are intentionally non-persistent for this slice and must be re-ingested after restart. diff --git a/docs/ways-of-working/INTEGRATION_RULES.md b/docs/ways-of-working/INTEGRATION_RULES.md new file mode 100644 index 000000000..43e8f960f --- /dev/null +++ b/docs/ways-of-working/INTEGRATION_RULES.md @@ -0,0 +1,22 @@ +# Integration Rules + +## API contract first +- Backend and frontend integration only through files in `shared/contracts/`. +- Do not break contract fields without bumping version in contract file. + +## Contract update protocol +1. Create `chore/shared-*` branch. +2. Update contract file in `shared/contracts/`. +3. Both engineers review quickly. +4. Merge contract PR before dependent implementation PRs. + +## Freeze windows +- Last 2 hours of hackathon: + - No major refactors + - Bug fixes only + - No contract-breaking changes + +## Fast smoke checks before merging +- Frontend: app builds and route loads +- Backend: `/health` responds with 200 +- Contract: both sides still parse request/response payload diff --git a/docs/ways-of-working/IRIS_INCIDENT_SETUP.md b/docs/ways-of-working/IRIS_INCIDENT_SETUP.md new file mode 100644 index 000000000..045894fca --- /dev/null +++ b/docs/ways-of-working/IRIS_INCIDENT_SETUP.md @@ -0,0 +1,118 @@ +# IRIS Incident Resolution Setup (Service-X) + +This guide sets up DFIR-IRIS incident resolution using the same source data already present in this repository. + +Before continuing, complete local DFIR-IRIS setup: + +- `docs/ways-of-working/LOCAL_DFIR_IRIS_SETUP_MACOS.md` + +## Source Data (Authoritative Inputs) + +- `data/confluence/redis-latency-runbook.md` +- `data/runbooks/high-cpu-service-x.md` +- `data/incidents/incident-2026-04-08.json` +- `data/github/pr-rollback-example.md` +- `data/slack/customer-xyz-thread.md` + +## Step 0: Restore Data Folder from `origin/main` + +Run this before setup to ensure data parity: + +```bash +cd /Volumes/LocalDrive/hacktofuture4-D07 +git fetch origin +git restore --source origin/main -- data +git status --short -- data +``` + +Expected: no unexpected local drift in `data/`. 
+ +## Step 1: Generate IRIS Import Bundle from Repository Data + +```bash +cd /Volumes/LocalDrive/hacktofuture4-D07 +python3 scripts/iris_setup_from_data.py --project-key SERVICE-X +``` + +Generated files: + +- `data/iris/import_bundle/iris-incident-seed.json` +- `data/iris/import_bundle/iris-resolution-plan.json` +- `data/iris/import_bundle/iris-runbook-mapping.json` +- `data/iris/import_bundle/iris-import-manifest.json` + +## Step 2: Configure DFIR-IRIS Project and Taxonomy + +1. Create or select project `SERVICE-X` in DFIR-IRIS. +2. Create service `service-x`. +3. Create incident type `redis_latency_spike_after_deployment`. +4. Configure severity map: + - `SEV-1 -> critical` + - `SEV-2 -> high` + - `SEV-3 -> medium` + - `SEV-4 -> low` +5. Add tags: `redis`, `latency`, `production`, `feature-flag`. + +## Step 3: Import Incident Seed and Runbook Mapping + +1. Import `iris-incident-seed.json` into DFIR-IRIS incident templates/seeds. +2. Import `iris-runbook-mapping.json` into DFIR-IRIS runbook linkage configuration. +3. Validate the incident type links to both runbooks: + - Redis Latency Runbook + - High CPU Runbook for Service X + +## Step 4: Create Incident Resolution Workflow in DFIR-IRIS + +Create workflow stages: + +1. Detect +2. Triage +3. Diagnose +4. Propose Action +5. Approval +6. Execute +7. Resolve +8. Postmortem + +Set policy: actions containing rollback/deploy/update/scale/create require explicit SRE approval. + +## Step 5: Add Operational Evidence Context in DFIR-IRIS + +1. Add GitHub rollback note using `data/github/pr-rollback-example.md`. +2. Add Slack context using `data/slack/customer-xyz-thread.md`. +3. Ensure these records are linked to the incident type and appear during triage/diagnosis. + +## Step 6: Validate Against UniOps Contract Expectations + +Use `incident_report` payload shape expected by: + +- `backend/app/api/routes/chat.py` +- `shared/contracts/chat.contract.json` + +Required behavior: + +1. `message` only request works. +2. `incident_report` only request works. +3. If both are present, `incident_report` takes precedence for canonical query context. + +## Step 7: Execute End-to-End Resolution Verification + +1. Trigger test incident in DFIR-IRIS with the generated seed attributes. +2. Confirm runbook recommendation includes approval-gated rollback guidance. +3. Confirm incident transitions through `Approval` stage for high-risk actions. +4. Confirm final resolution + postmortem includes linked source evidence. + +## Optional API Environment Settings + +Add these to `.env` when integrating live DFIR-IRIS and Confluence APIs: + +```env +IRIS_BASE_URL=https://localhost +IRIS_PROJECT_KEY=SERVICE-X +IRIS_API_KEY=replace_me +IRIS_VERIFY_SSL=false +CONFLUENCE_BASE_URL=https://confluence.example.internal +CONFLUENCE_SPACE_KEY=OPS +CONFLUENCE_API_TOKEN=replace_me +CONFLUENCE_EMAIL=replace_me@example.com +``` diff --git a/docs/ways-of-working/LOCAL_DFIR_IRIS_SETUP_MACOS.md b/docs/ways-of-working/LOCAL_DFIR_IRIS_SETUP_MACOS.md new file mode 100644 index 000000000..3e15cd7b7 --- /dev/null +++ b/docs/ways-of-working/LOCAL_DFIR_IRIS_SETUP_MACOS.md @@ -0,0 +1,80 @@ +# Local DFIR-IRIS Setup on macOS + +This runbook installs and runs the official `dfir-iris/iris-web` stack locally for UniOps integration testing. 
+
+## Prerequisites
+
+- Docker Desktop running
+- `git` installed
+- Ports available:
+  - `443` for IRIS HTTPS
+  - `5672` for RabbitMQ
+  - `5432` for PostgreSQL
+
+## Install and Start
+
+From repository root:
+
+```bash
+make iris-install
+make iris-up
+```
+
+This installs `dfir-iris/iris-web` to `.vendor/iris-web` and checks out `v2.4.27`.
+
+## Get Initial Admin Password
+
+```bash
+make iris-admin-password
+```
+
+Look for the line containing `create_safe_admin`.
+
+## Access IRIS
+
+Open:
+
+- `https://localhost`
+
+Use username `administrator` and the password from the logs.
+
+## Create API Key for UniOps
+
+In the DFIR-IRIS UI:
+
+1. Open user settings for `administrator`.
+2. Generate an API key.
+3. Put it in the project `.env` as `IRIS_API_KEY`.
+
+## UniOps Environment Mapping
+
+Use these keys in `.env`:
+
+```env
+IRIS_BASE_URL=https://localhost
+IRIS_API_KEY=replace_me
+IRIS_PROJECT_KEY=SERVICE-X
+IRIS_VERIFY_SSL=false
+
+CONFLUENCE_BASE_URL=https://confluence.example.internal
+CONFLUENCE_SPACE_KEY=OPS
+CONFLUENCE_API_TOKEN=replace_me
+CONFLUENCE_EMAIL=replace_me@example.com
+```
+
+## Stop Stack
+
+```bash
+make iris-down
+```
+
+## Tail App Logs
+
+```bash
+make iris-logs
+```
+
+## Notes
+
+- `IRIS_VERIFY_SSL=false` is for the local self-signed certificate setup only.
+- For production-like environments, use valid TLS certificates and enable SSL verification.
diff --git a/docs/ways-of-working/NEXT_CHAT_HANDOFF_2026-04-16.md b/docs/ways-of-working/NEXT_CHAT_HANDOFF_2026-04-16.md
new file mode 100644
index 000000000..08c64dccb
--- /dev/null
+++ b/docs/ways-of-working/NEXT_CHAT_HANDOFF_2026-04-16.md
@@ -0,0 +1,68 @@
+# Next Chat Handoff (2026-04-16)
+
+## Current Source of Truth
+
+Use this implementation snapshot first:
+- `docs/ways-of-working/IMPLEMENTATION_STATUS_2026-04-16.md`
+
+## Current Achieved State
+
+- DFIR-IRIS local stack install workflow is in place (`make iris-install`, `make iris-up`, `make iris-admin-password`).
+- Backend supports:
+  - chat + transcript + SSE stream (reliability hardened: reconnect-safe behavior, heartbeat, timeout, and terminal errors)
+  - IRIS and Confluence ingestion endpoints
+  - GitHub/Jira/Slack ingestion endpoints
+  - approval decision endpoint with planner-only plan generation and audit persistence
+  - vector status and rebuild endpoints
+- Groq integration slices completed:
+  - shared Groq provider wiring across retrieval/reasoning/execution
+  - strict provider failure behavior (chat stream emits terminal `trace_error` SSE events)
+  - structured `action_details` in transcript
+  - rich trace metadata (`confidence_breakdown`, `reasoning_steps`, `evidence_scores`)
+- Slice 4 hardening completed:
+  - 4.1 SSE reliability hardening
+  - 4.2 unified ingestion error envelope (`error_detail`)
+  - 4.3 transcript readiness/race hardening (atomic writes + wait-based reads)
+- Shared contract includes ingestion and approval schemas.
+- Reliability-focused backend suite now passes (`36 passed`), with earlier full backend validation at `52 passed`.
+
+## Where To Continue Next
+
+1. Adapter retry/backoff policy hardening for live API instability windows (a possible shape is sketched after this list).
+2. Additional UI polish for approval-state clarity and richer failure rendering.
+3. Optional scheduled ingestion sync mode.
+4. Real external write execution enablement only if policy is explicitly relaxed from planner-only.
+5. Consolidate dated docs into a new status snapshot file to reduce drift.
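+
+For item 1, a minimal sketch of the retry/backoff shape this implies, in TypeScript for illustration only (the helper name, signature, and defaults are assumptions; the real adapters are the Python modules under `backend/src/adapters/`):
+
+```ts
+// Sketch only: exponential backoff with full jitter for flaky adapter calls.
+// Names and defaults here are illustrative, not the adapters' actual API.
+async function withRetry<T>(
+  call: () => Promise<T>,
+  { attempts = 3, baseDelayMs = 250, maxDelayMs = 4_000 } = {},
+): Promise<T> {
+  let lastError: unknown;
+  for (let attempt = 0; attempt < attempts; attempt += 1) {
+    try {
+      return await call();
+    } catch (error) {
+      lastError = error;
+      if (attempt === attempts - 1) {
+        break; // no point sleeping after the final attempt
+      }
+      // Exponential growth capped at maxDelayMs, scaled by full jitter.
+      const delayMs = Math.min(maxDelayMs, baseDelayMs * 2 ** attempt) * Math.random();
+      await new Promise((resolve) => setTimeout(resolve, delayMs));
+    }
+  }
+  throw lastError;
+}
+```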
+
+## Files To Open First
+
+- `docs/ways-of-working/IMPLEMENTATION_STATUS_2026-04-16.md`
+- `shared/contracts/chat.contract.json`
+- `backend/app/api/routes/chat.py`
+- `backend/app/api/routes/ingestion.py`
+- `backend/app/api/routes/approvals.py`
+- `backend/src/adapters/llm_client.py`
+- `backend/src/swarms/reasoning_swarm.py`
+- `backend/src/swarms/retrieval_swarm.py`
+- `backend/src/swarms/execution_swarm.py`
+- `backend/src/memory/three_tier_memory.py`
+- `frontend/app/page.tsx`
+- `frontend/lib/chat-api.ts`
+
+## Runtime Commands
+
+### Backend
+
+```bash
+cd /Volumes/LocalDrive/hacktofuture4-D07/backend
+source .venv/bin/activate
+python -m uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
+```
+
+### Frontend
+
+```bash
+cd /Volumes/LocalDrive/hacktofuture4-D07/frontend
+NEXT_PUBLIC_API_BASE_URL=http://localhost:8000 npm run dev
+```
+
+### Docs / API
+
+- Swagger UI: `http://127.0.0.1:8000/docs`
+- OpenAPI: `http://127.0.0.1:8000/openapi.json`
diff --git a/docs/ways-of-working/OWNERSHIP.md b/docs/ways-of-working/OWNERSHIP.md
new file mode 100644
index 000000000..1018c5542
--- /dev/null
+++ b/docs/ways-of-working/OWNERSHIP.md
@@ -0,0 +1,26 @@
+# Ownership Map (2-Engineer Setup)
+
+Goal: prevent merge conflicts by assigning strict ownership boundaries.
+
+## Engineer A (Frontend + UX)
+- Owns `frontend/**`
+- Owns chat UX, reasoning trace UI, approval modal, loading/error states
+- Can update `shared/contracts/**` only through contract PR process
+
+## Engineer B (Backend + Agents)
+- Owns `backend/**`
+- Owns FastAPI routes, controller kernel, swarms, permission gate, memory
+- Can update `shared/contracts/**` only through contract PR process
+
+## Shared Zone (High-risk for conflicts)
+- `shared/contracts/**`
+- `infra/**`
+- `README.md`
+- `docs/ways-of-working/**`
+
+Rule for shared zone: only one engineer edits at a time on a short-lived branch.
+
+## Daily Lock Strategy (24h hackathon)
+- Every 2 hours, announce lock ownership in chat.
+- Lock expires after 20 minutes if no commit is pushed.
+- Never keep a shared file lock while coding unrelated tasks.
diff --git a/docs/ways-of-working/TASK_SPLIT_24H.md b/docs/ways-of-working/TASK_SPLIT_24H.md
new file mode 100644
index 000000000..16f5730a0
--- /dev/null
+++ b/docs/ways-of-working/TASK_SPLIT_24H.md
@@ -0,0 +1,49 @@
+# 24-Hour Execution Split (2 Engineers)
+
+For a backend-first execution plan with an equal split by skill profile, see:
+- docs/ways-of-working/BACKEND_SPLIT_24H.md
+
+## Progress Snapshot (as of 2026-04-16)
+
+Legend:
+- [x] Completed for now
+- [~] In progress / partial
+- [ ] Pending
+
+## Hour 0-2
+- [x] Engineer A: Frontend setup, chat shell, trace panel layout
+- [x] Engineer B: FastAPI setup, /health, /api/chat stub
+
+## Hour 2-8
+- [x] Engineer A: SSE client, reasoning timeline UI, source citation cards
+- [x] Engineer B: Controller + retrieval/reasoning/execution swarm stubs, SSE reliability hardening
+
+## Hour 8-14
+- [x] Engineer A: Approval modal and action queue UI
+- [x] Engineer B: Native permission gate and planner-safe tool registry
+
+## Hour 14-20
+- [~] Engineer A: Polish UX, loading/error states, responsive layout
+- [x] Engineer B: Memory layer, audit logs, ingestion glue
+
+## Hour 20-24 (stabilization window)
+- [~] Both: Bug fixing and demo prep only
+- [~] No new architecture changes
+- [~] Keep PR size small and merge every 60-90 minutes
+
+## As-Built Delta (from IMPLEMENTATION_STATUS_2026-04-16)
+
+- Backend now exposes:
+  - `GET /health`
+  - `POST /api/chat` (live SSE stream)
+  - `GET /api/chat/transcript/{trace_id}`
+  - `POST /api/ingest/iris?case_id=`
+  - `POST /api/ingest/confluence` with body `{"page_ids": ["..."]}`
+  - `POST /api/approvals/{trace_id}`
+- Shared contract has ingestion and approval schemas implemented.
+- Shared contract now includes additive action and trace metadata fields for Groq-backed reasoning/execution transparency.
+- Approval tests are passing in the current backend environment (`tests/test_approvals.py`).
+- Backend reliability-focused suite now passes (`36 passed`).
+- SSE reliability hardening is complete (reconnect-safe behavior, heartbeat, timeout, malformed-event guard).
+- Ingestion error envelopes and transcript race/readiness hardening are complete.
diff --git a/frontend/.gitignore b/frontend/.gitignore
new file mode 100644
index 000000000..5ef6a5207
--- /dev/null
+++ b/frontend/.gitignore
@@ -0,0 +1,41 @@
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+/node_modules
+/.pnp
+.pnp.*
+.yarn/*
+!.yarn/patches
+!.yarn/plugins
+!.yarn/releases
+!.yarn/versions
+
+# testing
+/coverage
+
+# next.js
+/.next/
+/out/
+
+# production
+/build
+
+# misc
+.DS_Store
+*.pem
+
+# debug
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.pnpm-debug.log*
+
+# env files (can opt-in for committing if needed)
+.env*
+
+# vercel
+.vercel
+
+# typescript
+*.tsbuildinfo
+next-env.d.ts
diff --git a/frontend/README.md b/frontend/README.md
new file mode 100644
index 000000000..7f2aba995
--- /dev/null
+++ b/frontend/README.md
@@ -0,0 +1,55 @@
+This is a [Next.js](https://nextjs.org) frontend for UniOps.
+
+## API Integration Notes
+
+- Chat execution uses `POST /api/chat` as a live `text/event-stream` endpoint.
+- Frontend consumes SSE events via `fetch` stream parsing (not `EventSource`) to support JSON request bodies; a parsing sketch appears at the end of this README.
+- Supported live event types: + - `trace_started` + - `trace_step` + - `trace_heartbeat` + - `trace_complete` + - `trace_error` +- Frontend enforces strict Groq provider policy for agent steps: + - `reasoning` and `execution` `trace_step` metadata must include `provider=groq` + - any `provider=deterministic` or `model=heuristic` metadata is treated as an error +- Retrieval query-expansion metadata is also validated when present (`llm_query_expansion.provider` must be `groq`). +- `trace_complete` provides `trace_id`, which is then used for: + - `GET /api/chat/transcript/{trace_id}` + - `POST /api/approvals/{trace_id}` +- Optional incident context mode sends `incident_report` in the chat request when an IRIS case has been ingested. + +## Getting Started + +First, run the development server: + +```bash +npm run dev +# or +yarn dev +# or +pnpm dev +# or +bun dev +``` + +Open [http://localhost:3000](http://localhost:3000) with your browser to see the result. + +You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file. + +This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel. + +## Learn More + +To learn more about Next.js, take a look at the following resources: + +- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. +- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. + +You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome! + +## Deploy on Vercel + +The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js. + +Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details. 
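+
+## SSE Parsing Sketch
+
+A minimal sketch of the fetch-based parsing approach described in the API Integration Notes, assuming the event framing above (the real implementation lives in `lib/chat-api.ts`; the request body shown is illustrative):
+
+```ts
+// Sketch only: POST the chat request, then read the SSE body frame by frame.
+async function streamChatSketch(message: string, onEvent: (event: unknown) => void) {
+  const response = await fetch("/api/chat", {
+    method: "POST",
+    headers: { "Content-Type": "application/json", Accept: "text/event-stream" },
+    body: JSON.stringify({ message, session_id: "demo-session" }),
+  });
+
+  const reader = response.body!.getReader();
+  const decoder = new TextDecoder();
+  let buffer = "";
+
+  while (true) {
+    const { value, done } = await reader.read();
+    if (done) break;
+    buffer += decoder.decode(value, { stream: true });
+
+    // SSE frames are separated by a blank line; each data line carries JSON.
+    let boundary: number;
+    while ((boundary = buffer.indexOf("\n\n")) !== -1) {
+      const frame = buffer.slice(0, boundary);
+      buffer = buffer.slice(boundary + 2);
+      for (const line of frame.split("\n")) {
+        if (line.startsWith("data:")) {
+          onEvent(JSON.parse(line.slice(5).trim()));
+        }
+      }
+    }
+  }
+}
+```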
diff --git a/frontend/app/favicon.ico b/frontend/app/favicon.ico new file mode 100644 index 000000000..718d6fea4 Binary files /dev/null and b/frontend/app/favicon.ico differ diff --git a/frontend/app/globals.css b/frontend/app/globals.css new file mode 100644 index 000000000..44e2d4871 --- /dev/null +++ b/frontend/app/globals.css @@ -0,0 +1,369 @@ +@import "tailwindcss"; + +:root { + --bg: #f4f6fb; + --bg-alt: #e6ebf5; + --ink: #152036; + --ink-soft: #4f607f; + --panel: rgba(255, 255, 255, 0.82); + --panel-border: rgba(31, 55, 101, 0.18); + --accent: #ff7a2f; + --accent-ink: #622300; +} + +@theme inline { + --color-background: var(--bg); + --color-foreground: var(--ink); +} + +* { + box-sizing: border-box; +} + +html, +body { + margin: 0; + min-height: 100%; +} + +.font-body { + font-family: var(--font-body), monospace; +} + +.title-highlight { + font-family: var(--font-title), serif; + color: var(--accent); +} + +body { + color: var(--ink); + background: + radial-gradient(1200px 500px at 16% -6%, #d4dff7 0%, transparent 65%), + radial-gradient(1100px 500px at 100% 8%, #ffe8d9 0%, transparent 58%), + linear-gradient(160deg, var(--bg), var(--bg-alt)); +} + +.app-shell { + width: min(1240px, 100% - 2.5rem); + margin: 2rem auto; + display: grid; + gap: 1rem; +} + +.top-nav { + display: flex; + align-items: center; + justify-content: space-between; + gap: 1rem; + padding: 0.85rem 1rem; + border: 1px solid var(--panel-border); + border-radius: 14px; + background: var(--panel); + backdrop-filter: blur(8px); +} + +.brand-wrap { + display: flex; + align-items: center; + gap: 0.8rem; +} + +.brand-pill { + font-size: 0.7rem; + text-transform: uppercase; + letter-spacing: 0.08em; + border: 1px solid var(--panel-border); + border-radius: 999px; + padding: 0.2rem 0.55rem; + color: var(--ink-soft); +} + +.brand-name { + margin: 0; + font-size: 1.25rem; + line-height: 1; + letter-spacing: -0.02em; +} + +.nav-links { + display: flex; + align-items: center; + gap: 0.4rem; + flex-wrap: wrap; +} + +.nav-link { + border: 1px solid transparent; + background: transparent; + color: var(--ink-soft); + border-radius: 8px; + padding: 0.4rem 0.65rem; + font-size: 0.78rem; + cursor: pointer; +} + +.nav-link:hover { + color: var(--ink); + border-color: var(--panel-border); + background: rgba(255, 255, 255, 0.64); +} + +.dashboard-grid { + display: grid; + grid-template-columns: repeat(12, minmax(0, 1fr)); + gap: 1rem; +} + +.panel { + border: 1px solid var(--panel-border); + border-radius: 16px; + background: var(--panel); + backdrop-filter: blur(8px); + padding: 1rem; + box-shadow: 0 10px 30px rgba(18, 32, 62, 0.06); +} + +.hero-panel { + grid-column: span 8; +} + +.status-panel { + grid-column: span 4; +} + +.trace-panel, +.runbook-panel { + grid-column: span 6; +} + +.kicker { + margin: 0 0 0.5rem; + color: var(--ink-soft); + font-size: 0.75rem; + text-transform: uppercase; + letter-spacing: 0.1em; +} + +.hero-title { + margin: 0; + font-size: clamp(1.6rem, 2.5vw, 2.8rem); + line-height: 1.1; + max-width: 24ch; +} + +.hero-copy { + margin: 0.9rem 0 0; + color: var(--ink-soft); + max-width: 55ch; + line-height: 1.65; + font-size: 0.9rem; +} + +.hero-actions { + margin-top: 1rem; + display: flex; + flex-wrap: wrap; + gap: 0.7rem; +} + +.chat-form { + margin-top: 1.15rem; + display: grid; + gap: 0.65rem; +} + +.chat-label { + font-size: 0.74rem; + letter-spacing: 0.08em; + text-transform: uppercase; + color: var(--ink-soft); +} + +.message-input { + width: 100%; + resize: vertical; + border: 1px solid 
var(--panel-border); + border-radius: 12px; + padding: 0.65rem 0.75rem; + font-family: inherit; + background: rgba(255, 255, 255, 0.72); + color: var(--ink); +} + +.message-input:focus { + outline: 2px solid rgba(255, 122, 47, 0.2); + border-color: rgba(255, 122, 47, 0.45); +} + +.btn { + border: 1px solid transparent; + border-radius: 10px; + font-family: inherit; + font-size: 0.78rem; + font-weight: 600; + padding: 0.55rem 0.85rem; + cursor: pointer; +} + +.btn-primary { + background: var(--accent); + color: #fff; +} + +.btn-primary:hover { + background: #e46820; +} + +.btn:disabled { + opacity: 0.72; + cursor: not-allowed; +} + +.btn-ghost { + background: rgba(255, 255, 255, 0.6); + border-color: var(--panel-border); + color: var(--ink); +} + +.error-callout { + margin: 0.85rem 0 0; + border: 1px solid rgba(196, 48, 48, 0.22); + border-radius: 10px; + background: rgba(255, 233, 233, 0.75); + color: #8f2626; + padding: 0.65rem 0.75rem; + font-size: 0.82rem; +} + +.response-card { + margin-top: 0.9rem; + border: 1px solid var(--panel-border); + border-radius: 12px; + background: rgba(255, 255, 255, 0.72); + padding: 0.85rem; +} + +.response-text { + margin: 0; + color: var(--ink); + line-height: 1.65; + font-size: 0.9rem; +} + +.response-meta { + margin: 0.75rem 0 0; + color: var(--ink-soft); + font-size: 0.75rem; +} + +.status-list { + list-style: none; + margin: 0.75rem 0 0; + padding: 0; + display: grid; + gap: 0.65rem; +} + +.status-list li { + display: flex; + justify-content: space-between; + align-items: center; + gap: 1rem; + border: 1px solid var(--panel-border); + border-radius: 10px; + padding: 0.6rem 0.7rem; + font-size: 0.8rem; +} + +.status-list span { + color: var(--ink-soft); +} + +.status-list strong { + color: var(--accent-ink); +} + +.trace-panel p, +.runbook-panel p { + color: var(--ink-soft); + line-height: 1.65; + font-size: 0.85rem; +} + +.trace-lines { + margin-top: 0.8rem; + display: grid; + gap: 0.55rem; +} + +.trace-status { + margin: 0.8rem 0 0; + font-size: 0.78rem; + color: var(--accent-ink); +} + +.trace-events { + margin: 0.8rem 0 0; + padding: 0; + list-style: none; + display: grid; + gap: 0.55rem; +} + +.trace-events li { + border: 1px solid var(--panel-border); + border-radius: 10px; + background: rgba(255, 255, 255, 0.62); + padding: 0.55rem 0.65rem; + font-size: 0.8rem; + line-height: 1.45; + color: var(--ink-soft); +} + +.trace-lines span { + height: 8px; + border-radius: 999px; + background: linear-gradient(90deg, #e1e7f6, #f4f7fe); +} + +.chip-row { + margin-top: 0.7rem; + display: flex; + flex-wrap: wrap; + gap: 0.5rem; +} + +.chip { + border: 1px solid var(--panel-border); + border-radius: 999px; + padding: 0.3rem 0.55rem; + font-size: 0.72rem; + color: var(--ink-soft); + background: rgba(255, 255, 255, 0.58); +} + +@media (max-width: 980px) { + .hero-panel, + .status-panel, + .trace-panel, + .runbook-panel { + grid-column: span 12; + } + + .top-nav { + flex-direction: column; + align-items: flex-start; + } +} + +@media (max-width: 640px) { + .app-shell { + width: min(1240px, 100% - 1rem); + margin: 0.75rem auto; + } + + .hero-actions { + flex-direction: column; + align-items: stretch; + } +} diff --git a/frontend/app/layout.tsx b/frontend/app/layout.tsx new file mode 100644 index 000000000..03a965a26 --- /dev/null +++ b/frontend/app/layout.tsx @@ -0,0 +1,34 @@ +import type { Metadata } from "next"; +import { Instrument_Serif, JetBrains_Mono } from "next/font/google"; +import "./globals.css"; + +const titleSerif = Instrument_Serif({ + variable: 
"--font-title", + weight: "400", + subsets: ["latin"], +}); + +const bodyMono = JetBrains_Mono({ + variable: "--font-body", + subsets: ["latin"], +}); + +export const metadata: Metadata = { + title: "UniOps", + description: "Small OS for Operations", +}; + +export default function RootLayout({ + children, +}: Readonly<{ + children: React.ReactNode; +}>) { + return ( + + {children} + + ); +} diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx new file mode 100644 index 000000000..f414de177 --- /dev/null +++ b/frontend/app/page.tsx @@ -0,0 +1,755 @@ +"use client"; + +import { type FormEvent, useEffect, useRef, useState } from "react"; + +import { + type ApprovalResponse, + type ChatStreamEvent, + type IncidentReport, + type IngestConfluenceResponse, + type IngestIrisResponse, + type TraceStep, + type TranscriptResponse, + getTranscript, + ingestConfluence, + ingestIris, + streamChat, + submitApproval, +} from "@/lib/chat-api"; + +const STRICT_LLM_PROVIDER = "groq"; + +function parsePageIds(value: string): string[] { + return Array.from( + new Set( + value + .split(/[\s,]+/) + .map((item) => item.trim()) + .filter(Boolean), + ), + ); +} + +function errorToMessage(error: unknown): string { + if (error instanceof Error) { + return error.message; + } + return "Unexpected request failure."; +} + +function providerPolicyViolation(event: ChatStreamEvent): string | null { + if (event.event_type !== "trace_step") { + return null; + } + + const stepName = String(event.step ?? "").toLowerCase(); + if (stepName === "reasoning" || stepName === "execution") { + const provider = String(event.metadata?.provider ?? "").trim().toLowerCase(); + const model = String(event.metadata?.model ?? "").trim().toLowerCase(); + + if (!provider) { + return `Missing provider metadata on ${stepName} step. Expected ${STRICT_LLM_PROVIDER}.`; + } + + if (provider === "deterministic" || model === "heuristic") { + return `Invalid deterministic metadata detected on ${stepName} step.`; + } + + if (provider !== STRICT_LLM_PROVIDER) { + return `Unexpected provider '${provider}' on ${stepName} step. Expected ${STRICT_LLM_PROVIDER}.`; + } + } + + if (stepName === "retrieval") { + const expansionProvider = String(event.metadata?.llm_query_expansion?.provider ?? "").trim().toLowerCase(); + if (expansionProvider && expansionProvider !== STRICT_LLM_PROVIDER) { + return `Unexpected retrieval expansion provider '${expansionProvider}'. 
+    }
+  }
+
+  return null;
+}
+
+export default function Home() {
+  const navItems = ["Overview", "Trace", "Approvals", "Runbooks"];
+  const [sessionId] = useState(() => {
+    if (typeof crypto !== "undefined" && typeof crypto.randomUUID === "function") {
+      return crypto.randomUUID();
+    }
+    return `sess-${Date.now()}`;
+  });
+
+  const [message, setMessage] = useState(
+    "Create rollback PR and notify Slack and Jira for redis latency incident",
+  );
+  const [confluencePageIds, setConfluencePageIds] = useState("65868,65898");
+  const [irisCaseId, setIrisCaseId] = useState("1");
+  const [useIrisContextForChat, setUseIrisContextForChat] = useState(false);
+  const [approverId, setApproverId] = useState("demo-approver");
+  const [approvalComment, setApprovalComment] = useState("Approved from frontend demo flow.");
+
+  const [traceId, setTraceId] = useState<string | null>(null);
+  const [answer, setAnswer] = useState("");
+  const [needsApproval, setNeedsApproval] = useState(false);
+  const [streamStatus, setStreamStatus] = useState("idle");
+  const [streamEvents, setStreamEvents] = useState<ChatStreamEvent[]>([]);
+  const [traceSteps, setTraceSteps] = useState<TraceStep[]>([]);
+  const [transcript, setTranscript] = useState<TranscriptResponse | null>(null);
+  const [confluenceResult, setConfluenceResult] = useState<IngestConfluenceResponse | null>(null);
+  const [irisResult, setIrisResult] = useState<IngestIrisResponse | null>(null);
+  const [approvalResult, setApprovalResult] = useState<ApprovalResponse | null>(null);
+
+  const [chatLoading, setChatLoading] = useState(false);
+  const [confluenceLoading, setConfluenceLoading] = useState(false);
+  const [irisLoading, setIrisLoading] = useState(false);
+  const [approvalLoading, setApprovalLoading] = useState(false);
+  const [transcriptLoading, setTranscriptLoading] = useState(false);
+  const [errorMessage, setErrorMessage] = useState<string | null>(null);
+
+  const streamAbortRef = useRef<AbortController | null>(null);
+  const seenEventIdsRef = useRef<Set<string>>(new Set());
+
+  useEffect(() => {
+    return () => {
+      streamAbortRef.current?.abort();
+    };
+  }, []);
+
+  function stepFromStreamEvent(event: ChatStreamEvent): TraceStep {
+    const metadata = {
+      ...(event.metadata ?? {}),
+      stream_sequence: event.sequence,
+    };
+    return {
+      step: event.step ?? "unknown",
+      agent: event.agent ?? "unknown_agent",
+      observation: event.observation ?? "",
+      sources: event.sources ?? [],
+      metadata,
+      timestamp: event.timestamp,
+    };
+  }
+
+  function formatTimestamp(value?: string): string {
+    if (!value) {
+      return "n/a";
+    }
+    const parsed = new Date(value);
+    if (Number.isNaN(parsed.getTime())) {
+      return value;
+    }
+    return parsed.toLocaleString();
+  }
+
+  async function refreshTranscript(activeTraceId: string): Promise<void> {
+    setTranscriptLoading(true);
+    try {
+      const result = await getTranscript(activeTraceId);
+      setTranscript(result);
+      setTraceSteps(result.steps);
+      if (result.final_status) {
+        setNeedsApproval(false);
+      }
+    } finally {
+      setTranscriptLoading(false);
+    }
+  }
+
+  async function handleConfluenceIngest(): Promise<void> {
+    const pageIds = parsePageIds(confluencePageIds);
+    if (pageIds.length === 0) {
+      setErrorMessage("Provide at least one Confluence page ID.");
+      return;
+    }
+
+    setErrorMessage(null);
+    setConfluenceLoading(true);
+    try {
+      const result = await ingestConfluence(pageIds);
+      setConfluenceResult(result);
+    } catch (error: unknown) {
+      setErrorMessage(errorToMessage(error));
+    } finally {
+      setConfluenceLoading(false);
+    }
+  }
+
+  async function handleIrisIngest(): Promise<void> {
+    const caseId = irisCaseId.trim();
+    if (!caseId) {
+      setErrorMessage("Provide an IRIS case ID before ingesting.");
+      return;
+    }
+
+    setErrorMessage(null);
+    setIrisLoading(true);
+    try {
+      const result = await ingestIris(caseId);
+      setIrisResult(result);
+    } catch (error: unknown) {
+      setErrorMessage(errorToMessage(error));
+    } finally {
+      setIrisLoading(false);
+    }
+  }
+
+  async function handleChatSubmit(event: FormEvent<HTMLFormElement>): Promise<void> {
+    event.preventDefault();
+    const hasIrisContext = useIrisContextForChat && Boolean(irisResult?.incident_report);
+    if (!hasIrisContext && !message.trim()) {
+      setErrorMessage("Enter an incident prompt before starting the session.");
+      return;
+    }
+    if (useIrisContextForChat && !irisResult?.incident_report) {
+      setErrorMessage("Ingest an IRIS case first before using incident_report chat mode.");
+      return;
+    }
+
+    setErrorMessage(null);
+    setChatLoading(true);
+    setStreamStatus("idle");
+    setStreamEvents([]);
+    setTraceSteps([]);
+    setTranscript(null);
+    setApprovalResult(null);
+    setTraceId(null);
+    setAnswer("");
+
+    streamAbortRef.current?.abort();
+    const abortController = new AbortController();
+    streamAbortRef.current = abortController;
+    seenEventIdsRef.current = new Set();
+
+    let terminalErrorMessage: string | null = null;
+
+    try {
+      let completedTraceId: string | null = null;
+
+      setStreamStatus("connecting");
+      const payload: {
+        message?: string;
+        session_id: string;
+        incident_report?: IncidentReport;
+      } = {
+        session_id: sessionId,
+      };
+
+      if (!hasIrisContext) {
+        payload.message = message.trim();
+      }
+
+      if (hasIrisContext && irisResult?.incident_report) {
+        payload.incident_report = irisResult.incident_report;
+      }
+
+      await streamChat(
+        payload,
+        {
+          signal: abortController.signal,
+          onEvent: (streamEvent) => {
+            if (seenEventIdsRef.current.has(streamEvent.event_id)) {
+              return;
+            }
+            seenEventIdsRef.current.add(streamEvent.event_id);
+
+            if (streamEvent.trace_id && streamEvent.trace_id !== "trace-pending") {
+              setTraceId(streamEvent.trace_id);
+            }
+
+            setStreamEvents((previous) => [...previous.slice(-99), streamEvent]);
+
+            if (streamEvent.event_type === "trace_started") {
+              setStreamStatus("streaming");
+              return;
+            }
+
+            if (streamEvent.event_type === "trace_heartbeat") {
+              setStreamStatus("heartbeat");
+              return;
+            }
+
+            if (streamEvent.event_type === "trace_step") {
+              const providerViolation = providerPolicyViolation(streamEvent);
+              if (providerViolation) {
+                terminalErrorMessage = providerViolation;
+                setStreamStatus("error");
+                abortController.abort();
+                return;
+              }
+
+              setStreamStatus("streaming");
+              setTraceSteps((previous) => [...previous, stepFromStreamEvent(streamEvent)]);
+              return;
+            }
+
+            if (streamEvent.event_type === "trace_complete") {
+              completedTraceId = streamEvent.trace_id;
+              setAnswer(streamEvent.answer ?? "");
+              setNeedsApproval(Boolean(streamEvent.needs_approval));
+              setStreamStatus("complete");
+              return;
+            }
+
+            if (streamEvent.event_type === "trace_error") {
+              terminalErrorMessage = streamEvent.error ?? "Streaming failed.";
+              setStreamStatus("error");
+            }
+          },
+        },
+      );
+
+      if (terminalErrorMessage) {
+        setErrorMessage(terminalErrorMessage);
+      }
+
+      if (completedTraceId) {
+        await refreshTranscript(completedTraceId);
+      }
+    } catch (error: unknown) {
+      if (error instanceof DOMException && error.name === "AbortError") {
+        if (terminalErrorMessage) {
+          setErrorMessage(terminalErrorMessage);
+          setStreamStatus("error");
+        } else {
+          setStreamStatus("aborted");
+        }
+        return;
+      }
+      setErrorMessage(errorToMessage(error));
+      setStreamStatus("error");
+    } finally {
+      setChatLoading(false);
+      if (streamAbortRef.current === abortController) {
+        streamAbortRef.current = null;
+      }
+    }
+  }
+
+  async function handleApproval(decision: "approve" | "reject"): Promise<void> {
+    if (!traceId) {
+      setErrorMessage("Start a chat trace before submitting approval.");
+      return;
+    }
+
+    setErrorMessage(null);
+    setApprovalLoading(true);
+    try {
+      const result = await submitApproval(traceId, {
+        decision,
+        approver_id: approverId.trim() || "demo-approver",
+        comment: approvalComment.trim() || undefined,
+      });
+      setApprovalResult(result);
+      setNeedsApproval(false);
+      await refreshTranscript(traceId);
+    } catch (error: unknown) {
+      setErrorMessage(errorToMessage(error));
+    } finally {
+      setApprovalLoading(false);
+    }
+  }
+
+  async function handleTranscriptRefresh(): Promise<void> {
+    if (!traceId) {
+      setErrorMessage("No active trace yet. Start with a chat request.");
+      return;
+    }
+
+    setErrorMessage(null);
+    try {
+      await refreshTranscript(traceId);
+    } catch (error: unknown) {
+      setErrorMessage(errorToMessage(error));
+    }
+  }
+
+  return (
+    <div className="app-shell font-body">
+      {/* Markup below follows the class conventions defined in globals.css */}
+      <nav className="top-nav">
+        <div className="brand-wrap">
+          <p className="brand-name title-highlight">UniOps</p>
+        </div>
+        <div className="nav-links">
+          {navItems.map((item) => (
+            <button key={item} type="button" className="nav-link">
+              {item}
+            </button>
+          ))}
+        </div>
+      </nav>
+      <main className="dashboard-grid">
+        <section className="panel hero-panel">
+          <p className="kicker">
+            Ops Copilot Workspace
+          </p>
+          <h1 className="hero-title">
+            Observe, reason, and act with human control
+          </h1>
+          <p className="hero-copy">
+            Run the full demo from this page: ingest Confluence and IRIS context, generate trace-guided reasoning,
+            and complete the human approval path with persisted transcript evidence.
+          </p>
+          <p className="hero-copy">
+            Chat now runs as a live POST SSE stream against /api/chat.
+          </p>
+          <p className="hero-copy">
+            Strict provider mode: reasoning and execution must report provider={STRICT_LLM_PROVIDER}.
+          </p>
+          <form className="chat-form" onSubmit={handleChatSubmit}>